In [None]:
import nltk
from nltk.stem import PorterStemmer
from os import getcwd

nltk.download('twitter_samples')
nltk.download('stopwords')

filePath = f"{getcwd()}/../tmp2/"
nltk.data.path.append(filePath)

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:

import numpy as np
import pandas as pd
import re
from nltk.corpus import twitter_samples
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import string

In [None]:

#preprocesamiento del tweet
def process_tweet(tweet):

    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    #tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean


In [None]:
#conteo de fecuencias por tipo de tweet

def build_freqs(tweets, ys):

    yslist = np.squeeze(ys).tolist()

    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y) #colocamos cada token clasificado por clase
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs



In [None]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [None]:
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

In [None]:
train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [None]:
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [None]:
freqs = build_freqs(train_x, train_y)

In [None]:
#word embeddings using Gensim library
from gensim.models import Word2Vec, KeyedVectors
import nltk

model=Word2Vec(train_x,  window=5, min_count=1, workers=4)




In [None]:
model.vector_size
embeddingsSize=model.vector_size

In [None]:
embeddingsSize

100

In [None]:
def getVectors(dataset):
  singleDataItemEmbedding=np.zeros(embeddingsSize)
  vectors=[]
  for dataItem in dataset:
    wordCount=0
    for word in dataItem:
      if word in model.wv.index_to_key:
        singleDataItemEmbedding=singleDataItemEmbedding+model.wv[word]
        wordCount=wordCount+1

    singleDataItemEmbedding=singleDataItemEmbedding/wordCount
    vectors.append(singleDataItemEmbedding)
  return vectors

trainVectors=getVectors(train_x)
testVectors=getVectors(test_x)


In [None]:

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

def printResults(y_true, y_predicted):
  print("Accuracy= ", accuracy_score(y_true, y_predicted))

  columns=['false', 'true']


  cm = confusion_matrix(y_true, y_predicted)

  precision, recall, fscore, support = score(y_true, y_predicted)

  print('###########################################')
  print('precision:'.format(precision))
  print('recall: {}'.format(recall))
  print('fscore: {}'.format(fscore))
  print('support: {}'.format(support))
  print('###########################################')
  print('confusion matrix')
  print(cm)

  print('Macro F1 ',f1_score(y_true, y_predicted, average='macro'))

  print('Micro F1 ', f1_score(y_true, y_predicted, average='micro'))



In [None]:
#naive bayes
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
clfNB = MultinomialNB()

scaler = MinMaxScaler()
scaledTrainX= scaler.fit_transform(trainVectors)
scaledTestX = scaler.fit_transform(testVectors)
clfNB.fit(scaledTrainX, train_y)

#test naive bayes accuracy
testLabelsPredicted=list(clfNB.predict(scaledTestX))

#print results
print("NAIVE BAYES CLASSIFIER")
printResults(testLabelsPredicted, test_y)

NAIVE BAYES CLASSIFIER
Accuracy=  0.665
###########################################
precision:
recall: [0.65804598 0.67259414]
fscore: [0.67221135 0.65746421]
support: [1044  956]
###########################################
confusion matrix
[[687 357]
 [313 643]]
Macro F1  0.6648377814862394
Micro F1  0.665


  y = column_or_1d(y, warn=True)


In [None]:
from sklearn.neural_network import MLPClassifier

clfMLP = MLPClassifier(hidden_layer_sizes=(10, 10, 10))
clfMLP.fit(trainVectors, train_y)

testLabelsPredicted=list(clfMLP.predict(testVectors))

#print results
print("NEURAL NETWORK CLASSIFIER")
printResults(testLabelsPredicted, test_y)


  y = column_or_1d(y, warn=True)


NEURAL NETWORK CLASSIFIER
Accuracy=  0.9635
###########################################
precision:
recall: [0.96396396 0.96303696]
fscore: [0.96348174 0.96351824]
support: [ 999 1001]
###########################################
confusion matrix
[[963  36]
 [ 37 964]]
Macro F1  0.9634999908749977
Micro F1  0.9635


In [None]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

clfRF = RandomForestClassifier(n_estimators = 1000)
# Train the model on training data
clfRF.fit(trainVectors, train_y);

testLabelsPredicted=list(clfRF.predict(testVectors))

#print results
print("Random Forest CLASSIFIER")
printResults(testLabelsPredicted, test_y)

  clfRF.fit(trainVectors, train_y);


Random Forest CLASSIFIER
Accuracy=  0.8375
###########################################
precision:
recall: [0.84473953 0.83055828]
fscore: [0.83577564 0.83918852]
support: [ 979 1021]
###########################################
confusion matrix
[[827 152]
 [173 848]]
Macro F1  0.8374820823995845
Micro F1  0.8375


In [None]:
#KNN classifier
from sklearn.neighbors import KNeighborsClassifier

clfKNN = KNeighborsClassifier(n_neighbors=3)

# Train the model on training data
clfKNN.fit(trainVectors, train_y);

testLabelsPredicted=list(clfKNN.predict(testVectors))

#print results
print("RESULTS OF KNN Classifier")
printResults(testLabelsPredicted, test_y)

  return self._fit(X, y)


RESULTS OF KNN Classifier
Accuracy=  0.753
###########################################
precision:
recall: [0.75401606 0.75199203]
fscore: [0.75250501 0.75349301]
support: [ 996 1004]
###########################################
confusion matrix
[[751 245]
 [249 755]]
Macro F1  0.7529990119960481
Micro F1  0.753


In [None]:
#SVC classifier
from sklearn import svm

SVCC = svm.SVC()

# Train the model on training data
SVCC.fit(trainVectors, train_y);

testLabelsPredicted=list(SVCC.predict(testVectors))

#print results
print("RESULTS OF SVC Classifier")
printResults(testLabelsPredicted, test_y)

  y = column_or_1d(y, warn=True)


RESULTS OF SVC Classifier
Accuracy=  0.9095
###########################################
precision:
recall: [0.9427027  0.88093023]
fscore: [0.90597403 0.91277108]
support: [ 925 1075]
###########################################
confusion matrix
[[872  53]
 [128 947]]
Macro F1  0.9093725551556877
Micro F1  0.9095
