In [1]:
import re
import pandas as pd
from nltk.corpus import stopwords
import numpy as np
import sklearn
from sklearn.naive_bayes import MultinomialNB
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score ,confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [2]:
def get_words( tweets ):               
    tweets_onlyletters = re.sub("[^a-zA-Z]", " ",tweets) #Remove everything other than letters     
    words = tweets_onlyletters.lower().split() #Convert to lower case, split into individual words   
    #words = [w for w in words if len(w) > 2]
    for index in range(len(words)):
        words[index] = SnowballStemmer("english").stem(words[index])
    stops = set(stopwords.words("english"))  #Convert the stopwords to a set for improvised performance                 
    meaningful_words = [w for w in words if not w in stops]   #Removing stopwords
    return( " ".join( meaningful_words )) #Joining the words

In [3]:
column_names = ['polarity','id','date','query','user','text']
df = pd.read_csv("train.csv", encoding = "ISO-8859-1", header = None, names = column_names)
df1 = df.sample(frac=1).groupby('polarity', sort=False).head(80000)

In [4]:
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(df1["text"], df1["polarity"], test_size = 0.2)
X_train = np.array(X_train);
X_test = np.array(X_test);
Y_train = np.array(Y_train);
Y_test = np.array(Y_test);
cleanTweets_train = [] #To append processed headlines
cleanTweets_test = [] #To append processed headlines
number_Tweets_train = len(X_train) #Calculating the number of reviews
number_Tweets_test = len(X_test) #Calculating the number of reviews

In [5]:
for i in range(0,number_Tweets_train):
    cleanTweet = get_words(X_train[i]) #Processing the data and getting words with no special characters, numbers or html tags
    cleanTweets_train.append( cleanTweet )

In [6]:
for i in range(0,number_Tweets_test):
    cleanTweet = get_words(X_test[i]) #Processing the data and getting words with no special characters, numbers or html tags
    cleanTweets_test.append( cleanTweet )

In [7]:
vectorize = sklearn.feature_extraction.text.CountVectorizer(analyzer = "word", max_features = 4000,
                                                           stop_words = "english", 
                                                           min_df = 50, ngram_range = (1,4)) #max_features = 5000, ngram_range = (1,4),
bagOfWords_train = vectorize.fit_transform(cleanTweets_train)
X_train = bagOfWords_train.toarray()

bagOfWords_test = vectorize.transform(cleanTweets_test)
X_test = bagOfWords_test.toarray()

In [8]:
vocab = vectorize.get_feature_names()
nb = MultinomialNB()
nb.fit(X_train, Y_train)
print(nb.score(X_test, Y_test))
nb_predict = nb.predict(X_test)
cm_nb = confusion_matrix(Y_test,nb_predict)
print(cm_nb)

0.73878125
[[12287  3695]
 [ 4664 11354]]


In [9]:
logistic_Regression = LogisticRegression()
logistic_Regression.fit(X_train,Y_train)
Y_predict = logistic_Regression.predict(X_test)
print(accuracy_score(Y_test,Y_predict))
cm_lr = confusion_matrix(Y_test,Y_predict)
print(cm_lr)

0.75178125
[[11403  4579]
 [ 3364 12654]]


In [10]:
import pickle
with open('logistic_regression_model.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(logistic_Regression, f, pickle.HIGHEST_PROTOCOL)

In [11]:
with open('vectorizer.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(vectorize, f, pickle.HIGHEST_PROTOCOL)

In [12]:
with open('nb.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(nb, f, pickle.HIGHEST_PROTOCOL)

In [19]:
nb_tp = cm_nb[0,0]
nb_fp = cm_nb[0,1]
nb_fn = cm_nb[1,0]
nb_tn = cm_nb[1,1]
print("Precision of Naive Bayes method", (nb_tp/(nb_tp+nb_fp)))
print("Recall of Naive Bayes method", (nb_tp/(nb_tp+nb_fn)))

Precision of Naive Bayes method 0.768802402703
Recall of Naive Bayes method 0.724853990915


In [21]:
lr_tp = cm_lr[0,0]
lr_fp = cm_lr[0,1]
lr_fn = cm_lr[1,0]
lr_tn = cm_lr[1,1]
print("Precision of Logistic Regression method", (lr_tp/(lr_tp+lr_fp)))
print("Recall of Logistic Regression method", (lr_tp/(lr_tp+lr_fn)))

Precision of Logistic Regression method 0.713490176449
Recall of Logistic Regression method 0.772194758583
