In [1]:
import csv
import pandas as pd
import numpy as np
from time import time
import pickle

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score,recall_score, precision_score, classification_report


from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

In [2]:
df = pd.read_csv("datasets/clean_sent_160k_train.csv",low_memory=False,error_bad_lines=False)

In [3]:
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1596142 entries, 0 to 1596141
Data columns (total 2 columns):
sentiment    1596142 non-null int64
text         1596142 non-null object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [4]:
train, test = train_test_split(df, test_size=0.2, random_state=1)
x_train = train['text'].values
x_test = test['text'].values
y_train = train['sentiment']
y_test = test['sentiment']

In [5]:
nltk.download('stopwords')
stop_words = stopwords.words('english')
added = ['.',',','-',';',':','--','\"','(',')', '\'s','?','n\'t', '<', '>',
         '``', '\'\'', 'I', 'i', 'a', 'A', '..', '...', 'i\'m', 'I\'m']
stop_words.extend(added)

[nltk_data] Downloading package stopwords to /home/aveek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Note:
 Training the models or fitting the vectorizer may take a long time, so they have been pickled and stored as "*.sav" files which can be loaded using pickle again without training. The vectorizer was about 250MB, so it has been compressed and stored as ".xz" 

In [0]:
vectorizer = TfidfVectorizer(stop_words=None , max_features=100000, ngram_range=(1,3))
train_vectors = vectorizer.fit_transform(train['text'])
test_vectors = vectorizer.transform(test['text'])

In [6]:
# from joblib import dump, load

# filename = 'models/tfidf.xz'
# dump(vectorizer, filename)

# pickle.dump(vectorizer, open(filename, 'wb'))

# vectorizer = pickle.load(open('tfidf.sav', 'rb'))
# vectorizer



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=100000, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

# Linear SVC with L1-based feature selection

In [42]:
Linear_SVC = LinearSVC(penalty="l1", dual=False)
Linear_SVC.fit(train_vectors, train['sentiment'])

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=0)

In [43]:
prediction_linear = Linear_SVC.predict(test_vectors)


report = classification_report(test['sentiment'], prediction_linear)
acc_svc = accuracy_score(test['sentiment'], prediction_linear)

print(report)
print("accuracy:",acc_svc)

              precision    recall  f1-score   support

           0       0.83      0.80      0.81    159531
           1       0.81      0.83      0.82    159698

   micro avg       0.82      0.82      0.82    319229
   macro avg       0.82      0.82      0.82    319229
weighted avg       0.82      0.82      0.82    319229

accuracy:


In [0]:
filename = 'models/linear_svc.sav'
pickle.dump(Linear_SVC, open(filename, 'wb'))

# Logistic Regression

In [None]:
Linear_regression = LogisticRegression()
Linear_regression.fit(train_vectors, train['sentiment'])

In [None]:
prediction_linear = Linear_regression.predict(test_vectors)


report = classification_report(test['sentiment'], prediction_linear)
acc_reg = accuracy_score(test['sentiment'], prediction_linear)
print(report)
print("accuracy:", acc_reg)

In [0]:
filename = 'models/linear_regression.sav'
pickle.dump(Linear_regression, open(filename, 'wb'))

# Multinomial Naive Bayes

In [48]:
classifier_NB = MultinomialNB()
classifier_NB.fit(train_vectors, train['sentiment'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [49]:
prediction_linear = classifier_NB.predict(test_vectors)


report = classification_report(test['sentiment'], prediction_linear)
acc_nb = accuracy_score(test['sentiment'], prediction_linear)
print(report)
print("accuracy:",acc_nb)

              precision    recall  f1-score   support

           0       0.80      0.80      0.80    159531
           1       0.80      0.79      0.80    159698

   micro avg       0.80      0.80      0.80    319229
   macro avg       0.80      0.80      0.80    319229
weighted avg       0.80      0.80      0.80    319229

accuracy: 0.7980759893368083


In [0]:
filename = 'models/multinomial_NB.sav'
pickle.dump(classifier_NB, open(filename, 'wb'))

# LSTM

In [0]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout


### We have to reduce the number of rows, otherwise it will take hours to train

In [52]:
small_df = df[::20]
small_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79808 entries, 0 to 1596140
Data columns (total 2 columns):
sentiment    79808 non-null int64
text         79808 non-null object
dtypes: int64(1), object(1)
memory usage: 1.2+ MB


In [53]:
import re
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()

extra_clean = []

for line in small_df['text']:
  tweet = re.sub("[^a-zA-Z]", " ", line)
  tweet = [wordnet_lemmatizer.lemmatize(x) for x in tweet.split(" ") if
             x not in stop_words and len(x) > 2]
  
  extra_clean.append(" ".join(tweet))
  
small_df['text'] = extra_clean
small_df = small_df.drop(small_df[small_df['text'] == ''].index)
small_df.info()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
<class 'pandas.core.frame.DataFrame'>
Int64Index: 79480 entries, 0 to 1596140
Data columns (total 2 columns):
sentiment    79480 non-null int64
text         79480 non-null object
dtypes: int64(1), object(1)
memory usage: 1.8+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [54]:
vocab = []
for x in small_df['text']:
    for word in x.split(' '):
        vocab.append(word)
print(len(set(vocab)), len(vocab))

vocab_size = len(set(vocab)) +500

49050 527569


In [55]:
print('Maximum review length: {}'.format(
len(max((small_df['text']), key=len))))

print('Minimum review length: {}'.format(
len(min((small_df['text']), key=len))))

Maximum review length: 123
Minimum review length: 2


In [0]:
tokenizer = Tokenizer(num_words=20000)

max_words = 20

tokenizer.fit_on_texts(small_df['text'].values)
X = tokenizer.texts_to_sequences(small_df['text'].values)
X = pad_sequences(X, maxlen=max_words)

In [0]:
# embed_dim = 32
# lstm_out = 100
# batch_size= 80


embed_dim = 32
lstm_out = 32
batch_size= 128

# #Buidling the LSTM network

model = Sequential()
model.add(Embedding(20000, embed_dim, input_length = max_words))
model.add(Dropout(0.5))
model.add(LSTM(lstm_out))
model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])



In [0]:
Y = small_df['sentiment']
X_train, X_valid, Y_train, Y_valid = train_test_split(X,Y, test_size = 0.20, random_state = 2)

In [59]:

model.fit(X_train, Y_train, epochs = 1, batch_size=batch_size)

Epoch 1/1


<keras.callbacks.History at 0x7f31a362dc50>

In [60]:
score,acc = model.evaluate(X_valid, Y_valid, batch_size = batch_size, verbose = 0)
print("Logloss score: %.2f" % (score))
print("Accuracy: %.5f" % acc)

Logloss score: 0.51
Accuracy: 0.75384


In [0]:
filename = 'models/LSTM.sav'
pickle.dump(model, open(filename, 'wb'))

In [66]:
print("Accuracy- ")
print("SVC: %.2f" % acc_svc)
print("Linear Reg: %.2f" % acc_reg)
print("Naive Bayes: %.2f" % acc_nb)
print("LSTM: %.2f" % acc)

Accuracy- 
SVC: 0.82
Linear Reg: 0.82
Naive Bayes: 0.80
LSTM: 0.75


# Observation

Of the 4 methods we used logistic regression gives us the best accuracy.