In [1]:
import csv
import pandas as pd
import numpy as np
from time import time
import pickle

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score,recall_score, precision_score, classification_report


from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
import warnings
warnings.filterwarnings('ignore')

# Training 4 different models for sentiment analysis

In [2]:
# Load the training CSV. Malformed rows are skipped with a warning:
# `on_bad_lines='warn'` is the replacement for `error_bad_lines=False`, which
# was deprecated in pandas 1.3 and removed in pandas 2.0 (requires pandas >= 1.3).
df = pd.read_csv("./datasets/clean_sent_160k_train.csv", low_memory=False, on_bad_lines='warn')

### Drop all rows with NaN

In [3]:
# Discard every row that contains a missing value, renumber the index from 0,
# then print a summary of what is left.
df = df.dropna().reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1592046 entries, 0 to 1592045
Data columns (total 2 columns):
sentiment    1592046 non-null int64
text         1592046 non-null object
dtypes: int64(1), object(1)
memory usage: 24.3+ MB


### Split into testing and training data

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, random_state=1, test_size=0.2)

# Feature arrays (raw text) and label Series for each side of the split.
x_train, y_train = train['text'].values, train['sentiment']
x_test, y_test = test['text'].values, test['sentiment']

In [5]:
# Make sure NLTK's stop-word corpus is available locally.
nltk.download('stopwords')

# Extra tokens to treat as stop words: punctuation, quote marks, contractions
# and a few one-letter words in both cases.
added = ['.',',','-',';',':','--','\"','(',')', '\'s','?','n\'t', '<', '>',
         '``', '\'\'', 'I', 'i', 'a', 'A', '..', '...', 'i\'m', 'I\'m']

# English stop words followed by the extra tokens (kept as a list).
stop_words = stopwords.words('english') + added

[nltk_data] Downloading package stopwords to /home/aveek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Note:
Training the models or fitting the vectorizer can take a long time, so the fitted objects have been pickled and stored as "*.sav" files, which can be loaded with pickle instead of retraining. The fitted vectorizer was about 250 MB, so it has been compressed and stored as a ".xz" file, which is loaded with joblib.

### Use TF-IDF over 1- to 3-grams (unigrams through trigrams) to vectorize the text

In [6]:
# Fits a fresh TF-IDF vectorizer on the training text (n-gram range 1-3, at most
# 100,000 features) and vectorizes both splits. Left commented out; the next
# cell loads a previously saved vectorizer from disk instead.
# vectorizer = TfidfVectorizer(stop_words=None , max_features=100000, ngram_range=(1,3))
# train_vectors = vectorizer.fit_transform(train['text'])
# test_vectors = vectorizer.transform(test['text'])

### Save or load vectorizer

In [None]:
from joblib import dump, load


# Uncomment the next two lines to save a freshly fitted vectorizer
# filename = 'models/tfidf.xz'
# dump(vectorizer, filename)

# Load the saved, already-fitted vectorizer and use it only to *transform* the
# text. Calling fit_transform here would refit it from scratch, throwing away
# the loaded vocabulary/IDF weights and paying the fitting cost that saving the
# vectorizer was meant to avoid.
# NOTE(review): joblib.load unpickles the file -- only load artifacts you trust.
vectorizer = load('./models/tfidf.xz')
train_vectors = vectorizer.transform(train['text'])
test_vectors = vectorizer.transform(test['text'])

# Training the models

## 1. Linear SVC with an L1 penalty (sparse weights, i.e. implicit feature selection)

In [None]:
# Linear SVM with an L1 penalty, solved in the primal form (dual=False),
# trained on the TF-IDF features of the training split.
Linear_SVC = LinearSVC(dual=False, penalty="l1")
Linear_SVC.fit(train_vectors, y_train)

In [None]:
# Score the linear SVC on the held-out test split.
prediction_linear = Linear_SVC.predict(test_vectors)

acc_svc = accuracy_score(y_test, prediction_linear)
report = classification_report(y_test, prediction_linear)

print(report)
print("accuracy:", acc_svc)

### Save the model for future use

In [None]:
# filename = './models/linear_svc.sav'
# pickle.dump(Linear_SVC, open(filename, 'wb'))

## 2. Logistic Regression

In [None]:
# Logistic-regression classifier with scikit-learn's default settings.
# Despite the variable name, this is logistic (not linear) regression.
Linear_regression = LogisticRegression()
Linear_regression.fit(train_vectors, y_train)

In [None]:
# Score the logistic-regression model on the held-out test split.
prediction_linear = Linear_regression.predict(test_vectors)

acc_reg = accuracy_score(y_test, prediction_linear)
report = classification_report(y_test, prediction_linear)

print(report)
print("accuracy:", acc_reg)

In [None]:
# filename = './models/linear_regression.sav'
# pickle.dump(Linear_regression, open(filename, 'wb'))

## 3. Multinomial Naive Bayes

In [None]:
# Multinomial naive Bayes with default settings, trained on the same
# TF-IDF features as the other classifiers.
classifier_NB = MultinomialNB()
classifier_NB.fit(train_vectors, y_train)

In [None]:
# Score the naive Bayes model on the held-out test split.
# (`prediction_linear` is simply the name reused from the earlier evaluation cells.)
prediction_linear = classifier_NB.predict(test_vectors)

acc_nb = accuracy_score(y_test, prediction_linear)
report = classification_report(y_test, prediction_linear)

print(report)
print("accuracy:",acc_nb)

In [None]:
# filename = './models/multinomial_NB.sav'
# pickle.dump(classifier_NB, open(filename, 'wb'))

## 4. LSTM

In [None]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout


### Reducing the number of rows, to make training time reasonable

In [None]:
# Keep every 20th row of the cleaned data to shorten LSTM training.
# `.copy()` makes `small_df` an independent frame: a bare slice of `df` triggers
# pandas' SettingWithCopyWarning as soon as one of its columns is reassigned
# (the warning is only invisible here because warnings are globally ignored).
small_df = df[::20].copy()
small_df.info()

### Clean the dataset

In [None]:
import re
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()

# Membership tests against a set are O(1); checking the `stop_words` list
# rescanned the whole list for every single token.
stop_word_set = set(stop_words)

# For each text: replace every non-letter with a space, drop stop words and
# tokens of one or two characters, lemmatize what remains and re-join it.
# NOTE(review): the stop-word check is case-sensitive, as in the original --
# confirm whether capitalised stop words are meant to survive.
extra_clean = []
for line in small_df['text']:
    letters_only = re.sub("[^a-zA-Z]", " ", line)
    kept_tokens = [wordnet_lemmatizer.lemmatize(token)
                   for token in letters_only.split()
                   if len(token) > 2 and token not in stop_word_set]
    extra_clean.append(" ".join(kept_tokens))

# Build a new frame instead of assigning into `small_df` in place (it may be a
# slice of `df`), then discard rows whose text was cleaned down to nothing.
small_df = small_df.assign(text=extra_clean)
small_df = small_df[small_df['text'] != '']
small_df.info()

### Get vocabulary size

In [None]:
# Flatten every cleaned text into one long list of space-separated tokens.
vocab = [word for x in small_df['text'] for word in x.split(' ')]

# Distinct tokens vs. total tokens.
distinct_words = set(vocab)
print(len(distinct_words), len(vocab))

# Distinct-token count plus 500.
# NOTE(review): the reason for the +500 is not evident from this notebook, and
# the model cell hard-codes 20000 rather than reading `vocab_size` -- confirm.
vocab_size = len(distinct_words) +500

In [None]:
# Lengths are measured in characters of the cleaned text, not in tokens.
text_lengths = [len(text) for text in small_df['text']]

print('Maximum review length: {}'.format(max(text_lengths)))
print('Minimum review length: {}'.format(min(text_lengths)))

### Tokenize and pad the input sequences for the LSTM

In [None]:
# Every sequence is padded (or truncated) to this many tokens.
max_words = 20

# Build the word index from the cleaned text; the vocabulary used for encoding
# is capped through num_words=20000.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(small_df['text'].values)

# Encode each text as a sequence of word indices, then pad to a fixed length.
X = pad_sequences(
    tokenizer.texts_to_sequences(small_df['text'].values),
    maxlen=max_words,
)

### Create the LSTM

In [None]:

# Hyper-parameters
embed_dim = 32     # width of the embedding layer
lstm_out = 32      # number of LSTM units
batch_size = 128   # reused by the fit and evaluate cells

# Building the LSTM network:
# embedding -> dropout -> LSTM -> dropout -> single sigmoid output.
# The embedding's input size (20000) matches the tokenizer's num_words.
model = Sequential([
    Embedding(20000, embed_dim, input_length=max_words),
    Dropout(0.5),
    LSTM(lstm_out),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [None]:
# Labels, in the same row order as the padded sequences in X.
Y = small_df['sentiment']

# 80/20 train/validation split with a fixed seed.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X, Y, random_state=2, test_size=0.20
)

### Train the model

In [None]:

# Train for a single epoch on the training portion of the subsample.
model.fit(X_train, Y_train, batch_size=batch_size, epochs=1)

In [None]:
# Evaluate on the validation split; the model was compiled with a single
# metric (accuracy), so evaluate() yields the loss followed by the accuracy.
score, acc = model.evaluate(X_valid, Y_valid, verbose=0, batch_size=batch_size)

print("Logloss score: %.2f" % score)
print("Accuracy: %.5f" % acc)

In [None]:
# Optional: persist the trained LSTM (left commented out).
# NOTE(review): pickling a Keras model is not reliable across Keras versions;
# `model.save(...)` is the documented way to persist one -- confirm before
# uncommenting.
# filename = './models/LSTM.sav'
# pickle.dump(model, open(filename, 'wb'))

### Compare the accuracy of the 4 methods

In [None]:
# Side-by-side accuracy summary of the four models.
# `acc_reg` comes from the LogisticRegression model, so it is labelled as such
# (it was previously printed as "Linear Reg").
# NOTE: the first three figures are measured on the 20% hold-out of the full
# dataset, whereas the LSTM figure comes from the validation split of the
# 1-in-20 subsample, so the numbers are not strictly comparable.
print("Accuracy- ")
print("SVC: %.3f" % acc_svc)
print("Logistic Reg: %.3f" % acc_reg)
print("Naive Bayes: %.3f" % acc_nb)
print("LSTM: %.3f" % acc)

# Observation

### Of the 4 methods we used, logistic regression gives the best accuracy, with the linear SVC close behind.