In [61]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from nltk.tokenize import TweetTokenizer
import datetime
#import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score
#from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
pd.set_option('max_colwidth',400)

# Reading the input files

In [62]:
train = pd.read_csv('train.tsv', sep="\t")
test = pd.read_csv('test.tsv', sep="\t")
sub = pd.read_csv('sampleSubmission.csv', sep="\t")

In [63]:
train.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1
1,2,1,A series of escapades demonstrating the adage that what is good for the goose,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what is good for the goose,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is good for the goose,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for the goose,2


In [64]:
train.loc[train.SentenceId == 2]

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
63,64,2,"This quiet , introspective and entertaining independent is worth seeking .",4
64,65,2,"This quiet , introspective and entertaining independent",3
65,66,2,This,2
66,67,2,"quiet , introspective and entertaining independent",4
67,68,2,"quiet , introspective and entertaining",3
68,69,2,quiet,2
69,70,2,", introspective and entertaining",3
70,71,2,introspective and entertaining,3
71,72,2,introspective and,3
72,73,2,introspective,2


In [65]:
test.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine effort .
1,156062,8545,An intermittently pleasing but mostly routine effort
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine
5,156066,8545,intermittently pleasing but
6,156067,8545,intermittently pleasing
7,156068,8545,intermittently
8,156069,8545,pleasing
9,156070,8545,but


In [66]:
train['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

# Cleaning the data

In [67]:
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.stem import SnowballStemmer,WordNetLemmatizer
stemmer=SnowballStemmer('english')
lemma=WordNetLemmatizer()
from string import punctuation
import re

In [68]:
def clean_review(review_col):
    review_corpus=[]
    for i in range(0,len(review_col)):
        review=str(review_col[i])
        review=re.sub('[^a-zA-Z]',' ',review)
        
        review=[lemma.lemmatize(w) for w in word_tokenize(str(review).lower())]
        review=' '.join(review)
        review_corpus.append(review)
    return review_corpus

In [69]:
train['clean_review']=clean_review(train.Phrase.values)

In [70]:
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,clean_review
0,1,1,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1,a series of escapade demonstrating the adage that what is good for the goose is also good for the gander some of which occasionally amuses but none of which amount to much of a story
1,2,1,A series of escapades demonstrating the adage that what is good for the goose,2,a series of escapade demonstrating the adage that what is good for the goose
2,3,1,A series,2,a series
3,4,1,A,2,a
4,5,1,series,2,series


# Balancing the data by Resampling

In [71]:
from sklearn.utils import resample
train_2 = train[train['Sentiment']==2]
train_1 = train[train['Sentiment']==1]
train_3 = train[train['Sentiment']==3]
train_4 = train[train['Sentiment']==4]
train_5 = train[train['Sentiment']==0]
train_2_sample = resample(train_2,replace=True,n_samples=75000,random_state=123)
train_1_sample = resample(train_1,replace=True,n_samples=75000,random_state=123)
train_3_sample = resample(train_3,replace=True,n_samples=75000,random_state=123)
train_4_sample = resample(train_4,replace=True,n_samples=75000,random_state=123)
train_5_sample = resample(train_5,replace=True,n_samples=75000,random_state=123)

df_upsampled = pd.concat([train_2, train_1_sample,train_3_sample,train_4_sample,train_5_sample])

In [72]:
df_upsampled.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,clean_review
1,2,1,A series of escapades demonstrating the adage that what is good for the goose,2,a series of escapade demonstrating the adage that what is good for the goose
2,3,1,A series,2,a series
3,4,1,A,2,a
4,5,1,series,2,series
5,6,1,of escapades demonstrating the adage that what is good for the goose,2,of escapade demonstrating the adage that what is good for the goose


In [73]:
test['clean_review']=clean_review(test.Phrase.values)
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,clean_review
0,156061,8545,An intermittently pleasing but mostly routine effort .,an intermittently pleasing but mostly routine effort
1,156062,8545,An intermittently pleasing but mostly routine effort,an intermittently pleasing but mostly routine effort
2,156063,8545,An,an
3,156064,8545,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


# Data Processing for ML

In [74]:
text = ' '.join(df_upsampled.loc[df_upsampled.Sentiment == 4, 'Phrase'].values)
text_trigrams = [i for i in ngrams(text.split(), 3)]

In [75]:
Counter(text_trigrams).most_common(30)

[(('one', 'of', 'the'), 1644),
 (('of', 'the', 'year'), 832),
 (('of', 'the', 'best'), 677),
 (('of', 'the', 'most'), 612),
 (('is', 'one', 'of'), 407),
 (('One', 'of', 'the'), 370),
 ((',', 'and', 'the'), 333),
 (('the', 'year', "'s"), 326),
 (('It', "'s", 'a'), 323),
 (('the', 'edge', 'of'), 300),
 (('it', "'s", 'a'), 299),
 (('a', 'movie', 'that'), 297),
 (('of', 'your', 'seat'), 273),
 (('the', 'film', 'is'), 267),
 (('the', 'kind', 'of'), 267),
 (('.', 'is', 'a'), 264),
 (('the', 'film', "'s"), 264),
 (('as', 'one', 'of'), 254),
 ((',', 'the', 'film'), 253),
 (('edge', 'of', 'your'), 249),
 ((',', 'this', 'is'), 236),
 (('as', 'well', 'as'), 231),
 ((',', 'it', "'s"), 226),
 (('film', 'that', 'is'), 223),
 (('.', 'It', "'s"), 218),
 (('a', 'film', 'that'), 211),
 ((',', 'funny', ','), 208),
 (('some', 'of', 'the'), 206),
 (('year', "'s", 'best'), 188),
 (('a', 'solid', 'cast'), 178)]

In [76]:
tokenizer = TweetTokenizer()

In [77]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
full_text = list(df_upsampled['clean_review'].values) + list(test['clean_review'].values)
vectorizer.fit(full_text)
df_upsampled_vectorized = vectorizer.transform(df_upsampled['clean_review'])
test_vectorized = vectorizer.transform(test['clean_review'])
test1 = test['clean_review']

In [78]:
y = df_upsampled['Sentiment']

# Applying ML algorithm

In [79]:
logreg = LogisticRegression()
ovr = OneVsRestClassifier(logreg)

In [80]:
%%time
ovr.fit(df_upsampled_vectorized, y)

CPU times: user 30.9 s, sys: 176 ms, total: 31.1 s
Wall time: 15.6 s


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)

In [81]:
scores = cross_val_score(ovr, df_upsampled_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 72.17%, std 0.42.


In [82]:
%%time
svc = LinearSVC(dual=False)
scores = cross_val_score(svc, df_upsampled_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 77.44%, std 0.46.
CPU times: user 731 ms, sys: 169 ms, total: 900 ms
Wall time: 51.2 s


In [83]:
%%time
model = MultinomialNB()
#model.fit(train_vectorized, y)
scores =  cross_val_score(model, df_upsampled_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 59.50%, std 0.15.
CPU times: user 692 ms, sys: 177 ms, total: 869 ms
Wall time: 1.33 s


In [84]:
from keras.utils import to_categorical
X = df_upsampled['clean_review']
#test_set = test['clean review']
#Y = train['Sentiment']
Y = to_categorical(df_upsampled['Sentiment'].values)
print(Y)

[[0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 ...
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]]


# splitting training set into training and validation set

In [85]:

from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.25, random_state=123)

In [86]:
print(X_train.shape,Y_train.shape)
print(X_val.shape,Y_val.shape)

(284686,) (284686, 5)
(94896,) (94896, 5)


In [87]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Total number of words/features

In [88]:
all_words=' '.join(X_train)
all_words=word_tokenize(all_words)
#print(all_words)
dist=FreqDist(all_words)

num_unique_word=len(dist)
num_unique_word
#X_train.head()

13728

# Number of words for each phrase/text

In [89]:
r_len=[]
for text in X_train:
    word=word_tokenize(text)
  #  print(text)
    l=len(word)
    r_len.append(l)
    
MAX_REVIEW_LEN=np.max(r_len)
MAX_REVIEW_LEN

48

In [90]:
max_features = num_unique_word
max_words = MAX_REVIEW_LEN
batch_size = 128
epochs = 3
num_classes=5

In [91]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# Tokenizing the words

In [92]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)

X_test = tokenizer.texts_to_sequences(test1)
#X_test

In [68]:
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences

# Sequence Padding

In [37]:
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_val = sequence.pad_sequences(X_val, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)
#print(X_train.shape,X_val.shape)
X_test

array([[   0,    0,    0, ...,  729,  778,  365],
       [   0,    0,    0, ...,  729,  778,  365],
       [   0,    0,    0, ...,    0,    0,   15],
       ...,
       [   0,    0,    0, ...,    2,  124, 8521],
       [   0,    0,    0, ...,    2,  124, 8521],
       [   0,    0,    0, ...,    0,  369, 1921]], dtype=int32)

In [38]:
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense,Dropout,Embedding,LSTM,Conv1D,GlobalMaxPooling1D,Flatten,MaxPooling1D,GRU,SpatialDropout1D,Bidirectional
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score
import matplotlib.pyplot as plt

# Building the model---LSTM

In [40]:
model1=Sequential()
model1.add(Embedding(max_features,100,mask_zero=True))

model1.add(LSTM(64,dropout=0.4, recurrent_dropout=0.4,return_sequences=True))
model1.add(LSTM(32,dropout=0.5, recurrent_dropout=0.5,return_sequences=False))
model1.add(Dense(num_classes,activation='softmax'))


model1.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.001),metrics=['accuracy'])
model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 100)         1372800   
_________________________________________________________________
lstm_3 (LSTM)                (None, None, 64)          42240     
_________________________________________________________________
lstm_4 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 165       
Total params: 1,427,621
Trainable params: 1,427,621
Non-trainable params: 0
_________________________________________________________________


# Fitting the model

In [41]:

#%%time
model1.fit(X_train, Y_train, validation_data=(X_val, Y_val),epochs=epochs, batch_size=batch_size, verbose=1)

Train on 284686 samples, validate on 94896 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f20cbc55cc0>

# Predicting the model

In [42]:
pred1=model1.predict_classes(X_test,verbose=1)



In [43]:
sub.Sentiment=pred1
sub.to_csv('sub1.csv',index=False)
#sub.head()

  """Entry point for launching an IPython kernel.


## CNN model

In [44]:
from keras.layers import Input, Dense, Embedding, Flatten
from keras.layers import SpatialDropout1D
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.models import Sequential

In [45]:
model2 = Sequential()

# Input / Embdedding
model2.add(Embedding(max_features, 150, input_length=max_words))

# CNN
model2.add(SpatialDropout1D(0.2))

model2.add(Conv1D(32, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))

model2.add(Conv1D(64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))

model2.add(Flatten())

# Output layer
model2.add(Dense(5, activation='sigmoid'))

In [46]:
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=epochs, batch_size=batch_size, verbose=1)

Train on 284686 samples, validate on 94896 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f20c12f3ba8>

In [47]:
pred2=model2.predict_classes(X_test,verbose=1)
sub.Sentiment=pred2
sub.to_csv('sub2.csv',index=False)
#sub.head()



# CNN+GRU

In [48]:
model3= Sequential()
model3.add(Embedding(max_features,100,input_length=max_words))
model3.add(Conv1D(64,kernel_size=3,padding='same',activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(Dropout(0.25))
model3.add(GRU(128,return_sequences=True))
model3.add(Dropout(0.3))
model3.add(Flatten())
model3.add(Dense(128,activation='relu'))
model3.add(Dropout(0.5))
model3.add(Dense(5,activation='softmax'))
model3.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.001),metrics=['accuracy'])
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 48, 100)           1372800   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 48, 64)            19264     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 24, 64)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 24, 64)            0         
_________________________________________________________________
gru_1 (GRU)                  (None, 24, 128)           74112     
_________________________________________________________________
dropout_2 (Dropout)          (None, 24, 128)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 3072)              0         
__________

In [49]:
%%time
model3.fit(X_train, Y_train, validation_data=(X_val, Y_val),epochs=epochs, batch_size=batch_size, verbose=1)

Train on 284686 samples, validate on 94896 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 34min 18s, sys: 5min 35s, total: 39min 54s
Wall time: 12min 33s


<keras.callbacks.History at 0x7f20c12d5cf8>

In [50]:
pred3=model3.predict_classes(X_test,verbose=1)
sub.Sentiment=pred3
sub.to_csv('sub3.csv',index=False)
#sub.head()



# Model Testing - New Data

In [51]:
text1 = "This movie is fantastic! I really like it because it is so good!"
text2 = "Good movie!"
text3 = "Maybe I like this movie."
text4 = "Meh ..."
text5 = "If I were a drunk teenager then this movie might be good."
text6 = "Bad movie!"
text7 = "Not a good movie!"
text8 = "This movie really sucks! Can I get my money back please?"
texts = [text1, text2, text3, text4, text5, text6, text7, text8]

In [52]:
tokens = tokenizer.texts_to_sequences(texts)

In [53]:

tokens_pad = pad_sequences(tokens, maxlen=MAX_REVIEW_LEN)
tokens_pad.shape

(8, 48)

# Predictions using different models/algorithms

In [54]:
pred4=model1.predict_classes(tokens_pad,verbose=1)



In [55]:
print(pred4)

[4 4 2 2 1 0 1 0]


In [56]:
pred5=model2.predict_classes(tokens_pad,verbose=1)



In [57]:
print(pred5)

[4 4 3 2 1 0 1 0]


In [58]:
pred6=model3.predict_classes(tokens_pad,verbose=1)



In [59]:
print(pred6)

[4 4 2 2 1 0 1 0]
