In [1]:
import pandas as pd
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [2]:
data = pd.read_csv('train.tsv', sep = '\t')
data.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [3]:
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.21.2.


In [4]:
data['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [5]:
lemma = WordNetLemmatizer()

In [6]:
def clean_data(data_column):
    revised_data = []
    for i in range(0,len(data_column)):
        review = str(data_column[i])
        review = re.sub('[^a-zA-Z]',' ', review)
        review = [lemma.lemmatize(y) for y in word_tokenize(review.lower())]
        review = ' '.join(review)
        revised_data.append(review)
    return revised_data

In [7]:
data['Revised_Phrase'] = clean_data(data.Phrase.values)
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Revised_Phrase
0,1,1,A series of escapades demonstrating the adage ...,1,a series of escapade demonstrating the adage t...
1,2,1,A series of escapades demonstrating the adage ...,2,a series of escapade demonstrating the adage t...
2,3,1,A series,2,a series
3,4,1,A,2,a
4,5,1,series,2,series


In [8]:
from sklearn.utils import resample

In [9]:
data_1 = data[data['Sentiment'] == 1]
data_2 = data[data['Sentiment'] == 2]
data_3 = data[data['Sentiment'] == 3]
data_4 = data[data['Sentiment'] == 4]
data_5 = data[data['Sentiment'] == 0]

data_1_sample = resample(data_1, replace = True ,n_samples = 75000,
                        random_state = 123)
data_2_sample = resample(data_2, replace = True ,n_samples = 75000,
                        random_state = 123)
data_3_sample = resample(data_3, replace = True ,n_samples = 75000,
                        random_state = 123)
data_4_sample = resample(data_4, replace = True ,n_samples = 75000,
                        random_state = 123)
data_5_sample = resample(data_5, replace = True ,n_samples = 75000,
                        random_state = 123)

data_resampled = pd.concat([data_1, data_1_sample, data_2,
                               data_2_sample, data_3,
                               data_3_sample, data_4,
                               data_4_sample, data_5,
                               data_5_sample])

In [10]:
data_resampled.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Revised_Phrase
0,1,1,A series of escapades demonstrating the adage ...,1,a series of escapade demonstrating the adage t...
33,34,1,"the gander , some of which occasionally amuses...",1,the gander some of which occasionally amuses b...
47,48,1,but none of which amounts to much of a story,1,but none of which amount to much of a story
49,50,1,none of which amounts to much of a story,1,none of which amount to much of a story
81,82,3,"Even fans of Ismail Merchant 's work , I suspe...",1,even fan of ismail merchant s work i suspect w...


In [11]:
from nltk.util import ngrams
from nltk.tokenize import TweetTokenizer
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer 

In [12]:
text = ' '.join(data_resampled.loc[data_resampled.Sentiment == 4,
                                  'Phrase'].values)
text_trigrams = [i for i in ngrams(text.split(), 3)]
text_trigrams

[('This', 'quiet', ','),
 ('quiet', ',', 'introspective'),
 (',', 'introspective', 'and'),
 ('introspective', 'and', 'entertaining'),
 ('and', 'entertaining', 'independent'),
 ('entertaining', 'independent', 'is'),
 ('independent', 'is', 'worth'),
 ('is', 'worth', 'seeking'),
 ('worth', 'seeking', '.'),
 ('seeking', '.', 'quiet'),
 ('.', 'quiet', ','),
 ('quiet', ',', 'introspective'),
 (',', 'introspective', 'and'),
 ('introspective', 'and', 'entertaining'),
 ('and', 'entertaining', 'independent'),
 ('entertaining', 'independent', 'entertaining'),
 ('independent', 'entertaining', 'is'),
 ('entertaining', 'is', 'worth'),
 ('is', 'worth', 'seeking'),
 ('worth', 'seeking', 'A'),
 ('seeking', 'A', 'positively'),
 ('A', 'positively', 'thrilling'),
 ('positively', 'thrilling', 'combination'),
 ('thrilling', 'combination', 'of'),
 ('combination', 'of', 'ethnography'),
 ('of', 'ethnography', 'and'),
 ('ethnography', 'and', 'all'),
 ('and', 'all', 'the'),
 ('all', 'the', 'intrigue'),
 ('the', 

In [13]:
Counter(text_trigrams).most_common(30)

[(('one', 'of', 'the'), 1843),
 (('of', 'the', 'year'), 935),
 (('of', 'the', 'best'), 757),
 (('of', 'the', 'most'), 682),
 (('is', 'one', 'of'), 457),
 (('One', 'of', 'the'), 413),
 ((',', 'and', 'the'), 373),
 (('the', 'year', "'s"), 364),
 (('It', "'s", 'a'), 361),
 (('.', 'is', 'a'), 351),
 (('it', "'s", 'a'), 336),
 (('the', 'edge', 'of'), 334),
 (('a', 'movie', 'that'), 332),
 (('of', 'your', 'seat'), 306),
 (('the', 'kind', 'of'), 300),
 (('the', 'film', 'is'), 298),
 (('the', 'film', "'s"), 295),
 (('as', 'one', 'of'), 284),
 ((',', 'the', 'film'), 283),
 (('edge', 'of', 'your'), 278),
 ((',', 'this', 'is'), 267),
 (('as', 'well', 'as'), 258),
 ((',', 'it', "'s"), 253),
 (('film', 'that', 'is'), 253),
 (('a', 'film', 'that'), 238),
 (('.', 'It', "'s"), 234),
 ((',', 'funny', ','), 233),
 (('some', 'of', 'the'), 229),
 (('year', "'s", 'best'), 209),
 (('a', 'solid', 'cast'), 198)]

In [92]:
tokenizer = TweetTokenizer()
tokenizer.tokenize

<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x000002230596EE08>>

In [93]:
vectorizer = TfidfVectorizer(ngram_range = (1,2), tokenizer = tokenizer.tokenize)
full_text = list(data_resampled['Revised_Phrase'].values)
vectorizer.fit(full_text)
data_resampled_vectorized = vectorizer.transform(data_resampled['Revised_Phrase'])

y = data_resampled['Sentiment']
data_resampled_vectorized

<531060x90751 sparse matrix of type '<class 'numpy.float64'>'
	with 7836632 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, cross_val_score

In [17]:
LogisReg = LogisticRegression()
ovr = OneVsRestClassifier(LogisReg)

In [18]:
%%time
ovr.fit(data_resampled_vectorized, y)



Wall time: 44.8 s


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [19]:
scores = cross_val_score(ovr, data_resampled_vectorized, y, scoring = 'accuracy',n_jobs = -1, cv = 3)
print('Cross-validation mean accuracy {0:.2f}%, std {1: .2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 76.15%, std  0.78.


In [55]:
from tensorflow.keras.utils import to_categorical
X = data_resampled['Revised_Phrase'].values
Y = to_categorical(data_resampled['Sentiment'].values)
X

array(['a series of escapade demonstrating the adage that what is good for the goose is also good for the gander some of which occasionally amuses but none of which amount to much of a story',
       'the gander some of which occasionally amuses but none of which amount to much of a story',
       'but none of which amount to much of a story', ...,
       'waste viewer time',
       'much about the film including some of it casting is frustratingly unconvincing',
       'it s uninteresting'], dtype=object)

In [101]:
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.25, random_state=123)
X_train

array(['barney s idea about creation and identity do n t really seem all that profound at least by way of what can be gleaned from this three hour endurance test built around an hour s worth of actual material',
       'is more interesting lrb and funnier rrb', 'greatest mistake', ...,
       'to mention dragged down by a leaden closing act',
       'about otherwise dull subject',
       'may lack the pungent bite of it title but it s an enjoyable trifle nonetheless'],
      dtype=object)

In [100]:
print(X_train.shape,Y_train.shape)
print(X_val.shape,Y_val.shape)

(398295, 48) (398295, 5)
(132765, 48) (132765, 5)


In [26]:
from nltk import FreqDist

In [27]:
all_words=' '.join(X_train)
all_words=word_tokenize(all_words)
#print(all_words)
dist=FreqDist(all_words)

num_unique_word=len(dist)
num_unique_word
#X_train.head()

13744

In [28]:
r_len=[]
for text in X_train:
    word=word_tokenize(text)
    l=len(word)
    r_len.append(l)
    
MAX_REVIEW_LEN=np.max(r_len)
MAX_REVIEW_LEN

48

In [29]:
max_features = num_unique_word
max_words = MAX_REVIEW_LEN
batch_size = 128
epochs = 3
num_classes=5

In [102]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)

In [59]:
from tensorflow.keras.preprocessing import sequence,text
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_val = sequence.pad_sequences(X_val, maxlen=max_words)
#print(X_train.shape,X_val.shape)
X_train

array([[   0,    0,    0, ...,    4, 2260,  224],
       [   0,    0,    0, ...,    3,  979,   52],
       [   0,    0,    0, ...,    0,  824, 1697],
       ...,
       [   0,    0,    0, ..., 1501, 2401,  377],
       [   0,    0,    0, ...,  559,  202,  213],
       [   0,    0,    0, ...,  293, 2427, 5597]])

In [68]:
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [69]:
model1=Sequential()
model1.add(Embedding(max_features,100,mask_zero=True))

model1.add(LSTM(64,dropout=0.4, recurrent_dropout=0.4,return_sequences=True))
model1.add(LSTM(32,dropout=0.5, recurrent_dropout=0.5,return_sequences=False))
model1.add(Dense(num_classes,activation='softmax'))


model1.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.001),metrics=['accuracy'])
model1.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 100)         1374400   
_________________________________________________________________
lstm_4 (LSTM)                (None, None, 64)          42240     
_________________________________________________________________
lstm_5 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 165       
Total params: 1,429,221
Trainable params: 1,429,221
Non-trainable params: 0
_________________________________________________________________


In [70]:
model1.fit(X_train, Y_train, validation_data=(X_val, Y_val),epochs=epochs, batch_size=batch_size, verbose=1)

Train on 398295 samples, validate on 132765 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x2235f56acc8>

In [71]:
text1 = "This movie is fantastic! I really like it because it is so good!"
text2 = "Good movie!"
text3 = "Maybe I like this movie."
text4 = "Meh ..."
text5 = "If I were a drunk teenager then this movie might be good."
text6 = "Bad movie!"
text7 = "Not a good movie!"
text8 = "This movie really sucks! Can I get my money back please?"
texts = [text1, text2, text3, text4, text5, text6, text7, text8]

In [72]:
tokens = tokenizer.texts_to_sequences(texts)
tokens

[[16, 12, 8, 1090, 49, 118, 35, 6, 165, 6, 8, 33, 45],
 [45, 12],
 [1020, 49, 35, 16, 12],
 [],
 [60, 49, 197, 2, 3936, 1576, 316, 16, 12, 187, 18, 45],
 [53, 12],
 [27, 2, 45, 12],
 [16, 12, 118, 66, 49, 94, 193, 615, 205, 1508]]

In [74]:
tokens_pad = pad_sequences(tokens, maxlen=MAX_REVIEW_LEN)
tokens_pad.shape

(8, 48)

In [78]:
prediction=model1.predict_classes(tokens_pad,verbose=1)



In [79]:
print(prediction)

[4 4 2 2 1 0 1 0]


In [81]:
test = pd.read_csv('test.tsv', sep="\t")
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [83]:
test['clean_review']=clean_data(test.Phrase.values)
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,clean_review
0,156061,8545,An intermittently pleasing but mostly routine ...,an intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...,an intermittently pleasing but mostly routine ...
2,156063,8545,An,an
3,156064,8545,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


In [104]:
test_vectorized = vectorizer.transform(test['clean_review'])
test1 = test['clean_review'].values

In [105]:
X_test = tokenizer.texts_to_sequences(test1)
X_test

[[15, 3330, 1560, 20, 719, 789, 347],
 [15, 3330, 1560, 20, 719, 789, 347],
 [15],
 [3330, 1560, 20, 719, 789, 347],
 [3330, 1560, 20, 719, 789],
 [3330, 1560, 20],
 [3330, 1560],
 [3330],
 [1560],
 [20],
 [719, 789],
 [719],
 [789],
 [347],
 [],
 [2882,
  8,
  118,
  1,
  82,
  101,
  10,
  7,
  226,
  218,
  9,
  3122,
  328,
  2,
  11,
  24,
  1,
  1411,
  10543,
  9782,
  9783,
  55,
  6385,
  52,
  10,
  2144,
  2,
  245,
  154,
  1611,
  4,
  1,
  524,
  13022,
  4,
  911,
  48,
  9,
  1,
  1266,
  211,
  1525,
  141],
 [2882],
 [8,
  118,
  1,
  82,
  101,
  10,
  7,
  226,
  218,
  9,
  3122,
  328,
  2,
  11,
  24,
  1,
  1411,
  10543,
  9782,
  9783,
  55,
  6385,
  52,
  10,
  2144,
  2,
  245,
  154,
  1611,
  4,
  1,
  524,
  13022,
  4,
  911,
  48,
  9,
  1,
  1266,
  211,
  1525,
  141],
 [8,
  118,
  1,
  82,
  101,
  10,
  7,
  226,
  218,
  9,
  3122,
  328,
  2,
  11,
  24,
  1,
  1411,
  10543,
  9782,
  9783,
  55,
  6385,
  52,
  10,
  2144,
  2,
  245,
  154,
 

In [106]:
X_test = sequence.pad_sequences(X_test, maxlen = max_words)
X_test

array([[   0,    0,    0, ...,  719,  789,  347],
       [   0,    0,    0, ...,  719,  789,  347],
       [   0,    0,    0, ...,    0,    0,   15],
       ...,
       [   0,    0,    0, ...,    2,  127, 8227],
       [   0,    0,    0, ...,    2,  127, 8227],
       [   0,    0,    0, ...,    0,  360, 1802]])

In [107]:
pred = model1.predict_classes(X_test, verbose = 1)



In [130]:
sub = pd.read_csv('sampleSubmission.csv', sep = ",")
sub.Sentiment = pred
sub.to_csv('Result.csv', index = False)