In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import cudf as pd
import cupy as cp
import cuml
import nltk
import re
import string
import keras
import tensorflow
from tensorflow.keras.optimizers import Adam
from cuml.linear_model import LogisticRegression
from cuml.ensemble import RandomForestClassifier as cuRFC
from cuml.naive_bayes import MultinomialNB
from cuml.svm import SVC
from cuml.linear_model import LogisticRegression
from cuml.multiclass import MulticlassClassifier
from cuml.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv
/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip
/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip


#### 1. Reading and Understanding Data

In [2]:
train=pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip',sep='\t')
test=pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip',sep='\t')

In [3]:
train.head()
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [4]:
train['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int32

In [5]:
train=train.to_pandas()
test=test.to_pandas()

#### 2. Removing NULL Values 

In [6]:
train.isnull().sum()

PhraseId      0
SentenceId    0
Phrase        0
Sentiment     0
dtype: int64

In [7]:
test.isnull().sum()

PhraseId      0
SentenceId    0
Phrase        0
dtype: int64

#### 3. Make Text lowecase

In [8]:
train['Phrase']=train['Phrase'].apply(lambda x: x.lower())
test['Phrase']=test['Phrase'].apply(lambda x: x.lower())

In [9]:
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,2,1,a series of escapades demonstrating the adage ...,2
2,3,1,a series,2
3,4,1,a,2
4,5,1,series,2


#### 3. Remove Non alphabet Characters 

In [10]:
punct = string.punctuation
print(punct)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [11]:
train['Phrase']=train['Phrase'].apply(lambda x: x.translate(str.maketrans('','',punct)).lower())
test['Phrase']=test['Phrase'].apply(lambda x: x.translate(str.maketrans('','',punct)).lower())

In [12]:
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,2,1,a series of escapades demonstrating the adage ...,2
2,3,1,a series,2
3,4,1,a,2
4,5,1,series,2


#### 4. Remove Stop Words

In [13]:
def Stop(text):
    str=[]
    for i in text.split():
        if i not in stopwords.words('english'):
            str.append(i) 
    final=str[:]
    str.clear()
    return final

In [14]:
train['Phrase']=train['Phrase'].apply(Stop)
test['Phrase']=test['Phrase'].apply(Stop)

In [15]:
ps=PorterStemmer()

In [16]:
y=[]
def stem_words(text):
    for i in text:
        y.append(ps.stem(i))
    z=y[:]
    y.clear()
    return z

In [17]:
train['Phrase']=train['Phrase'].apply(stem_words)
test['Phrase']=test['Phrase'].apply(stem_words)

In [18]:
def join_back(list_input):
    return " ".join(list_input)

In [19]:
train['Phrase']=train['Phrase'].apply(join_back)
test['Phrase']=test['Phrase'].apply(join_back)

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [21]:
X_train =train['Phrase']
y_train = train['Sentiment']
tokenize = Tokenizer()
tokenize.fit_on_texts(X_train.values)

In [22]:
X_test = test['Phrase']
X_train = tokenize.texts_to_sequences(X_train)
X_test = tokenize.texts_to_sequences(X_test)

In [23]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [24]:
max_sequence_len = max([len(s.split()) for s in train['Phrase']])
X_train = pad_sequences(X_train, max_sequence_len,padding='pre')
X_test = pad_sequences(X_test, max_sequence_len,padding='pre')

## MODELS 

#### 1. CNN

In [25]:
import tensorflow as tf
CNN = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(tokenize.word_index)+1, 100, input_length=max_sequence_len),
    tf.keras.layers.Conv1D(128, 2, padding='same',activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 2, padding='same',activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(5, activation='softmax')
])

2022-03-16 23:06:33.751970: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-16 23:06:33.756902: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-16 23:06:33.757662: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-16 23:06:33.758766: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [26]:
CNN.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [27]:
history_CNN=CNN.fit(X_train, y_train,batch_size=128, epochs=45, verbose=1)

2022-03-16 23:06:34.489335: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/45


2022-03-16 23:06:36.031925: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45


In [28]:
CNN.save('CNN_Model.h5')

In [29]:
import numpy as np

In [30]:
predict_x_CNN=CNN.predict(X_test) 
classes_x_CNN=np.argmax(predict_x_CNN,axis=1)

In [31]:
final_df=pd.DataFrame(test['PhraseId'],columns=['PhraseId'])
final_df['Sentiment']=classes_x_CNN
final_df

Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,3
...,...,...
66287,222348,1
66288,222349,1
66289,222350,1
66290,222351,1


In [32]:
final_df.shape

(66292, 2)

In [33]:
filename='./submission.csv'
final_df.to_csv(filename,index=False)

#### 2. Logistic Regression

In [34]:
LR = OneVsRestClassifier(LogisticRegression())
LR.fit(X_train.astype('float32'),y_train.astype('float32'))
LRPred = LR.predict(X_test.astype('float32'))
LRPred = LRPred.astype('int32')


[W] [23:11:48.785994] L-BFGS: max iterations reached
[W] [23:11:48.786367] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:11:49.584707] L-BFGS: max iterations reached
[W] [23:11:49.585562] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:11:50.532043] L-BFGS: max iterations reached
[W] [23:11:50.533035] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:11:51.497096] L-BFGS: max iterations reached
[W] [23:11:51.498019] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the inpu

In [35]:
print(cuml.metrics.accuracy_score(y_train, LRPred))

0.5083116888999939


In [36]:
final_df=pd.DataFrame(test['PhraseId'],columns=['PhraseId'])
final_df['Sentiment']=LRPred
final_df

Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,2
...,...,...
66287,222348,2
66288,222349,2
66289,222350,2
66290,222351,2


In [37]:
# filename='./submission.csv'
# final_df.to_csv(filename,index=False)

#### 3. Random forest

In [38]:
Model= cuRFC(max_features=1.0,n_bins=8,n_estimators=40)
Model.fit(X_train.astype('float32'),y_train.astype('float32'))
RFPred = Model.predict(X_test.astype('float32'))
RFPRred = RFPred.astype('int32')

In [39]:
print(cuml.metrics.accuracy_score(y_train, RFPred))

0.4602515995502472


In [40]:
final_df=pd.DataFrame(test['PhraseId'],columns=['PhraseId'])
final_df['Sentiment']=RFPred
final_df

Unnamed: 0,PhraseId,Sentiment
0,156061,3.0
1,156062,3.0
2,156063,2.0
3,156064,3.0
4,156065,2.0
...,...,...
66287,222348,2.0
66288,222349,2.0
66289,222350,2.0
66290,222351,2.0


In [41]:
# filename='./submission.csv'
# final_df.to_csv(filename,index=False)

#### 4. Naive Bayes

In [42]:
bayes = MultinomialNB()
bayes.fit(X_train, y_train)
bayesPred=bayes.predict(X_test)

In [43]:
print(cuml.metrics.accuracy_score(y_train, bayesPred ))

0.424983412027359


In [44]:
final_df=pd.DataFrame(test['PhraseId'],columns=['PhraseId'])
final_df['Sentiment']=bayesPred
final_df

Unnamed: 0,PhraseId,Sentiment
0,156061,0
1,156062,0
2,156063,2
3,156064,0
4,156065,1
...,...,...
66287,222348,2
66288,222349,2
66289,222350,2
66290,222351,2


In [45]:
# filename='./submission.csv'
# final_df.to_csv(filename,index=False)