In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import cudf as pd
import cupy as cp
import cuml
import nltk
import re
import string
import keras
import tensorflow
from tensorflow.keras.optimizers import Adam
from cuml.linear_model import LogisticRegression
from cuml.ensemble import RandomForestClassifier as cuRFC
from cuml.naive_bayes import MultinomialNB
from cuml.svm import SVC
from cuml.linear_model import LogisticRegression
from cuml.multiclass import MulticlassClassifier
from cuml.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

#### 1. Reading and Understanding Data

In [None]:
# Load the tab-separated train and test files. `pd` is cuDF in this notebook
# (see the imports), so these reads go through cuDF rather than pandas.
train=pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip',sep='\t')
test=pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip',sep='\t')

In [None]:
# Preview both splits. A cell only renders its final expression automatically,
# so display() is called explicitly; otherwise the train preview is discarded.
display(train.head(), test.head())

In [None]:
# Number of training rows for each Sentiment value.
train['Sentiment'].value_counts()

In [None]:
# Convert both frames with to_pandas() before the Python-level .apply
# preprocessing that follows.
train=train.to_pandas()
test=test.to_pandas()

#### 2. Checking for NULL Values

In [None]:
# Count missing values in each column of the training frame.
train.isnull().sum()

In [None]:
# Count missing values in each column of the test frame.
test.isnull().sum()

#### 3. Make Text lowercase

In [None]:
# Lower-case every phrase in both splits.
train['Phrase'] = train['Phrase'].apply(str.lower)
test['Phrase'] = test['Phrase'].apply(str.lower)

In [None]:
# Inspect the first training rows after lower-casing.
train.head()

#### 3. Remove Punctuation Characters

In [None]:
# Characters to strip from the phrases: the standard library's punctuation set.
punct = string.punctuation
print(punct)

In [None]:
# Build the deletion table once: it is identical for every row, so there is no
# reason to recompute it inside the per-row lambda.
_punct_table = str.maketrans('', '', punct)
train['Phrase'] = train['Phrase'].apply(lambda x: x.translate(_punct_table).lower())
test['Phrase'] = test['Phrase'].apply(lambda x: x.translate(_punct_table).lower())

In [None]:
# Inspect the first training rows after punctuation removal.
train.head()

#### 4. Remove Stop Words

In [None]:
# Lazily-built cache of the English stop-word list, shared by all Stop() calls.
_ENGLISH_STOPWORDS = None


def Stop(text, stop_words=None):
    """Split ``text`` on whitespace and drop stop words.

    Args:
        text: String to filter.
        stop_words: Optional collection of words to remove. When omitted, the
            English list from ``stopwords.words('english')`` is used; it is
            fetched once and cached as a frozenset instead of being
            re-evaluated for every token.

    Returns:
        A new list holding the remaining tokens in their original order.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS
    return [token for token in text.split() if token not in stop_words]

In [None]:
# Replace each phrase with the token list returned by Stop, in both splits.
for frame in (train, test):
    frame['Phrase'] = frame['Phrase'].apply(Stop)

In [None]:
# Single Porter stemmer instance, used by stem_words.
ps=PorterStemmer()

In [None]:
def stem_words(text, stemmer=None):
    """Stem every token in ``text``.

    Args:
        text: Iterable of word tokens.
        stemmer: Optional object exposing ``stem(word)``. Defaults to the
            notebook-level ``ps`` stemmer.

    Returns:
        A new list of stemmed tokens, in input order.
    """
    # Results are collected in a local list: a shared module-level buffer would
    # carry leftover tokens into the next call if stemming raised midway.
    active = ps if stemmer is None else stemmer
    return [active.stem(token) for token in text]

In [None]:
# Stem the token list of every phrase, in both splits.
for frame in (train, test):
    frame['Phrase'] = frame['Phrase'].apply(stem_words)

In [None]:
def join_back(list_input):
    """Concatenate the given tokens into one string, separated by single spaces."""
    separator = " "
    return separator.join(list_input)

In [None]:
# Turn each token list back into a single space-separated string, in both splits.
for frame in (train, test):
    frame['Phrase'] = frame['Phrase'].apply(join_back)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Features are the preprocessed phrases; targets are the Sentiment labels.
X_train =train['Phrase']
y_train = train['Sentiment']
# The tokenizer is fitted on the training phrases only.
tokenize = Tokenizer()
tokenize.fit_on_texts(X_train.values)

In [None]:
X_test = test['Phrase']
# Encode every phrase with the fitted tokenizer.
# NOTE(review): X_train is overwritten here, so re-running this cell on its own
# would pass the already-encoded sequences back into texts_to_sequences.
X_train = tokenize.texts_to_sequences(X_train)
X_test = tokenize.texts_to_sequences(X_test)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Pad every encoded phrase (padding='pre') to the whitespace-token count of the
# longest training phrase.
max_sequence_len = max(len(phrase.split()) for phrase in train['Phrase'])
X_train = pad_sequences(X_train, maxlen=max_sequence_len, padding='pre')
X_test = pad_sequences(X_test, maxlen=max_sequence_len, padding='pre')

## MODELS 

#### 1. CNN

In [None]:
import tensorflow as tf
# Sequential text classifier: embedding -> two Conv1D layers (each followed by
# dropout) -> max-pooling -> flatten -> two dense layers (each followed by
# dropout) -> 5-unit softmax output.
CNN = tf.keras.Sequential([
    # 100-dimensional embeddings; the table has one more row than the tokenizer
    # vocabulary (presumably because index 0 is reserved for padding — TODO confirm).
    tf.keras.layers.Embedding(len(tokenize.word_index)+1, 100, input_length=max_sequence_len),
    tf.keras.layers.Conv1D(128, 2, padding='same',activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 2, padding='same',activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    # One output unit per class; softmax yields a probability for each.
    tf.keras.layers.Dense(5, activation='softmax')
])

In [None]:
# The sparse categorical loss is used with the integer labels in y_train
# (no one-hot encoding is applied anywhere in this notebook).
CNN.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train on the full training set; no validation data is passed, so the reported
# accuracy is training accuracy only. The fit() result is kept in history_CNN.
history_CNN=CNN.fit(X_train, y_train,batch_size=128, epochs=45, verbose=1)

In [None]:
# Persist the trained model to 'CNN_Model.h5' in the working directory.
CNN.save('CNN_Model.h5')

In [None]:
import numpy as np

In [None]:
# Per-class scores for every test phrase, then the index of the highest score per row.
predict_x_CNN = CNN.predict(X_test)
classes_x_CNN = predict_x_CNN.argmax(axis=1)

In [None]:
# Submission frame: the test PhraseId column plus the CNN's predicted class.
# NOTE(review): `pd` is cuDF while `test` was converted to pandas earlier —
# confirm this constructor call behaves as intended.
final_df=pd.DataFrame(test['PhraseId'],columns=['PhraseId'])
final_df['Sentiment']=classes_x_CNN
final_df

In [None]:
# Write the submission without the index column.
# NOTE(review): every model section writes to this same path, so a later
# section overwrites this file.
filename='./submit.csv'
final_df.to_csv(filename,index=False)

#### 2. Logistic Regression

In [None]:
# One-vs-rest logistic regression fitted on the padded sequences, with inputs
# and labels cast to float32.
LR = OneVsRestClassifier(LogisticRegression())
LR.fit(X_train.astype('float32'),y_train.astype('float32'))
# Predictions for the test set, cast to int32.
LRPred = LR.predict(X_test.astype('float32'))
LRPred = LRPred.astype('int32')


In [None]:
# Training accuracy. y_train only lines up with predictions made on X_train;
# LRPred holds predictions for the test set, whose rows do not correspond to
# these labels, so it cannot be scored against them.
LRTrainPred = LR.predict(X_train.astype('float32')).astype('int32')
print(cuml.metrics.accuracy_score(y_train, LRTrainPred))

In [None]:
# Submission frame: the test PhraseId column plus the logistic-regression
# predictions. (The name is LRPred; the misspelt `LRPRed` raised NameError.)
final_df=pd.DataFrame(test['PhraseId'],columns=['PhraseId'])
final_df['Sentiment']=LRPred
final_df

In [None]:
# Write the submission without the index column.
# NOTE(review): every model section writes to this same path, so the file
# holds whichever section ran last.
filename='./submit.csv'
final_df.to_csv(filename,index=False)

#### 3. Random forest

In [None]:
# Random forest fitted on the padded sequences, with inputs and labels cast to float32.
Model= cuRFC(max_features=1.0,n_bins=8,n_estimators=40)
Model.fit(X_train.astype('float32'),y_train.astype('float32'))
RFPred = Model.predict(X_test.astype('float32'))
# Store the int32 cast back into RFPred; assigning it to a misspelt name left
# RFPred uncast, and that uncast array is what gets written to the submission.
RFPred = RFPred.astype('int32')

In [None]:
# Training accuracy. y_train only lines up with predictions made on X_train;
# RFPred holds predictions for the test set, whose rows do not correspond to
# these labels, so it cannot be scored against them.
RFTrainPred = Model.predict(X_train.astype('float32')).astype('int32')
print(cuml.metrics.accuracy_score(y_train, RFTrainPred))

In [None]:
# Submission frame: the test PhraseId column plus the random-forest predictions.
final_df=pd.DataFrame(test['PhraseId'],columns=['PhraseId'])
final_df['Sentiment']=RFPred
final_df

In [None]:
# Write the submission without the index column.
# NOTE(review): every model section writes to this same path, so the file
# holds whichever section ran last.
filename='./submit.csv'
final_df.to_csv(filename,index=False)

#### 4. Naive Bayes

In [None]:
# Multinomial naive Bayes fitted directly on the padded index sequences.
# NOTE(review): the features here are word indices rather than counts —
# confirm this is the intended input for this model.
bayes = MultinomialNB()
bayes.fit(X_train, y_train)
bayesPred=bayes.predict(X_test)

In [None]:
# Training accuracy. y_train only lines up with predictions made on X_train;
# bayesPred holds predictions for the test set, whose rows do not correspond to
# these labels, so it cannot be scored against them.
bayesTrainPred = bayes.predict(X_train)
print(cuml.metrics.accuracy_score(y_train, bayesTrainPred))

In [None]:
# Submission frame: the test PhraseId column plus the naive-Bayes predictions.
final_df=pd.DataFrame(test['PhraseId'],columns=['PhraseId'])
final_df['Sentiment']=bayesPred
final_df

In [None]:
# Write the submission without the index column.
# NOTE(review): every model section writes to this same path; this is the
# last write visible in the notebook.
filename='./submit.csv'
final_df.to_csv(filename,index=False)