# Review sentiment prediction: training an LSTM classifier on the provided dataset

Importing Required libraries

In [1]:
import pandas as pd
import numpy as np

import os, re
import warnings
# Suppresses every Python warning from here on, for the whole session.
# NOTE(review): this also hides deprecation notices from the libraries used
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings(action='ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score,precision_score

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

Loading the provided train and test datasets and the sample submission file

In [2]:
# Load the three competition CSVs from the working directory:
# the training rows, the test rows, and the sample-submission template.
train, test, sub = map(
    pd.read_csv,
    (
        "TRAIN Review Sentiment Prediction.csv",
        "TEST Review Sentiment Prediction.csv",
        "sample_submission Review Sentiment Prediction.csv",
    ),
)

Verifying Dataset

In [3]:
# Preview the first rows of the training data (columns: id, text, category).
train.head()

Unnamed: 0,id,text,category
0,3973,1047 6700 4178 2912 183 6700,0
1,4462,4035 4563 1994 154 5373 4956 6035 4393 4017 40...,1
2,2889,5035 7685 3139 4407 254 584 6589 8518 3078,0
3,3457,1047 2540 2578 6700 2242 230 3771 4676 5881 64...,0
4,1555,7685 5868 237 6794 3078,0


In [4]:
# Print column dtypes and non-null counts to check for missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2598 entries, 0 to 2597
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        2598 non-null   int64 
 1   text      2598 non-null   object
 2   category  2598 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 61.0+ KB


Merging the training and test sets and computing the word count of each text

In [5]:
# Stack the train and test rows into a single frame with a fresh 0..n-1 index.
# NOTE(review): the next cell repeats this exact statement, so this cell is redundant.
merge = pd.concat([train,test]).reset_index(drop=True)

In [6]:
# Combine train and test rows so the same feature is computed for both.
merge = pd.concat([train, test]).reset_index(drop=True)
# Number of \w+ tokens in each text. The vectorised .str.count replaces a
# row-wise apply(..., axis=1) + re.findall: it yields the same counts without
# a Python-level call per row, and a missing text becomes NaN instead of
# raising a TypeError.
merge["description_word_len"] = merge["text"].str.count(r"\w+")

In [7]:
# Preview the merged frame, including the new description_word_len column.
merge.head()

Unnamed: 0,id,text,category,description_word_len
0,3973,1047 6700 4178 2912 183 6700,0.0,6
1,4462,4035 4563 1994 154 5373 4956 6035 4393 4017 40...,1.0,29
2,2889,5035 7685 3139 4407 254 584 6589 8518 3078,0.0,9
3,3457,1047 2540 2578 6700 2242 230 3771 4676 5881 64...,0.0,20
4,1555,7685 5868 237 6794 3078,0.0,5


Importing TensorFlow and checking its version

In [8]:
import tensorflow as tf
import tensorflow.keras.backend as K
# StratifiedKFold is already imported in the first cell; this re-import is
# redundant but harmless.
from sklearn.model_selection import StratifiedKFold
# NOTE(review): `K` and `tokenizers` are not used in any cell visible in this
# notebook — confirm before removing.
import tokenizers
# Print the TensorFlow version the notebook is running against.
print('TF version',tf.__version__)

TF version 2.4.0


In [9]:
# `data` is a second name for the same DataFrame object as `merge` (no copy is made).
data = merge

Splitting the merged data back into labelled training rows and unlabelled test rows, and dropping the empty `category` column from the test rows

In [10]:
# Rows with a category value are the labelled training rows; rows where it is
# missing are treated as the test rows.
has_label = data["category"].notna()
df_train = data[has_label]
# Build the test frame with a non-mutating drop. Calling
# drop(..., inplace=True) on a filtered slice is the pattern pandas reports
# as SettingWithCopyWarning, so a new frame is created here instead.
df_test = data[~has_label].drop(columns="category")

In [11]:
# Pull the raw text and the labels out of the frames as NumPy arrays.
X_train = df_train['text'].to_numpy()
Y_train = df_train['category'].to_numpy()
X_test = df_test['text'].to_numpy()

In [12]:
# Sanity check: the text and label arrays should have the same length.
X_train.shape,Y_train.shape

((2598,), (2598,))

Importing libraries

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Tokenizing the text (the vocabulary is fitted on the training text only)

In [14]:
# Fit the vocabulary on the training text only, then map both the training
# and the test text to sequences of integer indices using that vocabulary.
t = Tokenizer()
t.fit_on_texts(X_train)
X_train_token, X_test_token = (
    t.texts_to_sequences(texts) for texts in (X_train, X_test)
)

Padding every token sequence to the same fixed length and checking the shape of the result

In [15]:
# Fixed sequence length fed to the model; the Embedding layer reads this value too.
sent_length = 50
# Same padding settings for train and test: pad at the front up to sent_length.
pad_kwargs = dict(padding='pre', maxlen=sent_length)
X_train_token_pad = pad_sequences(X_train_token, **pad_kwargs)
X_test_token_pad = pad_sequences(X_test_token, **pad_kwargs)

In [16]:
# Confirm the padded training matrix is (n_samples, sent_length).
X_train_token_pad.shape

(2598, 50)

Importing the Keras layers and building the LSTM model

In [17]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense,SpatialDropout1D,Dropout, LSTM,Embedding
# NOTE(review): the standalone `keras` imports below rebind Sequential,
# Embedding, LSTM, Dense, SpatialDropout1D and Dropout, so the model cell uses
# the `keras` versions rather than the `tensorflow.keras` ones imported above.
# Pick one namespace to avoid mixing the two packages.
# Conv2D and Flatten are not used in any cell visible in this notebook.
import keras
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense,SpatialDropout1D,Dropout

In [18]:
# Build the classifier: embedding -> LSTM -> two dense blocks -> 2-way softmax.
embedding_vector_features = 100      # width of each learned token embedding
# Tokenizer word indices start at 1, so the embedding table needs one extra row.
voc_size = len(t.word_index) + 1

model = Sequential()

# Map each of the sent_length token ids to a dense vector, with dropout
# applied to whole embedding channels.
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(300, dropout=0.3, recurrent_dropout=0.3))

# Two identical fully connected blocks on top of the LSTM output.
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.3))

# Two output units + categorical cross-entropy: the labels are one-hot encoded
# with to_categorical before fitting.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 100)           736300    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 50, 100)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 300)               481200    
_________________________________________________________________
dense (Dense)                (None, 1024)              308224    
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0

In [19]:
# Shapes of the padded features and the labels before the validation split.
X_train_token_pad.shape,Y_train.shape

((2598, 50), (2598,))

In [20]:
from sklearn.model_selection import train_test_split
# Use the public tf.keras helper: `keras.utils.np_utils` is a private module
# path that is not available in newer Keras releases.
from tensorflow.keras.utils import to_categorical

# Hold out 33% of the labelled rows for validation (fixed seed for a repeatable split).
# NOTE(review): this rebinds X_train_token_pad to the training portion only, so
# re-running this cell without first re-running the padding cell would split
# the already-reduced array against the full Y_train. Run the notebook top to bottom.
X_train_token_pad, X_val_token_pad, Y_train_token_pad, Y_val_token_pad = train_test_split(
    X_train_token_pad, Y_train, test_size=0.33, random_state=42
)

# One-hot encode the labels for the 2-unit softmax output. num_classes is
# pinned so both arrays always have two columns, even if a split happened to
# contain a single class.
Y_train_token_pad = to_categorical(Y_train_token_pad, num_classes=2)
Y_val_token_pad = to_categorical(Y_val_token_pad, num_classes=2)


Training the model and predicting labels for the test set

In [None]:
# Train for 8 epochs in batches of 64, evaluating on the held-out validation
# split after every epoch.
model.fit(
    X_train_token_pad,
    Y_train_token_pad,
    validation_data=(X_val_token_pad, Y_val_token_pad),
    epochs=8,
    batch_size=64,
)

# predict() returns one row of softmax probabilities per test sample; argmax
# over the last axis gives the predicted class index. This replaces
# Sequential.predict_classes, which is deprecated and absent from newer
# TensorFlow releases, and produces the same class indices.
Y_pred_token_pre = np.argmax(model.predict(X_test_token_pad), axis=-1)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8

In [None]:
# Load the sample-submission template, fill its category column with the
# predicted classes, and write the result without the index column.
submission = pd.read_csv("sample_submission Review Sentiment Prediction.csv").assign(
    category=Y_pred_token_pre
)
submission.to_csv('Sample_Submission.csv', index=False)