#### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

#### Loading the dataset

In [None]:
# Read the labelled movie reviews from disk and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='movies_sentiment_data.csv')
data.head(n=5)

Unnamed: 0,review,sentiment
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive
1,I enjoyed the movie and the story immensely! I...,positive
2,I had a hard time sitting through this. Every ...,negative
3,It's hard to imagine that anyone could find th...,negative
4,This is one military drama I like a lot! Tom B...,positive


In [None]:
# (rows, columns) of the dataset as loaded, before any cleaning.
data.shape

(19000, 2)

In [None]:
data['sentiment'].unique()  # list the distinct labels present in the 'sentiment' column

array(['positive', 'negative'], dtype=object)

In [None]:
data.isnull().sum()  # count missing values per column

review       0
sentiment    0
dtype: int64

In [None]:
data.duplicated().sum()  # number of rows that are exact repeats of an earlier row

61

In [None]:
# Remove exact duplicate rows. The explicit .copy() detaches the result from
# the original frame; without it pandas still treats `data` as a potential
# slice and emits SettingWithCopyWarning when new columns are assigned later.
data = data.drop_duplicates().copy()

In [None]:
# Encode the label as an integer target: 1 for 'positive', 0 for anything else.
# assign() returns a brand-new DataFrame instead of writing into a frame that
# pandas may consider a slice, so no SettingWithCopyWarning is raised; the
# vectorised comparison replaces the per-row Python lambda.
data = data.assign(Category=data['sentiment'].eq('positive').astype('int64'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Category'] = data['sentiment'].apply(lambda x : 1 if x == 'positive' else 0)


In [None]:
# Preview the frame to confirm the numeric 'Category' column sits next to 'sentiment'.
data.head()

Unnamed: 0,review,sentiment,Category
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive,1
1,I enjoyed the movie and the story immensely! I...,positive,1
2,I had a hard time sitting through this. Every ...,negative,0
3,It's hard to imagine that anyone could find th...,negative,0
4,This is one military drama I like a lot! Tom B...,positive,1


In [None]:
# Class balance of the encoded target (1 = positive, 0 = negative).
data['Category'].value_counts()

Category
1    9481
0    9458
Name: count, dtype: int64

#### Preprocessing the data

In [None]:
# Download spaCy's small English pipeline; it is loaded below with spacy.load('en_core_web_sm').
!spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m83.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy

# The preprocessing step only reads token-level attributes (is_punct, is_stop,
# lemma_), so the dependency parser and the named-entity recogniser are
# disabled. The tagger/attribute_ruler/lemmatizer chain that produces lemmas is
# left intact; skipping the two unused components makes processing the full
# review corpus considerably faster.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
def preprocess(text):
    """Normalise one review for bag-of-words style models.

    The text is lower-cased and run through the spaCy pipeline ``nlp``;
    punctuation tokens and stop words are discarded, and the lemmas of the
    remaining tokens are returned joined by single spaces.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text.lower())
        if not (tok.is_punct or tok.is_stop)
    ]
    return " ".join(lemmas)

In [None]:
# Clean every review with preprocess() and keep the result in a new column.
data['preprocessed_review'] = data['review'].map(preprocess)

In [None]:
# Preview the frame to compare each raw review with its preprocessed version.
data.head()

Unnamed: 0,review,sentiment,Category,preprocessed_review
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive,1,see jake gyllenhaal jarhead 2005 little watch ...
1,I enjoyed the movie and the story immensely! I...,positive,1,enjoy movie story immensely see original(1939 ...
2,I had a hard time sitting through this. Every ...,negative,0,hard time sit single twist turn predictable si...
3,It's hard to imagine that anyone could find th...,negative,0,hard imagine find short favorite see short kno...
4,This is one military drama I like a lot! Tom B...,positive,1,military drama like lot tom berenger play mili...


In [None]:
# Hold out 20% of the reviews for testing. Stratifying on the label keeps the
# positive/negative ratio the same in both partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['preprocessed_review'],
    data['Category'],
    stratify=data['Category'],
    test_size=0.2,
    random_state=2022,
)

In [None]:
# Report how many samples ended up in each partition.
for label, split in (("Shape of X_train: ", X_train), ("Shape of X_test: ", X_test)):
    print(label, split.shape)

Shape of X_train:  (15151,)
Shape of X_test:  (3788,)


#### Model Training and Evaluation

##### Random Forest Classifier

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

#1. create a pipeline object: tri-gram counts feeding a random forest.
#   random_state is pinned (same seed as the train/test split) so the forest,
#   and therefore the report below, is reproducible across re-runs.
clf = Pipeline([
    ('vectorizer_tri_grams', CountVectorizer(ngram_range = (3, 3))),                       #using the ngram_range parameter
    ('random_forest', RandomForestClassifier(random_state=2022))
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.33      0.47      1892
           1       0.58      0.91      0.71      1896

    accuracy                           0.62      3788
   macro avg       0.68      0.62      0.59      3788
weighted avg       0.68      0.62      0.59      3788



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


#1. create a pipeline object: TF-IDF features feeding a random forest.
#   random_state is pinned (same seed as the train/test split) so the forest,
#   and therefore the report below, is reproducible across re-runs.
clf1 = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),
     ('Random Forest', RandomForestClassifier(random_state=2022))
])

#2. fit with X_train and y_train
clf1.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf1.predict(X_test)


#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.86      0.85      1892
           1       0.86      0.85      0.85      1896

    accuracy                           0.85      3788
   macro avg       0.85      0.85      0.85      3788
weighted avg       0.85      0.85      0.85      3788



##### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Pipeline: unigram + bigram counts (ngram_range 1..2) -> multinomial Naive Bayes.
clf2 = Pipeline(steps=[
    ('vectorizer_bigrams', CountVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
])

# Train on the training split, then predict the held-out split.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
y_pred = clf2.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88      1892
           1       0.89      0.86      0.87      1896

    accuracy                           0.88      3788
   macro avg       0.88      0.88      0.88      3788
weighted avg       0.88      0.88      0.88      3788



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Pipeline: TF-IDF weighted term features -> multinomial Naive Bayes.
clf3 = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB()),
])

# Learn from the training split.
clf3.fit(X_train, y_train)

# Predict the held-out split and summarise per-class precision / recall / F1.
y_pred = clf3.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1892
           1       0.89      0.84      0.86      1896

    accuracy                           0.87      3788
   macro avg       0.87      0.87      0.87      3788
weighted avg       0.87      0.87      0.87      3788



##### Support Vector Classifier

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Pipeline: TF-IDF features -> support vector classifier with a linear kernel.
clf4 = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('svm', SVC(kernel='linear')),
])

# Train on the training split, then predict the held-out split.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
y_pred = clf4.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      1892
           1       0.87      0.89      0.88      1896

    accuracy                           0.88      3788
   macro avg       0.88      0.88      0.88      3788
weighted avg       0.88      0.88      0.88      3788



##### K Nearest Neighbour

In [None]:
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# Pipeline: TF-IDF features -> k-nearest-neighbours classifier
# (10 neighbours, Euclidean distance).
clf5 = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier(n_neighbors=10, metric='euclidean')),
])

# Fit on the training split.
clf5.fit(X_train, y_train)

# Predict the held-out split and summarise per-class precision / recall / F1.
y_pred = clf5.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.82      0.77      1892
           1       0.79      0.70      0.74      1896

    accuracy                           0.76      3788
   macro avg       0.76      0.76      0.76      3788
weighted avg       0.76      0.76      0.76      3788



##### Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# 1. Create a pipeline object with TfidfVectorizer and Gradient Boosting classifier.
#    It gets its own name (clf6) so it no longer overwrites the KNN pipeline
#    stored in clf5, and random_state is pinned so the result is reproducible.
clf6 = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('gb', GradientBoostingClassifier(random_state=2022))
])

# 2. Fit the pipeline with X_train and y_train
clf6.fit(X_train, y_train)

# 3. Get the predictions for X_test and store it in y_pred
y_pred = clf6.predict(X_test)

# 4. Print the classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.84      0.74      0.79      1892
           1       0.77      0.86      0.81      1896

    accuracy                           0.80      3788
   macro avg       0.81      0.80      0.80      3788
weighted avg       0.81      0.80      0.80      3788



##### Trial with LSTM (*needs further improvement*)

In [None]:
!pip install keras



In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Activation, Flatten, GlobalMaxPooling1D, Conv1D


In [None]:
# Define parameters
vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

# Split the texts BEFORE fitting the tokenizer, so the vocabulary is learned
# from the training reviews only. Fitting on the full corpus would leak
# test-set vocabulary into training; unseen test words now map to oov_tok.
train_texts, test_texts, y_train, y_test = train_test_split(
    data['preprocessed_review'],
    data['Category'],
    test_size=0.2,
    random_state=42,
    stratify=data['Category'],
)

# Tokenize: the vocabulary comes from the training texts only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index


def _to_padded(texts):
    """Convert texts to integer sequences and pad/truncate them to max_length."""
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )


X_train = _to_padded(train_texts)
X_test = _to_padded(test_texts)

# Convert labels to numpy array
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
# Assemble the network one layer at a time:
# embedding -> bidirectional LSTM -> small dense layer -> sigmoid output.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(24, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # single probability for the positive class

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           160000    
                                                                 
 bidirectional (Bidirection  (None, 128)               41472     
 al)                                                             
                                                                 
 dense (Dense)               (None, 24)                3096      
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 204593 (799.19 KB)
Trainable params: 204593 (799.19 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Train the model.
# Validation uses a slice of the training data (validation_split) rather than
# the test set, so the test set stays untouched for the final report.
# EarlyStopping halts training once val_loss stops improving and restores the
# best weights, which prevents running on into the over-fitting epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=2,
)

Epoch 1/10
474/474 - 54s - loss: 0.4546 - accuracy: 0.7798 - val_loss: 0.3379 - val_accuracy: 0.8582 - 54s/epoch - 114ms/step
Epoch 2/10
474/474 - 41s - loss: 0.2597 - accuracy: 0.9014 - val_loss: 0.3372 - val_accuracy: 0.8606 - 41s/epoch - 86ms/step
Epoch 3/10
474/474 - 42s - loss: 0.1858 - accuracy: 0.9330 - val_loss: 0.3730 - val_accuracy: 0.8582 - 42s/epoch - 90ms/step
Epoch 4/10
474/474 - 42s - loss: 0.1312 - accuracy: 0.9541 - val_loss: 0.4759 - val_accuracy: 0.8522 - 42s/epoch - 88ms/step
Epoch 5/10
474/474 - 42s - loss: 0.0916 - accuracy: 0.9700 - val_loss: 0.4794 - val_accuracy: 0.8487 - 42s/epoch - 89ms/step
Epoch 6/10
474/474 - 42s - loss: 0.0742 - accuracy: 0.9766 - val_loss: 0.7177 - val_accuracy: 0.8390 - 42s/epoch - 90ms/step
Epoch 7/10
474/474 - 42s - loss: 0.0558 - accuracy: 0.9824 - val_loss: 0.7240 - val_accuracy: 0.8316 - 42s/epoch - 89ms/step
Epoch 8/10
474/474 - 41s - loss: 0.0510 - accuracy: 0.9827 - val_loss: 0.7210 - val_accuracy: 0.8353 - 41s/epoch - 87ms/step

In [None]:
# Predicted scores for the held-out reviews (sigmoid outputs of the network).
y_pred = model.predict(X_test)
# Scores strictly above 0.5 are labelled 1 (positive), the rest 0 (negative).
y_pred_classes = np.greater(y_pred, 0.5).astype("int32")

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred_classes))

              precision    recall  f1-score   support

           0       0.84      0.82      0.83      1892
           1       0.83      0.85      0.84      1896

    accuracy                           0.84      3788
   macro avg       0.84      0.84      0.84      3788
weighted avg       0.84      0.84      0.84      3788



#### Saving the best-performing model

In [None]:
import pickle

# Serialise the fitted TF-IDF + linear SVC pipeline (clf4) to disk.
with open('sentiment_model_movie.pkl', 'wb') as model_file:
    pickle.dump(clf4, model_file)

In [None]:
# Write the cleaned frame (including the added columns) to CSV without the index column.
data.to_csv(
    'clean_movie_data.csv',
    index=False,
)