In [None]:
import numpy as np
import pandas as pd
import re

## LOADING DATASETS

In [None]:
# Location of the training CSV inside the Kaggle input directory.
data_url = "/kaggle/input/sentiment-prediction-on-movie-reviews/train.csv"
# Parse the file into the `train` DataFrame.
train = pd.read_csv(filepath_or_buffer=data_url)

In [None]:
# Location of the movie metadata CSV inside the Kaggle input directory.
data_url = "/kaggle/input/sentiment-prediction-on-movie-reviews/movies.csv"
# Parse the file into the `movies` DataFrame.
movies = pd.read_csv(filepath_or_buffer=data_url)

## DATA CLEANING AND FILLING NAN VALUES

In [None]:
# Percentage of missing values in each column of movies
# (isna is the same check as isnull).
movies.isna().mean().mul(100)

In [None]:
# Number of rows per movieid; any count above 1 means the id is duplicated.
movies['movieid'].value_counts()

In [None]:
# Keep a single row per movieid (drop_duplicates keeps the first occurrence by
# default) so the later left merge on movieid cannot duplicate review rows.
movies=movies.drop_duplicates(subset=['movieid'])

In [None]:
# (rows, columns) of movies after removing duplicate movieids.
movies.shape

In [None]:
# DROPPED THE COLUMNS BELOW BECAUSE EACH CONTAINED MORE THAN 40% NULL VALUES

In [None]:
# Remove eight metadata columns from movies. drop(labels=..., axis=1) is the
# column-wise form of drop(columns=...).
movies=movies.drop(
    labels=['title','rating','ratingContents','releaseDateTheaters',
            'releaseDateStreaming','boxOffice','distributor','soundType'],
    axis=1,
)

In [None]:
# Left join on movieid: every row of train is kept and receives the movie
# metadata when movies has a matching movieid (NaN otherwise).
train_merged=pd.merge(train,movies,how='left',on='movieid')

In [None]:
# Frequency of each distinct genre value in the merged frame (value_counts
# skips NaN by default).
train_merged['genre'].value_counts()

In [None]:
# Frequency of each distinct originalLanguage value in the merged frame
# (value_counts skips NaN by default).
train_merged['originalLanguage'].value_counts()

In [None]:
# Fill missing text/categorical values column by column: empty string for
# reviewText and director, "Drama" for genre, "English" for originalLanguage.
train_merged=train_merged.fillna(value={
    'reviewText':'',
    'genre':"Drama",
    'originalLanguage':"English",
    'director':"",
})

In [None]:
# Impute each numeric column with its own mean. Passing a single scalar to
# DataFrame.fillna would write that one value into the NaNs of every column
# (so runtimeMinutes would receive the audienceScore mean); a per-column
# mapping keeps the two imputations independent.
train_merged=train_merged.fillna(value={
    'audienceScore':train_merged['audienceScore'].mean(),
    'runtimeMinutes':train_merged['runtimeMinutes'].mean(),
})

In [None]:
# Bind the merged training frame to `df`, the name read by the correlation cell.
df=pd.DataFrame(train_merged)

In [None]:
# Pairwise correlation of the numeric and boolean columns only. The frame also
# carries string columns (e.g. genre, director) that recent pandas versions
# refuse to convert inside DataFrame.corr, so they are filtered out first.
correlation_matrix = df.select_dtypes(include=['number','bool']).corr()

In [None]:
# Display the correlation table computed in the previous cell.
correlation_matrix

In [None]:
# Since no two columns are more than 90% correlated, multicollinearity is not an issue.

## DATA VISUALIZATION

In [None]:
import matplotlib.pyplot as plt
# Histogram of audienceScore in 10 equal-width bins. Axis labels and a title
# make the figure readable on its own; plt.show() renders it without echoing
# the tuple returned by plt.hist.
plt.figure(figsize=(8, 6))
plt.hist(train_merged['audienceScore'], bins=10)
plt.xlabel('audienceScore')
plt.ylabel('Number of rows')
plt.title('Distribution of audienceScore')
plt.show()

In [None]:
# This shows that most audiences rated their movie between 60 and 75.

In [None]:
# Histogram of runtimeMinutes in 10 equal-width bins, with axis labels and a
# title so the figure is readable on its own.
plt.figure(figsize=(8, 6))
plt.hist(train_merged['runtimeMinutes'], bins=10)
plt.xlabel('runtimeMinutes')
plt.ylabel('Number of rows')
plt.title('Distribution of runtimeMinutes')
plt.show()

In [None]:
# This shows that most of the movies are between 80 and 110 minutes long.

In [None]:
# Separate copy of the merged frame (DataFrame.copy is deep by default) used
# for the exploratory plots, so train_merged itself is not touched by them.
train_merged_copy=train_merged.copy()

In [None]:
# Integer version of isFrequentReviewer for the histogram in the next cell.
# NOTE(review): presumably a boolean flag that becomes 0/1 — confirm the dtype.
a=train_merged_copy['isFrequentReviewer'].astype(int)

In [None]:
# Distribution of the integer-cast isFrequentReviewer values, labelled so the
# figure can be read without the surrounding cells.
plt.hist(a)
plt.xlabel('isFrequentReviewer (cast to int)')
plt.ylabel('Number of rows')
plt.title('Distribution of isFrequentReviewer')
plt.show()

In [None]:
# isFrequentReviewer tells us whether the reviewer (reviewerName) reviews movies frequently or not.
# As we can see, the ratio between 0 and 1 is roughly 2:1.

In [None]:
# Raw sentiment labels; they are integer-encoded in the next cell for plotting.
b=train_merged_copy['sentiment']

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the label-to-integer mapping, then apply it (the two-step form of
# fit_transform). `le` stays bound to the fitted encoder and `b` becomes the
# integer-coded labels.
le=LabelEncoder()
le.fit(b)
b=le.transform(b)

In [None]:
# Distribution of the integer-encoded sentiment labels, labelled so the figure
# can be read without the surrounding cells.
plt.hist(b)
plt.xlabel('sentiment (label-encoded)')
plt.ylabel('Number of rows')
plt.title('Distribution of sentiment')
plt.show()

In [None]:
# The sentiment column tells us whether the reviewer's sentiment towards the
# specific movie (movieid) is positive (1) or negative (0).
# As we can see, the ratio between the two classes is 2:1.

##  DATA PREPROCESSING

In [None]:
# Pull out the review text (model input) and the sentiment labels (target)
# before those columns are dropped from train_merged.
text,y=train_merged['reviewText'],train_merged['sentiment']

In [None]:
# Drop the id, the raw text and the target; the remaining columns feed the
# tabular encoders.
train_merged=train_merged.drop(labels=['movieid','reviewText','sentiment'],axis=1)

In [None]:
def text_cleaner(text: str) -> str:
  """Normalise one review string.

  Two substitutions are applied in order:
  1. every run of digits that directly follows a space is removed together
     with that space;
  2. any character repeated three or more times in a row is collapsed to a
     single occurrence (e.g. "sooooo" -> "so").

  Parameters:
    text: the review text to clean.

  Returns:
    The cleaned string.
  """
  text=re.sub(r' \d+' , '' , text)
  # In a raw string the backreference is written \1; a doubled backslash would
  # make the regex look for a literal backslash and never match repeated
  # characters.
  pattern=r"(.)\1{2,}"
  text=re.sub(pattern, r"\1", text)
  return text

In [None]:
# Clean every review string element-wise with text_cleaner.
text=text.map(text_cleaner)

In [None]:
# Column groups consumed by the ColumnTransformer cells:
# `one_hot` columns are one-hot encoded, `scale` columns are numerically scaled.
one_hot=['reviewerName','isFrequentReviewer']
scale=['audienceScore','runtimeMinutes']

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the sentiment target as integers in two steps (fit, then transform).
# `le` is rebound to this fitted encoder, which the submission section uses to
# map predictions back to the original label strings.
le=LabelEncoder()
le.fit(y)
y=le.transform(y)

In [None]:
from sklearn.preprocessing import OneHotEncoder,RobustScaler,MultiLabelBinarizer,MaxAbsScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer

In [None]:
# Bag-of-words counts over unigrams and bigrams. The vocabulary is learned
# from all of `text`, i.e. before any train/validation split.
vector=CountVectorizer(ngram_range=(1,2))
text_vect = vector.fit_transform(text)

In [None]:
# Encode the tabular columns: one-hot for the `one_hot` columns, max-abs
# scaling for the `scale` columns. Columns not listed are dropped
# (ColumnTransformer's default `remainder`).
# An earlier variant used RobustScaler for the 'scale' step and is left disabled.
col_01=ColumnTransformer(
    transformers=[
        ('one',OneHotEncoder(),one_hot),
        ('scale',MaxAbsScaler(),scale),
    ]
)
train_transform=col_01.fit_transform(train_merged)

In [None]:
# Binary indicator matrices for genre, language and director. The same
# binarizer object is refit for each column, so after this cell `mul` only
# holds the classes of the last column (director).
# NOTE(review): MultiLabelBinarizer treats each sample as a collection of
# labels; these columns hold plain strings, so each character becomes a label.
# Confirm whether splitting every cell into separate names was intended.
mul=MultiLabelBinarizer()
genre_trans=mul.fit_transform(train_merged['genre'])
lang_trans=mul.fit_transform(train_merged['originalLanguage'])
director_trans=mul.fit_transform(train_merged['director'])

In [None]:
from scipy.sparse import csr_matrix
import scipy.sparse as sp
# Convert the three dense indicator arrays to CSR and join them column-wise in
# the order genre, language, director.
genre_matrix,lang_matrix,director_matrix=[
    csr_matrix(dense) for dense in (genre_trans,lang_trans,director_trans)
]
multi_trans=sp.hstack(blocks=[genre_matrix,lang_matrix,director_matrix])

In [None]:
# Full feature matrix; column order is multi-label indicators, then
# one-hot/scaled tabular columns, then n-gram counts.
final_train_data=sp.hstack([multi_trans,train_transform,text_vect])

## IMPLEMENTING MODELS ON TRAIN DATASET

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed. The vectorizer, encoders and scaler were fitted
# on the full data before this split, so x_test rows already influenced the
# feature definitions; scores on x_test are therefore somewhat optimistic.
x_train,x_test,y_train,y_test=train_test_split(final_train_data,y,test_size=0.2,random_state=32)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Logistic regression: inverse regularisation strength C crossed with solver.
param_grid=[
    {'C':[0.1,0.01,10,1],
     'solver':['liblinear','saga']
    }
]


# Multinomial naive Bayes: smoothing strength and whether class priors are learned.
param_grid_01= {
    'alpha': [0.1, 1.0, 2.0],
    'fit_prior': [True, False]
}

# LinearSVC: a list of sub-grids restricted to combinations the estimator
# supports. penalty='l1' cannot be combined with loss='hinge', and it requires
# dual=False; a full cross product would make those fits fail and their
# candidates be scored as NaN. GridSearchCV accepts a list of dicts as param_grid.
param_grid_02= [
    {'C': [0.1, 1.0], 'loss': ['hinge', 'squared_hinge'], 'penalty': ['l2']},
    {'C': [0.1, 1.0], 'loss': ['squared_hinge'], 'penalty': ['l1'], 'dual': [False]},
]

In [None]:
# Default-configured estimators; the grid searches below tune their
# hyper-parameters.
log_reg=LogisticRegression()
mnb=MultinomialNB()
svc=LinearSVC()

In [None]:
# All three searches use 3-fold cross-validation, print progress and run on
# every available core. No `scoring` argument is given, so candidates are
# ranked with each estimator's own `score` method (mean accuracy for
# classifiers); the loss an estimator optimises while fitting is a separate matter.

# Logistic regression over C and solver.
clf=GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    cv=3,
    verbose=True,
    n_jobs=-1,
)

# Multinomial naive Bayes over the smoothing strength alpha and fit_prior.
clf_01=GridSearchCV(
    estimator=mnb,
    param_grid=param_grid_01,
    cv=3,
    verbose=True,
    n_jobs=-1,
)

# LinearSVC over C, loss and penalty; the loss function is itself one of the
# searched hyper-parameters here.
clf_02=GridSearchCV(
    estimator=svc,
    param_grid=param_grid_02,
    cv=3,
    verbose=True,
    n_jobs=-1,
)

In [None]:
# Run the logistic-regression search on the training split. fit returns the
# search object itself, so best_clf and clf refer to the same object.
best_clf=clf.fit(x_train,y_train)

In [None]:
# Run the naive Bayes search on the training split (best_clf_mnb is clf_01).
best_clf_mnb=clf_01.fit(x_train,y_train)

In [None]:
# Run the LinearSVC search on the training split (best_clf_svc is clf_02).
best_clf_svc=clf_02.fit(x_train,y_train)

In [None]:
# Hyper-parameter combination with the best cross-validated score (logistic regression).
best_clf.best_params_

In [None]:
# Hyper-parameter combination with the best cross-validated score (LinearSVC).
best_clf_svc.best_params_

In [None]:
# Hyper-parameter combination with the best cross-validated score (naive Bayes).
best_clf_mnb.best_params_

In [None]:
# Logistic regression with hand-set hyper-parameters (both values are in param_grid).
# NOTE(review): presumably copied from best_clf.best_params_ — confirm they
# still match the search output.
log_reg=LogisticRegression(C=1,solver='saga')

In [None]:
# LinearSVC with hand-set hyper-parameters (all values are in param_grid_02).
# NOTE(review): presumably copied from best_clf_svc.best_params_ — confirm.
svc=LinearSVC(C=0.1,loss='squared_hinge',penalty='l2')

In [None]:
# Naive Bayes with hand-set hyper-parameters (both values are in param_grid_01).
# NOTE(review): presumably copied from best_clf_mnb.best_params_ — confirm.
mnb=MultinomialNB(alpha=0.1,fit_prior=True)

In [None]:
# Train the configured logistic regression on the training split.
log_reg.fit(x_train,y_train)

In [None]:
# Train the configured LinearSVC on the training split.
svc.fit(x_train,y_train)

In [None]:
# Train the configured naive Bayes model on the training split.
mnb.fit(x_train,y_train)

In [None]:
# Hard class predictions of the logistic regression on the held-out split.
y_pred1=log_reg.predict(x_test)

In [None]:
# Hard class predictions of the LinearSVC on the held-out split.
y_pred2=svc.predict(x_test)

In [None]:
# Hard class predictions of the naive Bayes model on the held-out split.
y_pred3=mnb.predict(x_test)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 for the logistic regression.
# classification_report pairs target_names with the labels in sorted order, and
# LabelEncoder assigns integers in sorted class order, so le.classes_[i] is the
# name of encoded label i. A hand-written list can silently swap the rows.
target_names=[str(label) for label in le.classes_]
cr=classification_report(y_test,y_pred1,target_names=target_names)
print(cr)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 for the LinearSVC.
# classification_report pairs target_names with the labels in sorted order, and
# LabelEncoder assigns integers in sorted class order, so le.classes_[i] is the
# name of encoded label i. A hand-written list can silently swap the rows.
target_names=[str(label) for label in le.classes_]
cr=classification_report(y_test,y_pred2,target_names=target_names)
print(cr)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 for the naive Bayes model.
# classification_report pairs target_names with the labels in sorted order, and
# LabelEncoder assigns integers in sorted class order, so le.classes_[i] is the
# name of encoded label i. A hand-written list can silently swap the rows.
target_names=[str(label) for label in le.classes_]
cr=classification_report(y_test,y_pred3,target_names=target_names)
print(cr)

In [None]:
from sklearn.metrics import roc_curve, auc
# ROC curve for the logistic regression on the held-out split.
# roc_curve sweeps a threshold over a continuous score; feeding it hard 0/1
# predictions collapses the curve to a single operating point. Column 1 of
# predict_proba is the estimated probability of the class encoded as 1.
y_score1 = log_reg.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score1)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc
# ROC curve for the LinearSVC on the held-out split.
# roc_curve sweeps a threshold over a continuous score; feeding it hard 0/1
# predictions collapses the curve to a single operating point. LinearSVC has no
# predict_proba, so the signed distance to the separating hyperplane is used.
y_score2 = svc.decision_function(x_test)
fpr, tpr, thresholds = roc_curve(y_test, y_score2)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc
# ROC curve for the naive Bayes model on the held-out split.
# roc_curve sweeps a threshold over a continuous score; feeding it hard 0/1
# predictions collapses the curve to a single operating point. Column 1 of
# predict_proba is the estimated probability of the class encoded as 1.
y_score3 = mnb.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score3)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

##  FITTING THE MODEL AND IMPORTANT LEARNINGS FROM THE MODEL

In [None]:
# Final text encoder: counts of unigrams, bigrams and trigrams, with the
# vocabulary learned from all training reviews. vector_01 is reused later to
# encode the test reviews.
vector_01=CountVectorizer(ngram_range=(1,3))
text_vect_01= vector_01.fit_transform(text)

In [None]:
# Final tabular encoder: one-hot for the `one_hot` columns and max-abs scaling
# for the `scale` columns. handle_unknown='ignore' lets the fitted encoder
# transform categories it never saw during fit instead of raising.
# An earlier variant used RobustScaler for the 'scale' step and is left disabled.
col_03=ColumnTransformer(
    transformers=[
        ('one',OneHotEncoder(handle_unknown='ignore'),one_hot),
        ('scale',MaxAbsScaler(),scale),
    ]
)
train_transform_01=col_03.fit_transform(train_merged)

In [None]:
# Binary indicator matrices for genre, language and director on the full
# training frame. The same binarizer is refit for each column, so after this
# cell `mul_01` only holds the classes of the last column (director).
# NOTE(review): MultiLabelBinarizer treats each sample as a collection of
# labels; these columns hold plain strings, so each character becomes a label.
# Confirm whether splitting every cell into separate names was intended.
mul_01=MultiLabelBinarizer()
genre_trans_01=mul_01.fit_transform(train_merged['genre'])
lang_trans_01=mul_01.fit_transform(train_merged['originalLanguage'])
director_trans_01=mul_01.fit_transform(train_merged['director'])

In [None]:
from scipy.sparse import csr_matrix
import scipy.sparse as sp
# Convert the three dense indicator arrays to CSR and join them column-wise in
# the order genre, language, director.
genre_matrix_01,lang_matrix_01,director_matrix_01=[
    csr_matrix(dense) for dense in (genre_trans_01,lang_trans_01,director_trans_01)
]
multi_trans_01=sp.hstack(blocks=[genre_matrix_01,lang_matrix_01,director_matrix_01])

In [None]:
# Full training feature matrix; column order is multi-label indicators, then
# one-hot/scaled tabular columns, then n-gram counts.
final_data=sp.hstack([multi_trans_01,train_transform_01,text_vect_01])

In [None]:
# Fresh default-configured estimators for the searches on the full training data.
log_reg1=LogisticRegression()
svc1=LinearSVC()
mnb1=MultinomialNB()

In [None]:
# Grid searches on the full training data, reusing the parameter grids defined
# earlier (3-fold CV, progress output, all cores). `clf` is rebound here and no
# longer refers to the earlier logistic-regression search.
clf=GridSearchCV(
    estimator=log_reg1,
    param_grid=param_grid,
    cv=3,
    verbose=True,
    n_jobs=-1,
)
clf_svc=GridSearchCV(
    estimator=svc1,
    param_grid=param_grid_02,
    cv=3,
    verbose=True,
    n_jobs=-1,
)
clf_mnb=GridSearchCV(
    estimator=mnb1,
    param_grid=param_grid_01,
    cv=3,
    verbose=True,
    n_jobs=-1,
)

In [None]:
# Run the logistic-regression search on the full training matrix
# (fit returns the search object itself).
best_clf1=clf.fit(final_data,y)

In [None]:
# Run the LinearSVC search on the full training matrix.
best_clf1_svc=clf_svc.fit(final_data,y)

In [None]:
# Run the naive Bayes search on the full training matrix.
best_clf1_mnb=clf_mnb.fit(final_data,y)

In [None]:
# Best cross-validated hyper-parameters for logistic regression on the full data.
best_clf1.best_params_

In [None]:
# Best cross-validated hyper-parameters for LinearSVC on the full data.
best_clf1_svc.best_params_

In [None]:
# Best cross-validated hyper-parameters for naive Bayes on the full data.
best_clf1_mnb.best_params_

In [None]:
# Final logistic regression with hand-set hyper-parameters (both values are in
# param_grid). NOTE(review): presumably copied from best_clf1.best_params_ — confirm.
log_reg1=LogisticRegression(C=10,solver='saga')

In [None]:
# Final LinearSVC with hand-set hyper-parameters (all values are in
# param_grid_02). NOTE(review): presumably copied from best_clf1_svc.best_params_ — confirm.
svc1=LinearSVC(C=0.1,penalty="l2",loss="squared_hinge")

In [None]:
# Final naive Bayes model with hand-set hyper-parameters (both values are in
# param_grid_01). NOTE(review): presumably copied from best_clf1_mnb.best_params_ — confirm.
mnb1=MultinomialNB(alpha=1.0,fit_prior=False)

In [None]:
# Train the final logistic regression on the entire training matrix.
log_reg1.fit(final_data,y)

In [None]:
# Train the final LinearSVC on the entire training matrix.
svc1.fit(final_data,y)

In [None]:
# Train the final naive Bayes model on the entire training matrix.
mnb1.fit(final_data,y)

In [None]:
# Predictions on the same matrix the model was trained on (training-set predictions).
y_pred02=log_reg1.predict(final_data)

In [None]:
# Predictions on the same matrix the model was trained on (training-set predictions).
y_pred02_svc=svc1.predict(final_data)

In [None]:
# Predictions on the same matrix the model was trained on (training-set predictions).
y_pred02_mnb=mnb1.predict(final_data)

In [None]:
from sklearn.metrics import accuracy_score
# Training-set accuracy of log_reg1: y_pred02 was predicted on the data the
# model was fitted on, so this is not a held-out estimate.
accuracy_score(y_true=y,y_pred=y_pred02)

In [None]:
from sklearn.metrics import accuracy_score
# Training-set accuracy of svc1: y_pred02_svc was predicted on the data the
# model was fitted on, so this is not a held-out estimate.
accuracy_score(y_true=y,y_pred=y_pred02_svc)

In [None]:
from sklearn.metrics import accuracy_score
# Training-set accuracy of mnb1: y_pred02_mnb was predicted on the data the
# model was fitted on, so this is not a held-out estimate.
accuracy_score(y_true=y,y_pred=y_pred02_mnb)

## LOADING TEST DATASET AND MOVIES DATASET 

In [None]:
# Location of the test CSV inside the Kaggle input directory.
data_url = "/kaggle/input/sentiment-prediction-on-movie-reviews/test.csv"
# Parse the file into the `test` DataFrame.
test = pd.read_csv(filepath_or_buffer=data_url)

In [None]:
# Reload the raw movie metadata, replacing the `movies` frame cleaned earlier;
# the following cells repeat the same clean-up before merging with the test set.
data_url = "/kaggle/input/sentiment-prediction-on-movie-reviews/movies.csv"
movies = pd.read_csv(filepath_or_buffer=data_url)

## MERGING TEST WITH MOVIES AND DATA CLEANING

In [None]:
# Keep one row per movieid (first occurrence) so the left merge with the test
# reviews cannot duplicate rows.
movies=movies.drop_duplicates(subset=['movieid'])

In [None]:
# Remove the same eight metadata columns that were removed before the training
# merge. drop(labels=..., axis=1) is the column-wise form of drop(columns=...).
movies=movies.drop(
    labels=['title','rating','ratingContents','releaseDateTheaters',
            'releaseDateStreaming','boxOffice','distributor','soundType'],
    axis=1,
)

In [None]:
# Left join on movieid: every test row is kept and receives the movie metadata
# when movies has a matching movieid (NaN otherwise).
test_merged=pd.merge(test,movies,how='left',on='movieid')

In [None]:
# Treat a missing review as an empty string so the text cleaner and vectorizer
# always receive str values.
test_merged['reviewText']=test_merged['reviewText'].fillna('')

In [None]:
# Fill missing categorical values with the same placeholders used for the
# training frame: "Drama" for genre, "English" for originalLanguage and an
# empty string for director.
test_merged=test_merged.fillna(value={
    'genre':"Drama",
    'originalLanguage':"English",
    'director':"",
})

In [None]:
# Overwrite every column label of test_merged, by position, with the names the
# fitted training transformers expect.
# NOTE(review): this relies on test_merged having exactly these nine columns in
# this order. A length mismatch raises, but a different order would silently
# mislabel data — confirm against the merged test schema.
column_names=['movieid','reviewerName','isFrequentReviewer','reviewText','audienceScore','runtimeMinutes',
              'genre','originalLanguage','director']
test_merged.columns=column_names

In [None]:
# Impute each numeric column with its own mean. Passing a single scalar to
# DataFrame.fillna would write that one value into the NaNs of every column
# (so runtimeMinutes would receive the audienceScore mean); a per-column
# mapping keeps the two imputations independent.
test_merged=test_merged.fillna(value={
    'audienceScore':test_merged['audienceScore'].mean(),
    'runtimeMinutes':test_merged['runtimeMinutes'].mean(),
})

In [None]:
# Keep the test review text aside before the column is dropped from test_merged.
text_test=test_merged['reviewText']

In [None]:
# Drop the id and the raw text; the remaining columns feed the tabular encoders.
test_merged=test_merged.drop(labels=['movieid','reviewText'],axis=1)

In [None]:
# Clean every test review element-wise with the same text_cleaner used for training.
text_test=text_test.map(text_cleaner)

## IMPLEMENTING SAME MODEL ON TEST DATASET

In [None]:
# Encode the test reviews with the vocabulary already learned from the training
# text (transform only, no refit); n-grams outside that vocabulary are ignored.
text_vect_test_01= vector_01.transform(text_test)

In [None]:
# Show the sparse matrix summary (shape and number of stored elements).
text_vect_test_01

In [None]:
# Apply the column transformer fitted on the training frame. Its one-hot
# encoder was built with handle_unknown='ignore', so categories unseen in
# training do not raise.
test_transform_01=col_03.transform(test_merged)

In [None]:
# Show the transformed tabular test features.
test_transform_01

In [None]:
# Encode the test columns with binarizers fitted on the TRAINING columns, so
# each indicator column lines up with the column the model was trained on.
# Refitting on the test data would derive the column set from the test values,
# which can change the number or order of columns and make the stacked test
# matrix incompatible with the trained model. Labels that never occur in
# training are ignored by transform (scikit-learn emits a warning).
genre_test_trans_01=MultiLabelBinarizer().fit(train_merged['genre']).transform(test_merged['genre'])
lang_test_trans_01=MultiLabelBinarizer().fit(train_merged['originalLanguage']).transform(test_merged['originalLanguage'])
director_test_trans_01=MultiLabelBinarizer().fit(train_merged['director']).transform(test_merged['director'])

In [None]:
from scipy.sparse import csr_matrix
import scipy.sparse as sp
# Convert the three dense test indicator arrays to CSR and join them
# column-wise in the order genre, language, director.
genre_test_matrix_01,lang_test_matrix_01,director_test_matrix_01=[
    csr_matrix(dense)
    for dense in (genre_test_trans_01,lang_test_trans_01,director_test_trans_01)
]
multi_test_trans_01=sp.hstack(
    blocks=[genre_test_matrix_01,lang_test_matrix_01,director_test_matrix_01]
)

In [None]:
# Show the stacked multi-label test matrix summary.
multi_test_trans_01

In [None]:
# Test feature matrix, stacked in the same block order as the training matrix.
# This rebinds `final_data`, which previously held the training matrix.
final_data=sp.hstack([multi_test_trans_01,test_transform_01,text_vect_test_01])

In [None]:
# Show the test feature matrix summary; its column count must equal that of the
# matrix the model was trained on.
final_data

In [None]:
# Predict encoded sentiment labels for the test matrix with log_reg1 and wrap
# them in a one-column DataFrame.
y_pred03=pd.DataFrame(log_reg1.predict(final_data))

In [None]:
# y_pred03=pd.DataFrame(svc1.predict(final_data))

In [None]:
# y_pred03=pd.DataFrame(mnb1.predict(final_data))

In [None]:
# Confirm the container type of the predictions.
type(y_pred03)

In [None]:
# Inspect the encoded predictions.
y_pred03

## SUBMISSION TO THE KAGGLE COMPETITION.

In [None]:
# Map the integer predictions back to the original sentiment strings.
# inverse_transform expects a 1-D array of encoded labels; y_pred03 is a
# one-column DataFrame, so it is flattened first (this avoids scikit-learn's
# column-vector conversion warning).
y_pred04 = le.inverse_transform(np.ravel(y_pred03))

In [None]:
# Confirm the container type of the decoded labels.
type(y_pred04)

In [None]:
# Wrap the decoded labels in a Series (default integer index starting at 0).
y_pred05 = pd.Series(y_pred04)

In [None]:
# Convert the Series into a single-column DataFrame whose column is named "sentiment".
y_pred06 = y_pred05.to_frame(name="sentiment")

In [None]:
# Inspect the submission frame before writing it.
y_pred06

In [None]:
# Name the prediction column and the index, then write both to submission.csv
# (the index is written as the first column, headed 'id').
y_pred06.columns=['sentiment']
y_pred06.index=y_pred06.index.rename('id')
y_pred06.to_csv('submission.csv',index=True)