In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

* #### Understanding of the Problem Statement and Challenges faced initially
* #### Understanding of the Datasets given
* #### Explanation of the Data Insights
* #### Explanation of Data Preprocessing
* #### Feature Engineering used
* #### Feature Selection used
* #### Models built
* #### Analysis of the performance of the Models
* #### Key Learnings

In [None]:
# Read the four competition CSV files (movie metadata, sample submission, train and test reviews).
movies_data, sample, train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/sentiment-prediction-on-movie-reviews/movies.csv',
        '/kaggle/input/sentiment-prediction-on-movie-reviews/sample.csv',
        '/kaggle/input/sentiment-prediction-on-movie-reviews/train.csv',
        '/kaggle/input/sentiment-prediction-on-movie-reviews/test.csv',
    )
)

In [None]:
# Print (rows, columns) for each loaded frame, in the same order as they were read.
for frame in (movies_data, sample, train_data, test_data):
    print(frame.shape)

# Understanding the Problem Statement
Your task is to build an ML model to predict sentiment of the review text.

Challenges faced:
- Text analysis and handling of categorical data

- Memory allocation errors: the dataset became too large after one-hot encoding some columns

- Merging movies_data with train_data: technical difficulties, and identifying which columns were important

# Reviewing the Datasets to get an Idea
**Here the train, test, movies, and sample datasets are shown**

In [None]:
# Preview the first rows of the training reviews.
train_data.head()

In [None]:
# Preview the first rows of the test reviews.
test_data.head()

In [None]:
# Preview the first rows of the sample submission file.
sample.head()

In [None]:
# Preview the first rows of the movie metadata.
movies_data.head()
#movies_data.shape

Rows with a null sentiment would have to be removed.
However, the sentiment column actually contains no null values, so no rows need to be dropped.

# Data Insights

In [None]:
# from ydata_profiling import ProfileReport
# profile = ProfileReport(movies_data, title='Movies_data report')
# profile

In [None]:
# Shape of the training data, followed by summary statistics for every column
# (include='all' also summarises the non-numeric columns).
print(train_data.shape)
train_data.describe(include = 'all')
# Each movieid has multiple reviews associated with it, written by different reviewers,
# so duplicate movieid values are expected in train_data.
# Duplicates are likewise expected in the reviewerName column.

In [None]:
# Class balance of the target: absolute counts per sentiment label.
sentiment_counts = train_data['sentiment'].value_counts()
print(sentiment_counts)

# Share of positive reviews, computed from the data rather than from hard-coded counts
# so it stays correct if the training file changes.
print('positive sentiment', sentiment_counts['POSITIVE'] / len(train_data))

In [None]:
# Shape of the movie metadata, followed by summary statistics of its numeric columns.
print(movies_data.shape)
movies_data.describe()

In [None]:
# Summary (count / unique / top / freq) of the string-typed columns.
movies_data.describe(include = 'object')
# movies_data gives information about various movies.
# A single movieid should map to exactly one set of movie attributes,
# so duplicate movieid values must not remain in movies_data.

In [None]:
# Column dtypes and non-null counts of the training data.
train_data.info()
# No numeric columns exist in this dataset

In [None]:
# Column dtypes and non-null counts of the movie metadata.
movies_data.info()
# Only two numeric columns exist in this dataset

### Plotting the null Value percentages of various columns in movies_data

In [None]:
import matplotlib.pyplot as plt

# Count and percentage of missing values per column of movies_data.
# isnull().mean() divides by the frame's actual number of rows, so no row count is hard-coded.
null_counts = movies_data.isnull().sum()
null_percentages = movies_data.isnull().mean() * 100

# Bar chart of the percentages; the labels state what is actually plotted (percent, not counts).
null_percentages.plot(kind='bar', figsize=(10, 6))
plt.xlabel('Columns')
plt.ylabel('Percentage of Null Values')
plt.title('Percentage of Null Values by Column')
plt.xticks(rotation=45, ha='right')
plt.show()

#### We can see that rating, ratingContents, releaseDateTheaters, boxOffice, distributor and soundType have roughly 80% null values.
#### Thus we can safely remove these columns.

In [None]:
import matplotlib.pyplot as plt

# Missing values per column of the training data
null_counts = train_data.isnull().sum()

# Draw the counts as a bar chart and label it through the returned Axes
null_ax = null_counts.plot(kind='bar', figsize=(10, 6))
null_ax.set_xlabel('Columns')
null_ax.set_ylabel('Number of Null Values')
null_ax.set_title('Number of Null Values by Column')
plt.setp(null_ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

#### The reviewText column has some null values and hence must be dealt with.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between the numeric columns of movies_data, shown as an annotated heatmap
numeric_corr = movies_data.corr(numeric_only = True)

plt.figure(figsize=(10, 6))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title('Heatmap of the Dataset')
plt.show()

#### Inferences
* There is only a weak positive correlation between the two numeric variables.
* This implies that the two features are **not** redundant with each other, so both are kept as potentially useful for explaining changes in the response variable.

## Preprocessing the Train and Test dataset

In [None]:
# train_data.head()

In [None]:
print(train_data.shape)
print(test_data.shape)

# Shared boolean -> integer encoding for the reviewer flag
bool_to_int = {False : 0, True : 1}

# Training reviews: encode the reviewer flag, replace missing review text with '',
# and turn the sentiment label into a 0/1 column named sentiment_target.
train_data = (
    train_data
    .assign(
        isFrequentReviewer=train_data['isFrequentReviewer'].map(bool_to_int),
        reviewText=train_data['reviewText'].fillna(''),
        sentiment_target=train_data['sentiment'].map({'POSITIVE':1, 'NEGATIVE':0}),
    )
    .drop(columns=['sentiment'])
)

# Test reviews: the flag is stored as isTopCritic there; encode it under the training
# column name, replace missing review text with '', and drop the original column.
test_data = (
    test_data
    .assign(
        isFrequentReviewer=test_data['isTopCritic'].map(bool_to_int),
        reviewText=test_data['reviewText'].fillna(''),
    )
    .drop(columns=['isTopCritic'])
)

In [None]:
# movies_data.shape
# movies_data['director'].shape

In [None]:
# Check the training data after encoding (sentiment_target replaces sentiment).
train_data.head()

In [None]:
# Check the test data after encoding (isFrequentReviewer replaces isTopCritic).
test_data.head()

## Preprocessing the Movies Dataset

In [None]:
# Number of missing values in each movies_data column.
movies_data.isnull().sum()

In [None]:
# Record the shape before any columns are dropped, for comparison later.
print('movies_data: shape before preprocessing', movies_data.shape)

Finding out the Null Percentages to eliminate those with too many Null Values

In [None]:
# Per-column missing-value counts and percentages for movies_data.
# The percentage uses the frame's actual row count (via isnull().mean()) instead of a hard-coded total.
null_counts = movies_data.isnull().sum()
null_percentages = movies_data.isnull().mean() * 100
movies_data_null= pd.DataFrame({'Null Count': null_counts, 'Null Percentages': null_percentages})
movies_data_null

In [None]:
# Columns in which more than 80% of the values are missing.
movies_data_null.loc[movies_data_null['Null Percentages'] > 80]

In [None]:
# Remove the mostly-empty metadata columns; the result is rebound rather than mutated in place.
mostly_null_columns = ['rating','ratingContents','boxOffice','distributor','soundType']
movies_data = movies_data.drop(columns=mostly_null_columns)

##### Merging releaseDateTheaters into releaseDateStreaming.

##### Rows where releaseDateTheaters is not null and releaseDateStreaming is null are identified, and their theatrical date is used as the streaming date.

##### releaseDateTheaters has 78% null values, so it is dropped afterwards.

In [None]:
# Rows that have a theatrical release date but no streaming release date
theaters_only = movies_data['releaseDateTheaters'].notna() & movies_data['releaseDateStreaming'].isna()
filtered_data = movies_data.loc[theaters_only]
filtered_data.shape

In [None]:
# Where the streaming date is missing, fall back to the theatrical date (rows where both are
# missing stay missing), then discard the theatrical column and parse the result as datetimes.
movies_data['releaseDateStreaming'] = movies_data['releaseDateStreaming'].fillna(
    movies_data['releaseDateTheaters']
)
movies_data = movies_data.drop(columns='releaseDateTheaters')
print('movies_data shape after preprocessing', movies_data.shape)
movies_data['releaseDateStreaming'] = pd.to_datetime(movies_data['releaseDateStreaming'])

In [None]:
# Re-compute the missing-value table after the column drops and the date merge.
# Percentages are relative to the current number of rows, not a hard-coded total.
null_counts = movies_data.isnull().sum()
null_percentages = movies_data.isnull().mean() * 100
movies_data_null = pd.DataFrame({'Null Count': null_counts, 'Null Percentages': null_percentages})
movies_data_null

#### Original Language Column
Null percentage : 9.6%

In [None]:
# Fill missing languages with the most frequent language.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on a column
# selection is chained assignment and is not guaranteed to update movies_data itself.
most_frequent_value = movies_data['originalLanguage'].mode().iloc[0]
movies_data['originalLanguage'] = movies_data['originalLanguage'].fillna(most_frequent_value)

#### Removing duplicate movieids from movies_dataset

In [None]:
# Inspect every row that shares this one movieid: missing values per column and number of rows
is_target_movie = movies_data['movieid'] == 'escape_the_terminator_tyler_durden_astonish'
dup_movies_data = movies_data.loc[is_target_movie]
print(dup_movies_data.isnull().sum())
print(len(dup_movies_data))

In [None]:
# Take a copy of the row at positional index 4038.
# NOTE(review): 4038 is hard-coded — presumably the preferred duplicate of
# 'escape_the_terminator_tyler_durden_astonish'; confirm it still points at that row.
row_to_copy = movies_data.iloc[4038].copy()
# Bare expression: has no visible effect because it is not the last line of the cell
type(row_to_copy)
print(row_to_copy)

In [None]:
# Build a one-row DataFrame from the selected record.
# row_to_copy itself is left untouched (it is not rebound to a dict), so this cell can be
# re-run without failing on a second .to_dict() call.
new_row_df = pd.DataFrame([row_to_copy.to_dict()])

# Delete all rows with movieid = 'escape_the_terminator_tyler_durden_astonish'
movies_data = movies_data[movies_data['movieid'] != 'escape_the_terminator_tyler_durden_astonish']

# Append the single kept row; ignore_index=True renumbers the rows 0..n-1
movies_data = pd.concat([movies_data, new_row_df], ignore_index=True)

In [None]:
# Keep only the first row for every remaining duplicated movieid
movies_data = movies_data.drop_duplicates(subset=['movieid'])

# Sanity check: list any movieid that still occurs more than once (expected to be empty)
movies_data_val_counts = movies_data['movieid'].value_counts()
still_duplicated = movies_data_val_counts > 1
movies_data_val_counts.loc[still_duplicated]

In [None]:
# Missing values per column after de-duplication.
movies_data.isnull().sum()

### Removing Null Values of other columns

In [None]:
# Each fill is assigned back to the column explicitly: fillna(inplace=True) on a column
# selection is chained assignment and is not guaranteed to update movies_data itself.

# Missing runtimes -> most frequent runtime
mode_runtime = movies_data['runtimeMinutes'].mode()[0]
movies_data['runtimeMinutes'] = movies_data['runtimeMinutes'].fillna(mode_runtime)

# Missing genres -> most frequent genre string
mf_genre = movies_data['genre'].mode()[0]
movies_data['genre'] = movies_data['genre'].fillna(mf_genre)

# Missing audience scores -> mean score truncated to an integer
# (the mean and the mode are printed for comparison)
print(movies_data['audienceScore'].mean())
print(movies_data['audienceScore'].mode()[0])
mean_audience = int(movies_data['audienceScore'].mean())
print(mean_audience)
movies_data['audienceScore'] = movies_data['audienceScore'].fillna(mean_audience)

In [None]:
# Missing values per column after the fills above.
movies_data.isnull().sum()

## Feature Engineering of Genre Column

In [None]:
# Look at the raw genre strings before they are split.
movies_data['genre'].head(10)

In [None]:
# Shape before the genre column is expanded into indicator columns.
print('movies_data.shape before preprocessing', movies_data.shape)

In [None]:
# Frequency of each distinct raw genre string (the full comma-separated value, not single genres).
genre_data = movies_data['genre'].value_counts()
#len(genre_data[genre_data == 1])
#genre_data.loc[genre_data == 1]

In [None]:
# pd.reset_option('display.max_rows')

In [None]:
# Number of missing genre values remaining.
movies_data.genre.isnull().sum()

In [None]:
# Restore pandas' default limit on the number of displayed rows.
pd.reset_option('display.max_rows')

In [None]:
# Step 1: Split the comma-separated genres and create a list of genres for each movie
movies_data['genre'] = movies_data['genre'].str.split(',')
movies_data['genre'].head(10)

In [None]:
# Step 2: Collect every distinct genre label that occurs in any movie's genre list
all_genres = {label for labels_of_movie in movies_data['genre'] for label in labels_of_movie}
print(len(all_genres))
print(all_genres)

In [None]:
# Step 3: Start from an empty placeholder list per genre (printed to show the keys)
genre_dict = {genre_name: [] for genre_name in all_genres}
print(genre_dict)

# Step 4: Replace each placeholder with a 0/1 Series marking the movies whose genre list
# contains that exact label
for genre_name in all_genres:
    genre_dict[genre_name] = movies_data['genre'].apply(
        lambda labels, wanted=genre_name: 1 if wanted in labels else 0
    )

In [None]:
# Step 5: One indicator column per genre label
genre_df = pd.DataFrame(genre_dict)

# Attach the indicator columns side by side and remove the original list-valued 'genre' column
movies_data = pd.concat([movies_data, genre_df], axis=1).drop(columns='genre')

In [None]:
# genre_df

In [None]:
# Shape after the genre indicator columns were added.
print(movies_data.shape)

In [None]:
# List all column labels, including the new genre indicators.
movies_data.columns

#### Problem: both 'Action' and ' Action' columns exist (one without and one with a leading space), which creates duplicate genres, so these columns are merged

In [None]:
# Column labels again, to see the space-prefixed duplicates.
movies_data.columns

In [None]:
# total = 0
# for i in movies_data.columns[7:]:
#     summ = np.sum(movies_data[i].values)
#     print(i, summ)
#     total += summ
# #print('total', total)

In [None]:
# Sort the column labels and split them at position 37.
# NOTE(review): 37 is hard-coded — presumably the number of labels that start with a space
# (which sort first); confirm against the actual columns before relying on it.
l = movies_data.columns.sort_values()
l_with_space = l[:37]
l_no_space = l[37:]

In [None]:
# Display the sorted column labels.
l

In [None]:
def add_space(string):
    """Return `string` with a single space prepended."""
    prefix = ' '
    return prefix + string

In [None]:
# Take the first 30 labels of the no-leading-space group as the genres to merge.
# NOTE(review): 30 is hard-coded — presumably the count of genre labels in that group; confirm.
l3 = l_no_space[:30]
l3 = list(l3)
print(l3)
l3.remove('Special interest') # No space-prefixed version of 'Special interest' exists, so there is nothing to merge

In [None]:
# Fold every space-prefixed genre column into its unprefixed twin with a logical OR.
for genre_name in l3:
    spaced_name = add_space(genre_name)
    print("Merging {} and {}".format(genre_name, spaced_name))
    movies_data[genre_name] = movies_data[genre_name] | movies_data[spaced_name]

# Remove all the space-prefixed columns once their values have been merged
movies_data = movies_data.drop(columns=[add_space(genre_name) for genre_name in l3])

In [None]:
# Shape after the duplicate genre columns were merged.
print('movies_data shape after preprocessing',movies_data.shape)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation heatmap of the numeric columns (scores, runtime and genre indicators).
# numeric_only=True matches the earlier heatmap cell and keeps corr() from being applied to
# the string/datetime columns that movies_data still contains.
plt.figure(figsize=(10, 10))
sns.heatmap(movies_data.corr(numeric_only=True), annot=False, cmap='coolwarm')
plt.title('Heatmap of the Dataset')
plt.show()

### Merging a few other columns

In [None]:
# By domain knowledge: merge closely related genre indicators.
# Every merge is a logical OR so the result stays a 0/1 indicator.
movies_data['Lgbtq+'] = movies_data['Lgbtq+'] | movies_data['Gay & lesbian']
movies_data.drop(columns = 'Gay & lesbian', inplace = True)

movies_data['Sports'] = movies_data['Sports'] | movies_data[' Sports & fitness']
movies_data.drop(columns = ' Sports & fitness', inplace = True)

# OR instead of '+': adding the two columns produced 2 for movies tagged with both genres,
# which is no longer a binary indicator and inflates the occurrence counts.
movies_data['Family'] = movies_data['Holiday'] | movies_data['Kids & family']
movies_data.drop(columns = ['Holiday', 'Kids & family'], inplace = True)

movies_data['Music'] = movies_data['Music'] | movies_data['Musical']
movies_data.drop(columns = 'Musical', inplace = True)

In [None]:
# Column-wise sums of every column from position 7 onwards.
# NOTE(review): assumes the first 7 columns are the non-genre columns and the rest are 0/1
# genre indicators — confirm the column order.
cols_list = movies_data.columns[7:]
occurrences = movies_data[cols_list].sum()
occurrences

In [None]:
# Genres that occur fewer than 218 times are treated as rare.
sorted_occurrences = occurrences.sort_values(ascending=False)
rare_genres = sorted_occurrences[sorted_occurrences < 218]
print(rare_genres)
print()

# 'Other' is the column the rare genres get folded into, so it must never be in the list of
# columns to merge and drop. Filtering (instead of list.remove) also works when 'Other'
# is not below the threshold, where remove() would raise ValueError.
sl = [genre_name for genre_name in rare_genres.index if genre_name != 'Other']
print(sl)
# print(sorted_occurrences.sum())

In [None]:
# OR every rare genre indicator into 'Other' ...
for rare_genre in sl:
    movies_data['Other'] = movies_data['Other'] | movies_data[rare_genre]

# ... and then remove the rare genre columns in one go
movies_data = movies_data.drop(columns = sl)

In [None]:
# Re-count the columns from position 7 onwards after the rare genres were folded into 'Other',
# printed in ascending order together with the number of remaining columns.
# NOTE(review): relies on the same "first 7 columns are non-genre" assumption as above — confirm.
cols_list = movies_data.columns[7:]
occurrences = movies_data[cols_list].sum()
print(occurrences.sort_values())
print(len(occurrences))

In [None]:
# Preview the movie metadata after genre engineering.
movies_data.head()

In [None]:
# Final missing-value table for movies_data.
# Duplicate movieids have been removed by now, so the percentage must be relative to the
# current number of rows; isnull().mean() does that, whereas the original hard-coded
# row total no longer matches the frame.
null_counts = movies_data.isnull().sum()
null_percentages = movies_data.isnull().mean() * 100
movies_data_null = pd.DataFrame({'Null Count': null_counts, 'Null Percentages': null_percentages})
movies_data_null

In [None]:
# Summary statistics for every column, with the datetime column summarised numerically.
# NOTE(review): the datetime_is_numeric argument only exists in some pandas versions
# (it was dropped in pandas 2.0) — confirm the environment's pandas version.
movies_data.describe(include = 'all', datetime_is_numeric = True)

## Scaling the features

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale audienceScore to the [0, 1] range; the scaler expects a 2-D column vector
scaler = MinMaxScaler()
audience_column = movies_data['audienceScore'].to_numpy().reshape(-1, 1)
movies_data['audienceScore'] = scaler.fit_transform(audience_column)

In [None]:
import matplotlib.pyplot as plt

# Vertical boxplot of the runtimeMinutes column to reveal outliers
fig, runtime_ax = plt.subplots(figsize=(8, 6))
runtime_ax.boxplot(movies_data['runtimeMinutes'])
runtime_ax.set_title('Boxplot of Runtime Minutes')
runtime_ax.set_xlabel('Runtime Minutes')
runtime_ax.set_ylabel('Movies')
plt.show()

In [None]:
# From domain knowledge, a movie with a runtime of more than about 4 hours 30 minutes is unlikely.
# Keeping a margin of 30 minutes, runtimes above 300 minutes are treated as outliers.
# They are replaced with the most frequent runtime (the mode) — they are NOT capped at 300.
mode_runtime = movies_data['runtimeMinutes'].mode()[0]
exceeds_300_mask = movies_data['runtimeMinutes'] > 300
movies_data.loc[exceeds_300_mask, 'runtimeMinutes'] = mode_runtime

In [None]:
# Re-fit the existing scaler on the outlier-free runtimes and store the scaled values
runtime_column = movies_data['runtimeMinutes'].to_numpy().reshape(-1, 1)
movies_data['runtimeMinutes'] = scaler.fit_transform(runtime_column)

In [None]:
# Preview the movie metadata after scaling.
movies_data.head()

## Merging the Train and Test datasets with the movies Dataset

In [None]:
# Attach the movie metadata to every review; a left join keeps all reviews even when the
# movieid has no match in movies_data.
merged_train = train_data.merge(movies_data, how='left', on='movieid')
merged_test = test_data.merge(movies_data, how='left', on='movieid')

In [None]:
# Columns available after the merge.
merged_train.columns

In [None]:
# Show all rows in subsequent outputs
pd.set_option('display.max_rows', None)

# Number of reviews per original language
lang_counts = merged_train['originalLanguage'].value_counts()

# Languages that occur in more than 20 reviews
frequent_enough = lang_counts > 20
valid_languages = list(lang_counts.index[frequent_enough])

In [None]:
# Group data by 'language' and 'sentiment', then count the occurrences
language_sentiment_counts = merged_train.groupby(['originalLanguage', 'sentiment_target']).size().unstack(fill_value=0)

# One stacked subplot per language. The figure height grows with the number of languages
# (a fixed 5x5 figure squeezes all panels on top of each other), and squeeze=False keeps
# `axs` two-dimensional so indexing also works when there is only a single language.
n_languages = len(valid_languages)
fig, axs = plt.subplots(n_languages, 1, figsize=(5, 3 * n_languages), squeeze=False)

for idx, language in enumerate(valid_languages):
    language_counts = language_sentiment_counts.loc[language]

    # Axes for this language (single column, so the column index is always 0)
    ax = axs[idx, 0]

    # Bar chart of the number of reviews per sentiment class
    language_counts.plot(kind='bar', rot=0, ax=ax, title=f'Sentiment Distribution for {language}')
    ax.set_xlabel('Sentiment')
    ax.set_ylabel('Count')
    ax.set_xticks(range(len(language_counts)))
    ax.set_xticklabels(language_counts.index)

# Prevent titles and axis labels of neighbouring panels from overlapping
fig.tight_layout()
plt.show()

In [None]:
# Review counts per (language, sentiment) pair, with sentiment classes as columns
language_sentiment_counts = merged_train.groupby(['originalLanguage', 'sentiment_target']).size().unstack(fill_value=0)

# One standalone bar chart per frequent language
for language in valid_languages:
    per_sentiment = language_sentiment_counts.loc[language]
    tick_positions = range(len(per_sentiment))
    per_sentiment.plot(kind='bar', rot=0, title=f'Sentiment Distribution for {language}')
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.xticks(tick_positions, per_sentiment.index)
    plt.show()

In [None]:
# Convert the per-language sentiment counts into row-wise percentages
language_sentiment_percentage = language_sentiment_counts.div(language_sentiment_counts.sum(axis=1), axis=0) * 100

# Print the shape of the percentage table
print(language_sentiment_percentage.shape)

# Languages whose share of positive sentiment (column 1) exceeds 80%.
# NOTE(review): the variable name says 66 but the threshold actually applied is 80 —
# confirm which value is intended.
languages_above_66_positive = language_sentiment_percentage[
    language_sentiment_percentage[1] > 80
]

# print(languages_above_66_positive.shape)

In [None]:
# Column labels of the merged training frame.
print(merged_train.columns)

In [None]:
# Preview the merged training frame.
merged_train.head()

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

audience_score = merged_train['audienceScore']
runtime = merged_train['runtimeMinutes']
lang = merged_train['originalLanguage']
name = merged_train['reviewerName']

# Sentiment target (already encoded as 0/1 earlier in the notebook)
labels = merged_train['sentiment_target']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# One-hot encoding for the originalLanguage column
ohe_lang = pd.get_dummies(lang)
# One-hot encoding for the reviewerName column
# NOTE(review): this is a dense indicator matrix with one column per distinct reviewer —
# presumably large; confirm it fits in memory.
ohe_name = pd.get_dummies(name)

# Correlation between 'audienceScore' and the sentiment labels
correlation_score = audience_score.corr(labels)

# Correlation between 'runtimeMinutes' and the sentiment labels
correlation_runtime = runtime.corr(labels)

# Correlation of every one-hot 'originalLanguage' column with the sentiment labels
correlation_lang = ohe_lang.corrwith(labels)

# Correlation of every one-hot 'reviewerName' column with the sentiment labels
correlation_name = ohe_name.corrwith(labels)

# Visualize the correlations.
# For the two one-hot encoded features only the FIRST indicator column is plotted.
# .iloc[0] makes that positional access explicit; `series[0]` on a Series indexed by
# category names relies on a deprecated positional fallback.
plt.figure(figsize=(8, 6))
plt.bar(
    ['audienceScore','originalLanguage','runTime', 'reviewerName'],
    [correlation_score, correlation_lang.iloc[0], correlation_runtime, correlation_name.iloc[0]],
)
plt.xlabel('Features')
plt.ylabel('Correlation with Sentiment')
plt.title('Correlation between Features and Sentiment')
plt.show()

# Compare the correlation values

# print("Correlation with 'sentiment' for audienceScore:", correlation_score)
# print("Correlation with 'sentiment' for originalLanguage:", correlation_lang)
# print("Correlation with 'sentiment' for runningTime:", correlation_runtime)
# print("Correlation with 'sentiment' for reviewerName:", correlation_name)

#### After analysing the performance of each of the individual features in the merged_train dataset, following are the reasons why some columns were kept and some were removed:

* **movieid** : Will be used for merging the one hot encoded datasets
* **reviewText** : kept for sentiment analysis
* **originalLanguage** : score = 0.67141, however, kept for reasons explained
* **director** : score = 0.71909, kept as score is sufficiently high


* **reviewerName** : score = 0.6734, hence removed
* **title** : redundant as it is the same as movieid
* **releaseDateStreaming** : removed as it was giving less score

In [None]:
# Columns that are not used as model features; removed from train and test alike
unused_columns = ['title','reviewerName','releaseDateStreaming']
merged_train = merged_train.drop(columns=unused_columns)
merged_test = merged_test.drop(columns=unused_columns)

In [None]:
# for i in ['director', 'originalLanguage', 'movieid']:
#     print(merged_train[i].nunique())
#     print(merged_test[i].nunique())

In [None]:
# Shapes of the merged train and test frames after the column drops
for merged_frame in (merged_train, merged_test):
    print(merged_frame.shape)

In [None]:
# Preview the merged training frame after dropping unused columns.
merged_train.head()

In [None]:
# pd.set_option('display.max_rows', None)

In [None]:
# Remaining columns of the merged training frame.
merged_train.columns

In [None]:
# language_counts = merged_train['originalLanguage'].value_counts()
# valid_languages = language_counts[language_counts > 1]
# # print(valid_languages)
# print(type(valid_languages))

In [None]:
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Create a heatmap
# plt.figure(figsize=(10, 6))
# sns.heatmap(merged_train.corr(), annot=False, cmap='coolwarm')
# plt.title('Heatmap of the Dataset')
# plt.show()

In [None]:
# This reduced the accuracy, hence was not utilised in the final model

# # Define the threshold for infrequent languages
# threshold = 51

# # Find the list of infrequent languages
# infrequent_languages = merged_train['originalLanguage'].value_counts()[merged_train['originalLanguage'].value_counts() < threshold].index.tolist()

# # Replace the infrequent languages with the label 'other'
# merged_train['originalLanguage'] = merged_train['originalLanguage'].apply(lambda lang: 'other' if lang in infrequent_languages else lang)

### One Hot Encoding the categorical Columns

In [None]:
from sklearn.preprocessing import OneHotEncoder


def _one_hot_fit_transform(column):
    """Fit a sparse one-hot encoder on merged_train[column] and encode train and test with it.

    Categories that only occur in the test data are ignored (encoded as all zeros).
    Returns (encoder, encoded_train, encoded_test).
    """
    encoder = OneHotEncoder(handle_unknown = 'ignore', sparse_output = True)
    encoded_train = encoder.fit_transform(merged_train[[column]])
    encoded_test = encoder.transform(merged_test[[column]])
    return encoder, encoded_train, encoded_test


# Encode the three categorical columns, in the same order as before
director_ohe, encoded_director, encoded_director_test = _one_hot_fit_transform('director')
language_ohe, encoded_language, encoded_language_test = _one_hot_fit_transform('originalLanguage')
movieid_ohe, encoded_movieid_train, encoded_movieid_test = _one_hot_fit_transform('movieid')

In [None]:
# Print the shape and type of every encoded matrix.
# Each matrix is paired with its own label, so the type printed under a heading always
# belongs to that matrix (the language headings previously printed the director matrices' type).
encoded_train_matrices = [
    ('encoded_director', encoded_director),
    ('encoded_language', encoded_language),
    ('encoded_movieid_train', encoded_movieid_train),
]
encoded_test_matrices = [
    ('encoded_director_test', encoded_director_test),
    ('encoded_language_test', encoded_language_test),
    ('encoded_movieid_test', encoded_movieid_test),
]

for matrix_name, matrix in encoded_train_matrices:
    print(f'{matrix_name}.shape: ', matrix.shape)
    print(type(matrix))

print()

for matrix_name, matrix in encoded_test_matrices:
    print(f'{matrix_name}.shape: ', matrix.shape)
    print(type(matrix))

In [None]:
# director_vals = movies_data['director'].value_counts()
# valid_directors = director_vals[director_vals > 3].index
# print(len(director_vals[director_vals > 3]))

# movies_data['director'] = movies_data['director'].apply(lambda x: x if x in valid_directors else 'director')

#### Separating target and features

In [None]:
# Remove the target column from the feature frame and keep it as y
y = merged_train.pop('sentiment_target')
y.head()

In [None]:
# Distinct movie ids seen in each split
train_movie_ids = set(merged_train['movieid'])
test_movie_ids = set(merged_test['movieid'])

# Number of movie ids that occur in the test data but never in the training data
missing_movie_ids = test_movie_ids.difference(train_movie_ids)
print(len(missing_movie_ids))

# print("Movie IDs present in test but not in train:", missing_movie_ids)

## Feature Extraction

### Using tfidf to convert the reviews columns to vectorised text

Trials and errors:
ngram_range = (2,2), max_df = 0.95 - memory error

In [None]:
# Shapes of the feature frames before text vectorisation.
print(merged_train.shape)
print(merged_test.shape)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp

# TF-IDF features of the review text with English stop words removed.
# The vocabulary is learned from the training reviews only and then applied to the test reviews.
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english')
tfidf_train = tfidf_vectorizer.fit_transform(merged_train['reviewText'])
tfidf_test = tfidf_vectorizer.transform(merged_test['reviewText'])

In [None]:
# (number of reviews, vocabulary size) for train and test.
print(tfidf_train.shape)
print(tfidf_test.shape)

In [None]:
# These columns now exist as separate one-hot / TF-IDF matrices, so the raw versions are
# removed from the feature frames.
encoded_separately = ['director', 'originalLanguage', 'movieid', 'reviewText']
merged_train = merged_train.drop(columns = encoded_separately)
merged_test = merged_test.drop(columns = encoded_separately)

In [None]:
# Remaining columns of the test frame.
merged_test.head()

In [None]:
# Remaining columns of the training frame.
merged_train.head()

In [None]:
import scipy.sparse as sp

# Convert the remaining columns to CSR sparse matrices.
# The test frame is indexed with the training frame's column list so both matrices have their
# features in the same order; a column missing from the test data raises a KeyError here
# instead of silently misaligning features in the model.
feature_columns = list(merged_train.columns)
sparse_merged_train = sp.csr_matrix(merged_train[feature_columns])
sparse_merged_test = sp.csr_matrix(merged_test[feature_columns])

print('sparse_merged_train.shape: ', sparse_merged_train.shape)
print('sparse_merged_test.shape: ', sparse_merged_test.shape)

In [None]:
# Final training matrix: frame columns, then one-hot director / language / movieid, then TF-IDF text features.
merged_sparse_train = sp.hstack((sparse_merged_train, encoded_director, encoded_language, encoded_movieid_train, tfidf_train), format='csr')

In [None]:
# Final test matrix, stacked in the same block order as the training matrix.
merged_sparse_test = sp.hstack((sparse_merged_test, encoded_director_test, encoded_language_test, encoded_movieid_test, tfidf_test), format='csr')

In [None]:
# The feature matrix and the target must have the same number of rows.
print(merged_sparse_train.shape)
print(y.shape)

# Beginning the Model Making

A few models are commented out because they are time-consuming to run and are not part of the final submitted model.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled data for evaluation; random_state fixes the split.
# Note: X_test / y_test are this held-out part of the training data, not the competition test set.
X_train, X_test, y_train, y_test = train_test_split(merged_sparse_train, y, test_size = 0.2, random_state = 42)

## Logistic Regression - Baseline

In [None]:
from sklearn.linear_model import SGDClassifier

# Logistic-regression baseline trained with stochastic gradient descent.
# random_state is fixed (same seed as the train/validation split) so the reported scores
# can be reproduced on a fresh run.
sgd_logit = SGDClassifier(loss='log_loss', random_state=42)
sgd_logit.fit(X_train, y_train)

In [None]:
# Predictions of the logistic baseline on the held-out split.
y_pred = sgd_logit.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 and overall accuracy on the held-out split.
print(classification_report(y_test, y_pred))

## SVM Classifier - Baseline

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear SVM baseline (hinge loss) trained with stochastic gradient descent.
# The variable keeps the name `sgd_logit` because the following prediction cell reads it,
# but from here on it holds the hinge-loss model, not the logistic one.
# random_state is fixed so the reported scores can be reproduced on a fresh run.
sgd_logit = SGDClassifier(loss='hinge', random_state=42)
sgd_logit.fit(X_train, y_train)

In [None]:
# Predictions of the hinge-loss (SVM) baseline; sgd_logit was re-bound to that model in the previous cell.
y_pred_hinge = sgd_logit.predict(X_test)

In [None]:
# Evaluation of the hinge-loss baseline on the held-out split.
print(classification_report(y_test, y_pred_hinge))

## Linear SVC - Baseline

In [None]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.svm import LinearSVC
# from sklearn.metrics import accuracy_score

In [None]:
# linear_svc_model = LinearSVC()
# linear_svc_model.fit(X_train, y_train)
# y_pred_svc = linear_svc_model.predict(X_test)

In [None]:
# print(classification_report(y_test, y_pred_svc))

## KNN - Baseline

In [None]:
# from sklearn.neighbors import KNeighborsClassifier

# knn_model = KNeighborsClassifier(n_neighbors = 300)

# knn_model.fit(X_train, y_train)

# predictions = knn_model.predict(X_test)

In [None]:
# print(classification_report(y_test, predictions))
# accuracy = 0.79

## Decision Tree Algorithm - Baseline

In [None]:
# from sklearn.tree import DecisionTreeClassifier
# tree = DecisionTreeClassifier()
# tree.fit(X_train, y_train)
# y_tree_pred = tree.predict(X_test)

# print(classification_report(y_tree_pred, y_test))

## MLP - Baseline

In [None]:
# from sklearn.neural_network import MLPClassifier
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# mlp_classifier = MLPClassifier(hidden_layer_sizes=(50, 50), activation='relu', max_iter=500, random_state=42)

# # Train the classifier
# mlp_classifier.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred_mlp = mlp_classifier.predict(X_test)

# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred_mlp)
# confusion = confusion_matrix(y_test, y_pred_mlp)
# classification_rep = classification_report(y_test, y_pred_mlp)

# print("Accuracy:", accuracy)
# print("Confusion Matrix:\n", confusion)
# print("Classification Report:\n", classification_rep)

### LinearSVC and SGDLogistic Models were found to give the best accuracy, hence they will be optimised.

## SGDClassifier with HPT

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Keep the 50,000 features with the highest chi-squared score against the target
kbest = SelectKBest(score_func=chi2, k = 50000)

# Logistic-regression classifier trained with SGD; random_state is fixed for reproducible scores
classifierSGD = SGDClassifier(loss = 'log_loss', alpha = 0.0001, max_iter = 50000, random_state = 42)

# Feature selection followed by the classifier, fitted as one estimator
pipeline = Pipeline([
    ('feature_selection', kbest),
    ('classifier', classifierSGD)
])

pipeline.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred_logit = pipeline.predict(X_test)
print(classification_report(y_test, y_pred_logit))

In [None]:
# Size of the held-out evaluation split.
X_test.shape

## LinearSVC With HPT

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Keep the 50,000 features with the highest chi-squared score against the target
kbest = SelectKBest(score_func=chi2, k = 50000)

# Linear SVM with hinge loss and L2 penalty. random_state is fixed because `pipeline`, as
# defined in this cell, is the model later used to predict the competition test set, so the
# submission should be reproducible.
classifierSVC = LinearSVC(penalty='l2', loss='hinge', tol=0.0001, C=1.0, random_state=42, max_iter=50000)

# Feature selection followed by the classifier, fitted as one estimator.
# This re-binds `pipeline`, replacing the SGD pipeline defined earlier.
pipeline = Pipeline([
    ('feature_selection', kbest),
    ('classifier', classifierSVC)
])

pipeline.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred_svc = pipeline.predict(X_test)
print(classification_report(y_test, y_pred_svc))

### HPT of linearSVC

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.pipeline import Pipeline
# from sklearn.feature_selection import SelectKBest, f_classif, chi2
# from sklearn.linear_model import LinearSVC

# # Create a SelectKBest instance with f_classif scoring
# kbest = SelectKBest(score_func=chi2)

# # Create a classifier instance (e.g., SVM)
# classifierSVC = linearSVC

# # Create a pipeline with SelectKBest and the classifier
# pipeline = Pipeline([
#     ('feature_selection', kbest),
#     ('classifier', classifierSVC)
# ])

# # Define the parameter grid for tuning
# param_grid = {
#     'feature_selection__k': [500, 1000, 5000, 10000, 50000],  # List of values for k
#     'classifier__penalty': ['l1', 'l2'],# Penalty for the classifier (e.g., for SVM)
#     'classifier__loss': ['hinge', 'squared-hinge'],# Penalty for the classifier (e.g., for SVM)
#     'classifier__max_iter': [5000, 10000, 50000],
#     'classifier__tol' : [0.0001, 0.001, 0.01],
#     'classifier__C' : [0.1, 1, 10, 100]# Maximum number of iterations for the classifier
# }

# # Create a GridSearchCV instance
# grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)

# # Fit the GridSearchCV on your data
# grid_search.fit(X_train, y_train)

In [None]:
# # Print the best parameters and score
# print("Best Parameters:", grid_search.best_params_)
# print("Best Score:", grid_search.best_score_)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import LearningCurveDisplay, ShuffleSplit

# Two side-by-side panels sharing the y axis, one per estimator
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 6), sharey=True)

# Settings shared by both learning curves: five training-set sizes from 10% to 100% of
# X_train, with both training and test scores drawn.
common_params = dict(
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 5),
    score_type="both",
    n_jobs=4,
    line_kw={"marker": "o"},
    std_display_style="fill_between",
    score_name="Accuracy",
)

# The bare classifiers are used here, i.e. without the SelectKBest step of the pipelines
for panel, estimator in zip(ax, (classifierSGD, classifierSVC)):
    LearningCurveDisplay.from_estimator(estimator, **common_params, ax=panel)
    handles, label = panel.get_legend_handles_labels()
    panel.legend(handles[:2], ["Training Score", "Test Score"])
    panel.set_title(f"Learning Curve for {estimator.__class__.__name__}")

In [None]:
# Predict the competition test set with the most recently fitted `pipeline`
test_preds = pipeline.predict(merged_sparse_test)

# Translate the numeric predictions back to the label strings: 1 -> 'POSITIVE', anything else -> 'NEGATIVE'
sentiment_labels = np.where(test_preds == 1, 'POSITIVE', 'NEGATIVE')

# Submission frame: the row index of the test frame as 'id', followed by the predicted label
output = pd.DataFrame({'id': merged_test.index, 'sentiment': sentiment_labels})

# Write the submission file without the DataFrame index
output.to_csv('submission.csv', index=False)

In [None]:
# Number of rows and columns in the submission.
output.shape