#  TASK - 2 : MOVIE RATING PREDICTION

Author : Amaan

Domain : Data Science

Batch : August-2024



# IMPORTING LIBRARIES AND READING THE DATASET :-

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [32]:
# UPLOADING THE FILE...

from google.colab import files
uploaded = files.upload()


Saving IMDb Movies India.csv to IMDb Movies India (2).csv


In [None]:
# READING THE CSV FILE...

import io
data= pd.read_csv(io.BytesIO(uploaded['IMDb Movies India.csv']),encoding = 'unicode_escape')


In [33]:
data.head(10)

Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,2019,110,Comedy,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
3,2019,110,Romance,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,1997,147,Comedy,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
5,1997,147,Drama,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
5,1997,147,Musical,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,2005,142,Drama,7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
6,2005,142,Romance,7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
6,2005,142,War,7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,2012,82,Horror,5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia


In [34]:
data.shape

(11979, 9)

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


# DATA CLEANING :-

In [None]:
data.isnull().sum()

Unnamed: 0,0
Name,0
Year,528
Duration,8269
Genre,1877
Rating,7590
Votes,7589
Director,525
Actor 1,1617
Actor 2,2384
Actor 3,3144


In [None]:
data.duplicated().sum()

6

In [None]:
data.dropna(inplace=True)

In [None]:
data.shape

(5659, 10)

In [None]:
data.columns

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')

# DATA PRE-PROCESSING :-

In [None]:
# Replacing the brackets from your column :-
data['Year'] = data['Year'].astype(str).str.replace(r'[()]','',regex=True).astype(int)

In [None]:
# Remove the min word from 'Duration' cloumn and convert all values to numeric :-
data['Duration'] = data['Duration'].astype(str).str.replace(r' min','').astype(int)

In [None]:
# Splitting the Genre by, to keep unique genres and replace null values with mode :-
data['Genre'] = data['Genre'].str.split(', ')
data = data.explode('Genre')
data['Genre'].fillna(data['Genre'].mode()[0], inplace=True)

In [None]:
# Convert 'Votes' to numeric and repalcing commas to keep only numeric part :-
data['Votes'] = pd.to_numeric(data['Votes'].str.replace(',',''))

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11979 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      11979 non-null  object 
 1   Year      11979 non-null  int64  
 2   Duration  11979 non-null  int64  
 3   Genre     11979 non-null  object 
 4   Rating    11979 non-null  float64
 5   Votes     11979 non-null  int64  
 6   Director  11979 non-null  object 
 7   Actor 1   11979 non-null  object 
 8   Actor 2   11979 non-null  object 
 9   Actor 3   11979 non-null  object 
dtypes: float64(1), int64(3), object(6)
memory usage: 1.0+ MB


# DATA VISUALIZATION :-

In [None]:
# CREATING A HISTOGRAM TO EXAMINE DATA OVER THE YEARS...

data_sorted = data.sort_values('Year')
year = px.histogram(data_sorted, x='Year', title='Distribution of Movies Over the Years', nbins=30, labels={'Year': 'Year of Release'}, template='plotly_white')

year.update_layout(title_x=0.5)
year.update_layout(xaxis_title='Year of Release',yaxis_title='Number of Movies',font=dict(family='Arial', size=14),bargap=0.1)

year.show()

In [None]:
# CALCULATING MIN, MAX, AND AVERAGE MOVIES RELEASED PER YEAR...

min_movies = data_sorted.groupby('Year')['Year'].count().min()
max_movies = data_sorted.groupby('Year')['Year'].count().max()
avg_movies = data_sorted.groupby('Year')['Year'].count().mean()

print(f"Minimum movies released in a year: {min_movies}")
print(f"Maximum movies released in a year: {max_movies}")
print(f"Average movies released per year: {avg_movies:.2f}")


Minimum movies released in a year: 2
Maximum movies released in a year: 423
Average movies released per year: 131.64


In [None]:
# CALCULATING AVERAGE RATING BY YEAR FOR TOP GENRES...

avg_rating_by_year = data.groupby(['Year', 'Genre'])['Rating'].mean().reset_index()

top_genres = data['Genre'].value_counts().head(10).index

average_ratings = avg_rating_by_year[avg_rating_by_year['Genre'].isin(top_genres)]

fig = px.line(average_ratings, x='Year', y='Rating', color='Genre', title='Average Movie Rating by Year for Top Genres')
fig.update_layout(title_text='Average Movie Rating by Year for Top Genres', title_x=0.5, title_y=0.95)
fig.update_layout(xaxis_title='Year', yaxis_title='Average Rating', font=dict(family='Arial', size=14), legend_title_text='Genre')
fig.show()

In [None]:
# SHOWING THE DISTRIBUTION OF RATINGS AND ITS PROBABILITY DENSITY USING HISTOGRAM...

fig = px.histogram(data, x='Rating', histnorm='probability density', title='Distribution of Movie Ratings', nbins=20, labels={'Rating': 'Movie Rating'}, template='plotly_white')

fig.update_layout(title_x=0.5)
fig.update_layout(xaxis_title='Movie Rating', yaxis_title='Probability Density', font=dict(family='Arial', size=14), bargap=0.1)

fig.show()


# FEATURE ENGINEERING

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [36]:
print(data.columns)

Index(['Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director', 'Actor 1',
       'Actor 2', 'Actor 3'],
      dtype='object')


In [37]:
genre_mean_ratings = data.groupby('Genre')['Rating'].transform('mean')
data['Genre_Mean_Rating'] = genre_mean_ratings

director_mean_ratings = data.groupby('Director')['Rating'].transform('mean')
data['Director_Mean_Rating'] = director_mean_ratings

actor1_mean_ratings = data.groupby('Actor 1')['Rating'].transform('mean')
data['Actor1_Mean_Rating'] = actor1_mean_ratings

actor2_mean_ratings = data.groupby('Actor 2')['Rating'].transform('mean')
data['Actor2_Mean_Rating'] = actor2_mean_ratings

actor3_mean_ratings = data.groupby('Actor 3')['Rating'].transform('mean')
data['Actor3_Mean_Rating'] = actor3_mean_ratings

In [38]:
# MAKING THE PREDICTOR AND TARGET VARIABLE...

X = data[['Year', 'Duration', 'Genre_Mean_Rating', 'Director_Mean_Rating', 'Actor1_Mean_Rating', 'Actor2_Mean_Rating', 'Actor3_Mean_Rating']]
y = data['Rating']

In [39]:
# SPLITTING THE DATASET INTO TRAINING AND TESTING...

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# BULIDING THE MODEL :-

In [40]:
ML = LinearRegression()
ML.fit(X_train, y_train)
ML_pred = ML.predict(X_test)

In [43]:
# PERFORMANCE EVALUATION OF THE MODE...

print('The performance of the model is as follows:')
print('Mean Squared Error: ', mean_squared_error(y_test, ML_pred))
print('Root Mean Squared Error: ', np.sqrt(mean_squared_error(y_test, ML_pred)))
print('R-squared:', r2_score(y_test, ML_pred))

The performance of the model is as follows:
Mean Squared Error:  0.45491154926519856
Root Mean Squared Error:  0.6744713109281955
R-squared: 0.7596933018879729


# TESTING THE MODEL :-

In [45]:
X.head(5)

Unnamed: 0,Year,Duration,Genre_Mean_Rating,Director_Mean_Rating,Actor1_Mean_Rating,Actor2_Mean_Rating,Actor3_Mean_Rating
1,2019,109,6.056744,7.0,6.85,7.0,7.0
3,2019,110,5.751042,4.4,5.25,4.4,4.46
3,2019,110,5.811087,4.4,5.25,4.4,4.46
5,1997,147,5.751042,5.335135,4.793617,5.73,5.93
5,1997,147,6.056744,5.335135,4.793617,5.73,5.93


In [46]:
y.head(5)

Unnamed: 0,Rating
1,7.0
3,4.4
3,4.4
5,4.7
5,4.7


In [52]:
# WE CREATE NEW DATAFRAME FOR TESTING...

data = {'Year': [2023], 'Duration':[111], 'Genre_Mean_Rating': [5.8], 'Director_Mean_Rating': [4.5],
        'Actor1_Mean_Rating': [5.3], 'Actor2_Mean_Rating': [6.2], 'Actor3_Mean_Rating': [7.1]}
new_data = pd.DataFrame(data)

In [55]:
# PREDICT MOVIE RATING BY ENTERING DATA...

predicted_rating = ML.predict(new_data)
print('Predicted Rating:', predicted_rating[0])

Predicted Rating: 5.641133173885087
