# Task 2 - Movie Rating Prediction

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

## 1. Read Data and Basic Data Quality Check

In [2]:
import pandas as pd
df = pd.read_csv('IMDb Movies India.csv', encoding='latin-1')
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [3]:
# check row/column count, column names, column-wise not null value count, column data type 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [4]:
# check if any duplicate entries
df[df.duplicated()]

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1250,Arab Ka Sona - Abu Kaalia,(1979),,Action,,,Master Bhagwan,Meena Rai,Dara Singh,
1769,Balidan,(1992),,Drama,,,,,,
4723,First Time - Pehli Baar,(2009),,,,,Raja Bundela,Zeenat Aman,Nitin Arora,Raj Babbar
9713,Musafir,,,Thriller,,,Shiva Dagar,,,
13069,Shivani,(2019),,Crime,,,Ugresh Prasad Ujala,Santosh,,
13308,Slumdog Karodpati,(2019),118 min,Thriller,,,Rajesh Patole,Udhav Garje,Rahul Gavane,Govindrao


In [5]:
# delete duplicated rows identified
df.drop_duplicates(inplace=True)

In [6]:
# verfiy if duplicates removed
df.duplicated().sum()

0

### Dropping statistically insignificant columns

In [7]:
df.drop(columns=['Name', 'Duration'], axis=1, inplace=True)
df.head()

Unnamed: 0,Year,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,(2019),Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,(2021),"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,(2019),"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,(2010),Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


### Handling missing values

In [8]:
# check missing value count
df.isna().sum()

Year         527
Genre       1876
Rating      7584
Votes       7583
Director     524
Actor 1     1615
Actor 2     2381
Actor 3     3140
dtype: int64

#### replacing missing Rating with average Rating

In [9]:
m = df['Rating'].mean()
df['Rating'].fillna(m, inplace=True)

#### replacing other column missing values using mode

In [10]:
cat = list(df.columns[df.dtypes=='object'])

In [11]:
for i in cat:
    md = df[i].mode()[0]
    df[i].fillna(md, inplace=True)

In [12]:
df.isna().sum()

Year        0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

### Conveert Year and Votes datatype to integer

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15503 entries, 0 to 15508
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Year      15503 non-null  object 
 1   Genre     15503 non-null  object 
 2   Rating    15503 non-null  float64
 3   Votes     15503 non-null  object 
 4   Director  15503 non-null  object 
 5   Actor 1   15503 non-null  object 
 6   Actor 2   15503 non-null  object 
 7   Actor 3   15503 non-null  object 
dtypes: float64(1), object(7)
memory usage: 1.1+ MB


In [14]:
# Convert Year column to integer
df['Year'] = df['Year'].str.strip('()').astype(int)

In [15]:
# Convert Votes column to integer
df['Votes'] = df['Votes'].str.replace(',','')
df['Votes'] = df['Votes'].str.replace('.','')
df['Votes'] = df['Votes'].str.strip('$')
df['Votes'] = df['Votes'].str.strip('M')
df['Votes'] = df['Votes'].astype(int)

In [16]:
df.nunique()

Year         102
Genre        485
Rating        85
Votes       2034
Director    5938
Actor 1     4718
Actor 2     4891
Actor 3     4820
dtype: int64

### We can see that 5 of the columns have High Cardinality. So we will be using Target encoding calculation group aggregation (mean) and replacing with mean values.
### 5 columns --> Votes, Director, Actor 1, Actor 2. Actor 3

In [17]:
df['Genre Encoded'] = round(df.groupby('Genre')['Rating'].transform('mean'),1)
df['Votes Encoded'] = round(df.groupby('Votes')['Rating'].transform('mean'),1)
df['Director Encoded'] = round(df.groupby('Director')['Rating'].transform('mean'),1)
df['Actor 1 Encoded'] = round(df.groupby('Actor 1')['Rating'].transform('mean'),1)
df['Actor 2 Encoded'] = round(df.groupby('Actor 2')['Rating'].transform('mean'),1)
df['Actor 3 Encoded'] = round(df.groupby('Actor 3')['Rating'].transform('mean'),1)

In [18]:
df.drop(columns=['Genre', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'], inplace=True)
df['Rating'] = round(df['Rating'],1)
df.head()

Unnamed: 0,Year,Rating,Genre Encoded,Votes Encoded,Director Encoded,Actor 1 Encoded,Actor 2 Encoded,Actor 3 Encoded
0,2019,5.8,6.0,5.8,5.8,5.8,5.5,5.8
1,2019,7.0,6.0,5.8,7.0,6.8,7.0,7.0
2,2021,5.8,6.3,5.8,5.8,6.2,6.8,5.8
3,2019,4.4,5.7,5.9,4.4,5.4,4.4,4.4
4,2010,5.8,6.0,5.8,6.3,6.8,5.8,5.5


## 2. Separate Independent (X) and Target (Y) features 

In [19]:
# Separate Independent (X) and Target (Y) features
X = df.drop(columns=['Rating'])
Y = df['Rating']

In [20]:
X.head()

Unnamed: 0,Year,Genre Encoded,Votes Encoded,Director Encoded,Actor 1 Encoded,Actor 2 Encoded,Actor 3 Encoded
0,2019,6.0,5.8,5.8,5.8,5.5,5.8
1,2019,6.0,5.8,7.0,6.8,7.0,7.0
2,2021,6.3,5.8,5.8,6.2,6.8,5.8
3,2019,5.7,5.9,4.4,5.4,4.4,4.4
4,2010,6.0,5.8,6.3,6.8,5.8,5.5


In [21]:
Y.head()

0    5.8
1    7.0
2    5.8
3    4.4
4    5.8
Name: Rating, dtype: float64

## 3. Create Pipelie for Feature Selection

In [22]:
# Feature Selection
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
modelf = LinearRegression()
sel = SequentialFeatureSelector(estimator=modelf, n_features_to_select='auto', direction='backward')
sel.fit_transform(X, Y)
sel_col = sel.get_feature_names_out()

In [23]:
sel_col

array(['Votes Encoded', 'Director Encoded', 'Actor 1 Encoded',
       'Actor 3 Encoded'], dtype=object)

In [24]:
# Getting important columns in a list
imp_col = []
for i in sel_col:
    imp_col.append(i)

In [25]:
imp_col

['Votes Encoded', 'Director Encoded', 'Actor 1 Encoded', 'Actor 3 Encoded']

In [26]:
X_sel = X[imp_col]
X_sel.head()

Unnamed: 0,Votes Encoded,Director Encoded,Actor 1 Encoded,Actor 3 Encoded
0,5.8,5.8,5.8,5.8
1,5.8,7.0,6.8,7.0
2,5.8,5.8,6.2,5.8
3,5.9,4.4,5.4,4.4
4,5.8,6.3,6.8,5.5


## 3. Apply Train-Test Split

In [27]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X_sel, Y, test_size=0.25, random_state=42)

In [28]:
xtrain.shape

(11627, 4)

In [29]:
xtest.shape

(3876, 4)

In [30]:
xtrain.head()

Unnamed: 0,Votes Encoded,Director Encoded,Actor 1 Encoded,Actor 3 Encoded
6089,5.8,5.8,6.3,5.6
428,6.6,6.2,6.4,6.2
13190,5.8,5.8,5.8,5.9
14122,5.8,5.8,5.8,5.8
15226,6.1,5.8,6.8,5.9


In [31]:
ytrain.head()

6089     5.8
428      6.6
13190    5.8
14122    5.8
15226    6.8
Name: Rating, dtype: float64

In [32]:
xtest.head()

Unnamed: 0,Votes Encoded,Director Encoded,Actor 1 Encoded,Actor 3 Encoded
11935,5.3,6.0,4.9,6.1
11402,6.3,5.6,5.3,3.0
4924,5.8,5.9,5.9,6.0
15257,5.8,5.8,5.8,5.8
6708,5.8,5.8,5.8,5.9


In [33]:
ytest.head()

11935    6.1
11402    3.6
4924     5.8
15257    5.8
6708     5.8
Name: Rating, dtype: float64

## 4. Model Building and Evaluation

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

# we are not evaluating KNN and SVM as the dataset has >2000 data rows

In [35]:
dct = {'Linear' : LinearRegression(),
       'Decision Tree' : DecisionTreeRegressor(),
       'Random Forest' : RandomForestRegressor(),
       'Gradient Boosting' : GradientBoostingRegressor()}

In [36]:
dct.items()

dict_items([('Linear', LinearRegression()), ('Decision Tree', DecisionTreeRegressor()), ('Random Forest', RandomForestRegressor()), ('Gradient Boosting', GradientBoostingRegressor())])

In [37]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
Train_MSE = []
Test_MSE = []
Train_R2 = []
Test_R2 = []
Train_CV_R2 = []
for name, model in dct.items():
    # Fit the model
    model.fit(xtrain, ytrain)
    # Predict results for Train and Test
    ypred_train = model.predict(xtrain)
    ypred_test = model.predict(xtest)
    # Get MSE, R2 score
    mse_train = mean_squared_error(ytrain, ypred_train)
    mse_test = mean_squared_error(ytest, ypred_test)
    r2_train = r2_score(ytrain, ypred_train)
    r2_test = r2_score(ytest, ypred_test)
    # Get Cross validated scores
    scores_r2 = cross_val_score(model, xtrain, ytrain, cv=5, scoring = 'r2')
    r2_cv = scores_r2.mean()
    # Append above results in list
    Train_MSE.append(mse_train)
    Test_MSE.append(mse_test)
    Train_R2.append(r2_train)
    Test_R2.append(r2_test)
    Train_CV_R2.append(r2_cv)

In [38]:
# Create dictionary with the metrics results
res = {'Name':list(dct.keys()),
       'Train_MSE':Train_MSE,
       'Test_MSE':Test_MSE,
       'Train_R2':Train_R2,
       'Test_R2':Test_R2,
       'Train_CV_R2':Train_CV_R2}

In [39]:
# Create a data frame with above results
df_res = pd.DataFrame(res)
df_res.sort_values('Train_CV_R2', ascending=False)

Unnamed: 0,Name,Train_MSE,Test_MSE,Train_R2,Test_R2,Train_CV_R2
3,Gradient Boosting,0.205884,0.223606,0.787657,0.774873,0.769096
2,Random Forest,0.042362,0.252217,0.956309,0.746068,0.75647
0,Linear,0.262875,0.258164,0.728879,0.74008,0.727546
1,Decision Tree,0.014424,0.465139,0.985123,0.531698,0.556314


### Based on above scores, we can say that Random Forest is performing better than other models

In [40]:
params = {'n_estimators':[100, 200],
          'max_depth':[3, 4, 5, 6],
          'min_samples_split':[5, 6, 7, 8],
          'criterion':['squared_error','absolute_error']}

In [41]:
# Using RandomizedSearchCV
rfr = RandomForestRegressor()
rscv = RandomizedSearchCV(rfr, param_distributions=params, cv=5, scoring='neg_mean_squared_error')
rscv.fit(xtrain, ytrain)

In [42]:
rscv.best_params_

{'n_estimators': 100,
 'min_samples_split': 5,
 'max_depth': 6,
 'criterion': 'squared_error'}

In [43]:
best_rfr = rscv.best_estimator_
best_rfr

### Evaluate the Model

In [44]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [45]:
def eval_model(model,xtrain,ytrain):
    # Fit the model
    model.fit(xtrain,ytrain)
    # Predict the scores
    ypred_test = model.predict(xtest)
    # Calculate MSE,RMSE,MAE,R2 scores
    mse = mean_squared_error(ytest,ypred_test)
    rmse = mse**(1/2)
    mae = mean_absolute_error(ytest,ypred_test)
    r2 = r2_score(ytest,ypred_test)
    return mse,rmse,mae,r2

In [46]:
(MSE,RMSE,MAE,r2) = eval_model(best_rfr,xtrain,ytrain)
print(f'Evaluation Metrics: \nMSE: {MSE}\nRMSE:{RMSE}\nMAE:{MAE}\nR2:{r2}')

Evaluation Metrics: 
MSE: 0.2560445748482223
RMSE:0.506008473099238
MAE:0.2868106800619718
R2:0.7422142461649927


## 6. Let's create XGBoost if the scores improve

In [47]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [48]:
from xgboost import XGBRegressor

In [49]:
model = XGBRegressor()
model.fit(xtrain,ytrain)

In [50]:
model.score(xtrain,ytrain)

0.8838348635350373

In [51]:
model.score(xtest,ytest)

0.7603417780879316

In [52]:
from sklearn.model_selection import GridSearchCV

In [53]:
params1 = {'n_estimators':[200,300,500,600,800,1000],
          'learning_rate':[0.05,0.1,0.2,0.3],
          'max_depth':[5,6,7,8,9,10],
          'min_child_weight':[1,2,3],
          'objective':['reg:squarederror'],
          'gamma':[0.1,0.2,0.3,0.4]}

In [54]:
gscv1 = GridSearchCV(model,params1,scoring='neg_mean_squared_error',cv=5)
gscv1.fit(xtrain,ytrain)

In [55]:
gscv1.best_params_

{'gamma': 0.1,
 'learning_rate': 0.05,
 'max_depth': 6,
 'min_child_weight': 3,
 'n_estimators': 500,
 'objective': 'reg:squarederror'}

In [56]:
best_xgb1 = gscv1.best_estimator_
best_xgb1

In [57]:
best_xgb1.score(xtrain,ytrain)

0.8283468333584629

In [58]:
best_xgb1.score(xtest,ytest)

0.7818532921778888

In [59]:
params2 = {'subsample':[0.5,0.6,0.7,0.8,0.9,1],
           'colsample_bytree':[0.5,0.6,0.7,0.8,0.9,1]}

In [60]:
gscv2 = GridSearchCV(best_xgb1,params2,cv=5,scoring='neg_mean_squared_error')
gscv2.fit(xtrain,ytrain)

In [61]:
gscv2.best_params_

{'colsample_bytree': 0.8, 'subsample': 1}

In [62]:
best_xgb2 = gscv2.best_estimator_
best_xgb2

In [63]:
best_xgb2.score(xtrain,ytrain)

0.8222012973196333

In [64]:
best_xgb2.score(xtest,ytest)

0.7861891082904899

### Evalute XGBoost

In [65]:
(MSE,RMSE,MAE,r2) = eval_model(best_xgb2,xtrain,ytrain)
print(f'Evaluation Metrics: \nMSE: {MSE}\nRMSE:{RMSE}\nMAE:{MAE}\nR2:{r2}')

Evaluation Metrics: 
MSE: 0.21236673497760378
RMSE:0.4608326539836384
MAE:0.2600414836062243
R2:0.7861891082904899


### Conclusion: Based on the abive scores, we can say that XGBoost is performing better and can be use for prediction.

## 7. Model Prediction --> Compared ytest and predicted ytest

ypred_test = best_xgb2.predict(xtest)
ypred_test[o:10]

In [66]:
ytest.head(10)

11935    6.1
11402    3.6
4924     5.8
15257    5.8
6708     5.8
7190     5.8
10872    6.7
3100     5.8
3316     7.2
7767     4.2
Name: Rating, dtype: float64

In [67]:
df_final = xtest
df_final['Rating_Pred'] = ypred_test
df_final

Unnamed: 0,Votes Encoded,Director Encoded,Actor 1 Encoded,Actor 3 Encoded,Rating_Pred
11935,5.3,6.0,4.9,6.1,5.515493
11402,6.3,5.6,5.3,3.0,3.930323
4924,5.8,5.9,5.9,6.0,5.794250
15257,5.8,5.8,5.8,5.8,5.788873
6708,5.8,5.8,5.8,5.9,5.793867
...,...,...,...,...,...
3380,5.8,6.2,5.8,6.2,5.994182
9676,5.7,5.3,5.7,5.0,4.854150
1084,5.8,6.9,6.4,5.9,6.503853
9047,5.8,5.8,5.8,5.8,5.788873


### We can see that xtest abd predicted ytest and the predicted ytest are almost the same. Hence, XGBoost is a good model that can be used for preciting the movie rating