# #MOVIE RATING PREDICTION WITH PYTHON

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

## Load the dataset 

In [2]:
import pandas as pd
df = pd.read_csv('IMDb Movies India.csv',na_values=(' '),encoding='latin-1')
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


### Perform basic data quality checks

In [3]:
df.shape

(15509, 10)

In [4]:
df.columns

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15508 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [6]:
df.duplicated().sum()

6

In [7]:
# drop the duplicated rows
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [8]:
df.isna().sum()

Name           1
Year         527
Duration    8264
Genre       1876
Rating      7584
Votes       7583
Director     524
Actor 1     1615
Actor 2     2381
Actor 3     3140
dtype: int64

In [9]:
df.drop(columns=['Name','Duration'],axis=1,inplace=True)

In [10]:
df.shape

(15503, 8)

In [11]:
df.nunique()

Year         102
Genre        485
Rating        84
Votes       2034
Director    5938
Actor 1     4718
Actor 2     4891
Actor 3     4820
dtype: int64

In [12]:
cat = list(df.columns[df.dtypes=='object'])

In [13]:
mn = df['Rating'].mean()
df['Rating'].fillna(mn,inplace=True)

In [14]:
for i in cat:
    m = df[i].mode()[0]
    df[i].fillna(m,inplace=True)

In [15]:
df.isna().sum()

Year        0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

In [16]:
# Lets convert Date column from string to int
df['Year']= df['Year'].str.strip('()').astype(int)
# Lets convert Votes column from string to int
df['Votes'] = df['Votes'].str.replace(',','')
df['Votes'] = df['Votes'].str.replace('.','')
df['Votes'] = df['Votes'].str.strip('$')
df['Votes'] = df['Votes'].str.strip('M')
df['Votes'] = df['Votes'].astype(int)

In [17]:
df.nunique()

Year         102
Genre        485
Rating        85
Votes       2034
Director    5938
Actor 1     4718
Actor 2     4891
Actor 3     4820
dtype: int64

### Encoding Categorical features using Target encoding-Mean encoding

In [18]:
## performing mean encoding
df['Genre encoded'] = round(df.groupby('Genre')['Rating'].transform('mean'),1)
df['Votes encoded'] = round(df.groupby('Votes')['Rating'].transform('mean'),1)
df['Director encoded'] = round(df.groupby('Director')['Rating'].transform('mean'),1)
df['Actor 1 encoded'] = round(df.groupby('Actor 1')['Rating'].transform('mean'),1)
df['Actor 2 encoded'] = round(df.groupby('Actor 2')['Rating'].transform('mean'),1)
df['Actor 3 encoded'] = round(df.groupby('Actor 3')['Rating'].transform('mean'),1)

df.drop(columns=['Genre','Votes','Director','Actor 1','Actor 2','Actor 3'],inplace=True)
df['Rating'] = round(df['Rating'],1)
df.head()

Unnamed: 0,Year,Rating,Genre encoded,Votes encoded,Director encoded,Actor 1 encoded,Actor 2 encoded,Actor 3 encoded
0,2019,5.8,6.0,5.8,5.8,5.8,5.5,5.8
1,2019,7.0,6.0,5.8,7.0,6.8,7.0,7.0
2,2021,5.8,6.3,5.8,5.8,6.2,6.8,5.8
3,2019,4.4,5.7,5.9,4.4,5.4,4.4,4.4
4,2010,5.8,6.0,5.8,6.3,6.8,5.8,5.5


### Separate X and Y features

In [19]:
X = df.drop(columns='Rating')
Y =df[['Rating']]

In [20]:
X.head()

Unnamed: 0,Year,Genre encoded,Votes encoded,Director encoded,Actor 1 encoded,Actor 2 encoded,Actor 3 encoded
0,2019,6.0,5.8,5.8,5.8,5.5,5.8
1,2019,6.0,5.8,7.0,6.8,7.0,7.0
2,2021,6.3,5.8,5.8,6.2,6.8,5.8
3,2019,5.7,5.9,4.4,5.4,4.4,4.4
4,2010,6.0,5.8,6.3,6.8,5.8,5.5


In [21]:
Y.head()

Unnamed: 0,Rating
0,5.8
1,7.0
2,5.8
3,4.4
4,5.8


### Split the dataset into training data and testing data

In [22]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X,Y,test_size=0.25,random_state=21)

In [23]:
xtrain.shape

(11627, 7)

In [24]:
xtest.shape

(3876, 7)

In [25]:
ytrain.shape

(11627, 1)

In [26]:
ytest.shape

(3876, 1)

## Build the model

### Performing Algorithm Evaluaton to check which regression models give best results 

In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import cross_val_score

In [28]:
dct = {
    'Linear':LinearRegression(),
    'DecisionTree':DecisionTreeRegressor(),
    'RandomForest':RandomForestRegressor(),
    'KNN':KNeighborsRegressor(),
}

In [29]:
dct.items()

dict_items([('Linear', LinearRegression()), ('DecisionTree', DecisionTreeRegressor()), ('RandomForest', RandomForestRegressor()), ('KNN', KNeighborsRegressor())])

In [30]:
train_mse = []
train_r2 = []
test_mse = []
test_r2 = []
train_cv = []

for name,model in dct.items():
    # fit the model
    m = model.fit(xtrain,ytrain)
    ypred_train = m.predict(xtrain)
    ypred_test = m.predict(xtest)
    # calculate MSE
    mse_train = mean_squared_error(ytrain,ypred_train)
    mse_test = mean_squared_error(ytest,ypred_test)
    # caluclate R2
    r2_train = (r2_score(ytrain,ypred_train))*100
    r2_test = (r2_score(ytest,ypred_test))*100
    # calculate cross validated scores
    cv = cross_val_score(m,xtrain,ytrain,cv=5,scoring='r2')
    scores = (cv.mean())*100

    # add these values to the respective list to compare the output
    train_mse.append(mse_train)
    train_r2.append(r2_train)
    test_mse.append(mse_test)
    test_r2.append(r2_test)
    train_cv.append(scores)

    # print the results
    print(f'Scores for {name}')
    print("Training Scores")
    print(f'MSE:{train_mse}')
    print(f'R2:{train_r2}')
    print("Testing Scores")
    print(f'MSE:{test_mse}')
    print(f'R2:{test_r2}')
    print(f'CV:{train_cv}')
    print("===============================")

Scores for Linear
Training Scores
MSE:[0.24292205405189032]
R2:[74.57212477515299]
Testing Scores
MSE:[0.2571004498548369]
R2:[75.18600338447983]
CV:[74.5059875860101]
Scores for DecisionTree
Training Scores
MSE:[0.24292205405189032, 0.00020885295719732806]
R2:[74.57212477515299, 99.97813830878106]
Testing Scores
MSE:[0.2571004498548369, 0.40976816018805184]
R2:[75.18600338447983, 60.45131097283082]
CV:[74.5059875860101, 57.86710387680787]
Scores for RandomForest
Training Scores
MSE:[0.24292205405189032, 0.00020885295719732806, 0.026879260250386842]
R2:[74.57212477515299, 99.97813830878106, 97.186412413436]
Testing Scores
MSE:[0.2571004498548369, 0.40976816018805184, 0.2084252049529661]
R2:[75.18600338447983, 60.45131097283082, 79.88388455480293]
CV:[74.5059875860101, 57.86710387680787, 79.54758240715472]
Scores for KNN
Training Scores
MSE:[0.24292205405189032, 0.00020885295719732806, 0.026879260250386842, 0.17929219919153694]
R2:[74.57212477515299, 99.97813830878106, 97.186412413436, 

In [31]:
res = {'Name':list(dct.keys()),
       'MSE Training Scores':train_mse,
       'MSE Testing Scores':test_mse,
       'R2 Training Scores':train_r2,
       'R2 Testing Scores':test_r2,
       'CV Training Scores':train_cv}

In [32]:
df_res = pd.DataFrame(res)
df_res.sort_values('CV Training Scores',ascending=False)

Unnamed: 0,Name,MSE Training Scores,MSE Testing Scores,R2 Training Scores,R2 Testing Scores,CV Training Scores
2,RandomForest,0.026879,0.208425,97.186412,79.883885,79.547582
0,Linear,0.242922,0.2571,74.572125,75.186003,74.505988
3,KNN,0.179292,0.285027,81.232582,72.490635,70.760266
1,DecisionTree,0.000209,0.409768,99.978138,60.451311,57.867104


#### Lets consider Random Forest Regressor as its giving good results

In [33]:
model = RandomForestRegressor()
model.fit(xtrain,ytrain)

In [34]:
model.score(xtrain,ytrain)

0.9718729138673703

In [35]:
model.score(xtest,ytest)

0.7998058202711013

#### Lets consider Decision Tree Regressor as its giving good results

In [36]:
model2 = DecisionTreeRegressor()
model2.fit(xtrain,ytrain)

In [37]:
model2.score(xtrain,ytrain)

0.9997813830878106

In [38]:
model2.score(xtest,ytest)

0.6012511288611768

## Evaluate the models : Random Forest and Decision Tree

In [39]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [40]:
def eval_model(model,xtrain,ytrain):
    # Fit the model
    model.fit(xtrain,ytrain)
    # Predict the scores
    ypred_test = model.predict(xtest)
    # Calculate MSE,RMSE,MAE,R2 scores
    mse = mean_squared_error(ytest,ypred_test)
    rmse = mse**(1/2)
    mae = mean_absolute_error(ytest,ypred_test)
    r2 = r2_score(ytest,ypred_test)
    return mse,rmse,mae,r2

### Random Forest Evaluation Metrics

In [41]:
(MSE,RMSE,MAE,r2) = eval_model(model,xtrain,ytrain)
print(f'Evaluation Metrics: \nMSE: {MSE}\nRMSE:{RMSE}\nMAE:{MAE}\nR2:{r2}')

Evaluation Metrics: 
MSE: 0.20842281504737528
RMSE:0.4565334763709834
MAE:0.2361761736178353
R2:0.7988411521604555


### Decision Tree Evaluation Metrics

In [42]:
(MSE,RMSE,MAE,r2) = eval_model(model2,xtrain,ytrain)
print(f'Evaluation Metrics: \nMSE: {MSE}\nRMSE:{RMSE}\nMAE:{MAE}\nR2:{r2}')

Evaluation Metrics: 
MSE: 0.41160252551312926
RMSE:0.6415625655484656
MAE:0.32182232542142897
R2:0.6027426758378269


#### DecisionTree provides the best test score as compared to other models. Considering DecisionTree for final prediction

## Model Prediction

In [43]:
ypred_test = model2.predict(xtest)
ypred_test[:10]

array([5.8, 8.1, 5.5, 2.4, 5.8, 5.8, 5.8, 6.4, 5.8, 6.2])

In [44]:
ytest.head(10)

Unnamed: 0,Rating
3077,5.8
10882,8.1
5424,5.6
4170,1.9
5928,5.8
579,5.8
2406,5.8
12686,6.8
2026,5.8
6573,6.0


In [45]:
df_final = xtest
df_final['Predicted_Rating'] = ypred_test
df_final

Unnamed: 0,Year,Genre encoded,Votes encoded,Director encoded,Actor 1 encoded,Actor 2 encoded,Actor 3 encoded,Predicted_Rating
3077,1954,6.0,5.8,6.0,5.8,6.6,5.8,5.8
10882,2014,6.2,8.1,8.1,6.9,6.4,5.1,8.1
5424,2000,5.3,5.6,5.0,5.6,5.8,5.1,5.5
4170,2013,5.4,5.4,3.9,1.9,3.5,1.9,2.4
5928,2004,5.6,5.8,5.8,5.8,5.8,5.9,5.8
...,...,...,...,...,...,...,...,...
6870,1988,5.6,5.4,5.0,4.7,5.1,3.3,3.3
13353,1964,5.6,5.7,5.5,5.5,5.6,4.8,4.8
2870,1980,6.0,6.1,8.1,5.6,6.5,5.7,6.4
9524,1995,5.8,5.8,5.8,5.8,5.8,5.8,5.8


### Save the results to dataframe

In [46]:
df_final.to_csv('Predicted Ratings.csv',index=False)

# -------------------------------- Thank You --------------------------------