In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler

plt.style.use("ggplot")

import warnings
warnings.filterwarnings("ignore")

In [2]:
#Data acquisition of the movies dataset
df=pd.read_csv('/content/drive/MyDrive/Codsoft/movie rating prediction/movies dataset.csv', encoding='ISO-8859-1').drop(columns='Name')
df.dropna(inplace=True)
df.head()

Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,(1997),147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,(2005),142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,(2012),82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia


In [3]:
# Find info about the columns
print(df.shape)
print("-"*60)
print(df.isna().sum())
print("-"*60)
print(df.duplicated().sum())
print("-"*60)
df.info()

(5659, 9)
------------------------------------------------------------
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64
------------------------------------------------------------
0
------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Index: 5659 entries, 1 to 15508
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Year      5659 non-null   object 
 1   Duration  5659 non-null   object 
 2   Genre     5659 non-null   object 
 3   Rating    5659 non-null   float64
 4   Votes     5659 non-null   object 
 5   Director  5659 non-null   object 
 6   Actor 1   5659 non-null   object 
 7   Actor 2   5659 non-null   object 
 8   Actor 3   5659 non-null   object 
dtypes: float64(1), object(8)
memory usage: 442.1+ KB


In [4]:
df.nunique()

Year          91
Duration     174
Genre        376
Rating        83
Votes       2027
Director    2431
Actor 1     1960
Actor 2     2321
Actor 3     2556
dtype: int64

In [5]:
# Handle missing values
df.fillna(method='ffill', inplace=True)

In [6]:
df.drop(columns=['Duration'],axis=1,inplace=True)
df.head()

Unnamed: 0,Year,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,(2019),Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,(2019),"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,(1997),"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,(2005),"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,(2012),"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia


In [7]:

cat = list(df.columns[df.dtypes=='object'])

In [8]:

mn = df['Rating'].mean()
df['Rating'].fillna(mn,inplace=True)

for i in cat:
    m = df[i].mode()[0]
    df[i].fillna(m,inplace=True)

In [9]:
df.isna().sum()

Year        0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

In [10]:
df.nunique()

Year          91
Genre        376
Rating        83
Votes       2027
Director    2431
Actor 1     1960
Actor 2     2321
Actor 3     2556
dtype: int64

In [11]:
df['Year'] = df['Year'].astype(str).str.replace('(', '').str.replace(')', '').astype(float)

In [12]:
# Check if the 'Director' column exists in your DataFrame
print(df.columns)

## performing mean encoding
if 'Genre' in df.columns:
    df['Genre encoded'] = round(df.groupby('Genre')['Rating'].transform('mean'),1)
else:
    print("The 'Genre' column is not present in the DataFrame. Please check your previous steps.")

# Check if 'Votes' column exists before encoding
if 'Votes' in df.columns:
    df['Votes encoded'] = round(df.groupby('Votes')['Rating'].transform('mean'),1)
else:
    print("The 'Votes' column is not present in the DataFrame. Please check your previous steps.")

# Check if 'Director' column exists before encoding
if 'Director' in df.columns:
    df['Director encoded'] = round(df.groupby('Director')['Rating'].transform('mean'),1)
else:
    print("The 'Director' column is not present in the DataFrame. Please check your previous steps.")

# Similarly, check for other columns before encoding
if 'Actor 1' in df.columns:
    df['Actor 1 encoded'] = round(df.groupby('Actor 1')['Rating'].transform('mean'),1)
else:
    print("The 'Actor 1' column is not present in the DataFrame. Please check your previous steps.")

if 'Actor 2' in df.columns:
    df['Actor 2 encoded'] = round(df.groupby('Actor 2')['Rating'].transform('mean'),1)
else:
    print("The 'Actor 2' column is not present in the DataFrame. Please check your previous steps.")

if 'Actor 3' in df.columns:
    df['Actor 3 encoded'] = round(df.groupby('Actor 3')['Rating'].transform('mean'),1)
else:
    print("The 'Actor 3' column is not present in the DataFrame. Please check your previous steps.")

# Keep this line commented to retain the columns for encoding
# df.drop(columns=['Genre','Votes','Director','Actor 1','Actor 2','Actor 3'],inplace=True)
df['Rating'] = round(df['Rating'],1)
df.head()

Index(['Year', 'Genre', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2',
       'Actor 3'],
      dtype='object')


Unnamed: 0,Year,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,Genre encoded,Votes encoded,Director encoded,Actor 1 encoded,Actor 2 encoded,Actor 3 encoded
1,2019.0,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,6.4,5.6,7.0,6.8,7.0,7.0
3,2019.0,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,5.7,5.6,4.4,5.4,4.4,4.4
5,1997.0,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,6.2,4.8,5.3,4.8,5.8,5.9
6,2005.0,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,6.8,7.4,7.4,5.4,6.9,6.5
8,2012.0,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,5.5,6.6,5.6,5.6,5.9,5.6


In [13]:
#Separate X and Y features
X = df.drop(columns='Rating')
Y =df[['Rating']]
X.head()

Unnamed: 0,Year,Genre,Votes,Director,Actor 1,Actor 2,Actor 3,Genre encoded,Votes encoded,Director encoded,Actor 1 encoded,Actor 2 encoded,Actor 3 encoded
1,2019.0,Drama,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,6.4,5.6,7.0,6.8,7.0,7.0
3,2019.0,"Comedy, Romance",35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,5.7,5.6,4.4,5.4,4.4,4.4
5,1997.0,"Comedy, Drama, Musical",827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,6.2,4.8,5.3,4.8,5.8,5.9
6,2005.0,"Drama, Romance, War",1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,6.8,7.4,7.4,5.4,6.9,6.5
8,2012.0,"Horror, Mystery, Thriller",326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,5.5,6.6,5.6,5.6,5.9,5.6


In [14]:
Y.head()

Unnamed: 0,Rating
1,7.0
3,4.4
5,4.7
6,7.4
8,5.6


In [15]:
#Split the dataset into training data and testing data
xtrain, xtest, ytrain, ytest = train_test_split(X,Y,test_size=0.25,random_state=21)

In [16]:
xtrain.shape

(4244, 13)

In [17]:
xtest.shape

(1415, 13)

In [18]:
ytrain.shape

(4244, 1)

In [19]:
ytest.shape

(1415, 1)

In [20]:
#BUILD THE MODEL
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV

In [21]:
dct = {
    'Linear':LinearRegression(),
    'DecisionTree':DecisionTreeRegressor(),
    'RandomForest':RandomForestRegressor(),
    'GradientBoosting':GradientBoostingRegressor(),
    'KNN':KNeighborsRegressor(),
    'SVR':SVR()
}

In [22]:
dct.items()

dict_items([('Linear', LinearRegression()), ('DecisionTree', DecisionTreeRegressor()), ('RandomForest', RandomForestRegressor()), ('GradientBoosting', GradientBoostingRegressor()), ('KNN', KNeighborsRegressor()), ('SVR', SVR())])

In [23]:
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import cross_val_score

In [24]:
# Assuming 'xtrain' and 'xtest' are Pandas DataFrames
xtrain = xtrain.select_dtypes(include=['number'])
xtest = xtest.select_dtypes(include=['number'])

train_mse = []
train_r2 = []
test_mse = []
test_r2 = []
train_cv = []

for name,model in dct.items():
    # fit the model
    m = model.fit(xtrain,ytrain)
    ypred_train = m.predict(xtrain)
    ypred_test = m.predict(xtest)
    # calculate MSE
    mse_train = mean_squared_error(ytrain,ypred_train)
    mse_test = mean_squared_error(ytest,ypred_test)
    # caluclate R2
    r2_train = (r2_score(ytrain,ypred_train))*100
    r2_test = (r2_score(ytest,ypred_test))*100
    # calculate cross validated scores
    cv = cross_val_score(m,xtrain,ytrain,cv=5,scoring='r2')
    scores = (cv.mean())*100

    # add these values to the respective list to compare the output
    train_mse.append(mse_train)
    train_r2.append(r2_train)
    test_mse.append(mse_test)
    test_r2.append(r2_test)
    train_cv.append(scores)

    # print the results
    print(f'Scores for {name}')
    print("Training Scores")
    print(f'MSE:{train_mse}')
    print(f'R2:{train_r2}')
    print("Testing Scores")
    print(f'MSE:{test_mse}')
    print(f'R2:{test_r2}')
    print(f'CV:{train_cv}')
    print("===============================")

Scores for Linear
Training Scores
MSE:[0.2940698193885788]
R2:[84.61563013465316]
Testing Scores
MSE:[0.30938845577631674]
R2:[83.66104188880601]
CV:[84.48837854948566]
Scores for DecisionTree
Training Scores
MSE:[0.2940698193885788, 1.0775042554083536e-31]
R2:[84.61563013465316, 100.0]
Testing Scores
MSE:[0.30938845577631674, 0.48863604240282693]
R2:[83.66104188880601, 74.19488775556782]
CV:[84.48837854948566, 72.16138457343646]
Scores for RandomForest
Training Scores
MSE:[0.2940698193885788, 1.0775042554083536e-31, 0.03427248515551365]
R2:[84.61563013465316, 100.0, 98.20702243795948]
Testing Scores
MSE:[0.30938845577631674, 0.48863604240282693, 0.2393426268551237]
R2:[83.66104188880601, 74.19488775556782, 87.36019692591145]
CV:[84.48837854948566, 72.16138457343646, 86.7359234846682]
Scores for GradientBoosting
Training Scores
MSE:[0.2940698193885788, 1.0775042554083536e-31, 0.03427248515551365, 0.21433380097384866]
R2:[84.61563013465316, 100.0, 98.20702243795948, 88.78704902229286]
T

In [25]:
res = {'Name':list(dct.keys()),
       'MSE Training Scores':train_mse,
       'MSE Testing Scores':test_mse,
       'R2 Training Scores':train_r2,
       'R2 Testing Scores':test_r2,
       'CV Training Scores':train_cv}

In [26]:
df_res = pd.DataFrame(res)
df_res.sort_values('CV Training Scores',ascending=False)

Unnamed: 0,Name,MSE Training Scores,MSE Testing Scores,R2 Training Scores,R2 Testing Scores,CV Training Scores
2,RandomForest,0.03427249,0.239343,98.207022,87.360197,86.735923
3,GradientBoosting,0.2143338,0.256375,88.787049,86.460712,86.346061
0,Linear,0.2940698,0.309388,84.61563,83.661042,84.488379
4,KNN,0.2283073,0.347898,88.056023,81.627336,81.079652
1,DecisionTree,1.0775040000000002e-31,0.488636,100.0,74.194888,72.161385
5,SVR,1.85721,1.841699,2.839364,2.738937,2.19277


In [27]:
#Lets consider Random Forest Regressor as its giving good results
params = {'n_estimators':[200,300],
          'max_depth':[5,6,7,8],
          'min_samples_split':[2,3,4,5,6],
          'criterion':['squared_error','absolute_error']}


In [28]:
rfr = RandomForestRegressor()
rscv = RandomizedSearchCV(rfr,params,cv=3,scoring='neg_mean_squared_error')
rscv.fit(xtrain,ytrain)

In [29]:
rscv.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'max_depth': 8,
 'criterion': 'squared_error'}

In [37]:
best_rfr = rscv.best_estimator_
best_rfr

In [38]:
from xgboost import XGBRegressor

In [39]:
model = XGBRegressor()
model.fit(xtrain,ytrain)

In [41]:
model.score(xtrain,ytrain)

0.9741072782767883

In [42]:
model.score(xtest,ytest)

0.8584583505830599

In [43]:
from sklearn.model_selection import GridSearchCV

In [44]:
params = {'n_estimators':[200,300,500,600,800,1000],
          'learning_rate':[0.05,0.1,0.2,0.3],
          'max_depth':[5,6,7,8,9,10],
          'min_child_weight':[1,2,3],
          'objective':['reg:squarederror'],
          'gamma':[0.1,0.2,0.3,0.4]}

In [45]:
gscv = GridSearchCV(model,params,scoring='neg_mean_squared_error',cv=5)
gscv.fit(xtrain,ytrain)

In [46]:
gscv.best_params_

{'gamma': 0.1,
 'learning_rate': 0.05,
 'max_depth': 6,
 'min_child_weight': 1,
 'n_estimators': 300,
 'objective': 'reg:squarederror'}

In [47]:
best_xgb = gscv.best_estimator_
best_xgb

In [48]:
best_xgb.score(xtrain,ytrain)

0.9304838598400709

In [49]:
best_xgb.score(xtest,ytest)

0.8737523506209737

In [50]:
params1 = {'subsample':[0.5,0.6,0.7,0.8,0.9,1],
           'colsample_bytree':[0.5,0.6,0.7,0.8,0.9,1]}

In [51]:
gscv1 = GridSearchCV(best_xgb,params1,cv=5,scoring='neg_mean_squared_error')
gscv1.fit(xtrain,ytrain)

In [52]:
gscv1.best_params_

{'colsample_bytree': 0.6, 'subsample': 0.9}

In [53]:
best_xgb2 = gscv1.best_estimator_
best_xgb2

In [54]:
best_xgb2.score(xtrain,ytrain)

0.9401482279025377

In [55]:
best_xgb2.score(xtest,ytest)

0.8771893548100101

In [56]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [57]:
def eval_model(model,xtrain,ytrain):
    # Fit the model
    model.fit(xtrain,ytrain)
    # Predict the scores
    ypred_test = model.predict(xtest)
    # Calculate MSE,RMSE,MAE,R2 scores
    mse = mean_squared_error(ytest,ypred_test)
    rmse = mse**(1/2)
    mae = mean_absolute_error(ytest,ypred_test)
    r2 = r2_score(ytest,ypred_test)
    return mse,rmse,mae,r2

In [58]:
(MSE,RMSE,MAE,r2) = eval_model(best_rfr,xtrain,ytrain)
print(f'Evaluation Metrics: \nMSE: {MSE}\nRMSE:{RMSE}\nMAE:{MAE}\nR2:{r2}')

Evaluation Metrics: 
MSE: 0.25290091123151476
RMSE:0.5028925444183029
MAE:0.3301853603719677
R2:0.8664417718972042


In [59]:
(MSE,RMSE,MAE,r2) = eval_model(best_xgb2,xtrain,ytrain)
print(f'Evaluation Metrics: \nMSE: {MSE}\nRMSE:{RMSE}\nMAE:{MAE}\nR2:{r2}')

Evaluation Metrics: 
MSE: 0.2325496865200513
RMSE:0.48223405781845324
MAE:0.3311009914040987
R2:0.8771893548100101


In [60]:
ypred_test = best_xgb2.predict(xtest)
ypred_test[:10]

array([5.728131 , 4.059104 , 7.0654   , 5.2473345, 4.092805 , 6.2569747,
       4.3712974, 4.9647393, 4.4704857, 7.19767  ], dtype=float32)

In [61]:
ytest.head(10)

Unnamed: 0,Rating
2273,5.8
12723,4.1
2585,7.0
5918,5.2
13675,4.2
1515,6.4
15265,4.9
6981,5.0
12748,5.1
10705,8.0


In [63]:
df_final = xtest
df_final['Predicted_Rating'] = ypred_test
df_final

Unnamed: 0,Year,Genre encoded,Votes encoded,Director encoded,Actor 1 encoded,Actor 2 encoded,Actor 3 encoded,Predicted_Rating
2273,2010.0,5.8,5.5,5.7,5.8,5.8,5.8,5.728131
12723,2006.0,5.8,4.1,4.7,5.2,5.0,6.0,4.059104
2585,1971.0,5.8,5.2,7.3,6.4,6.2,7.4,7.065400
5918,2006.0,6.0,5.2,5.1,5.5,5.8,5.5,5.247334
13675,2021.0,5.6,5.3,4.2,4.2,4.2,4.2,4.092805
...,...,...,...,...,...,...,...,...
11594,2018.0,6.1,5.8,5.6,5.8,6.4,6.4,5.987141
15162,2020.0,5.6,5.2,5.1,5.1,5.1,5.1,4.909695
13914,2015.0,5.6,5.5,4.0,5.2,4.0,4.0,4.081908
15453,1986.0,6.4,5.5,3.9,3.9,3.9,5.6,3.847396


In [64]:
df_final.to_csv('Predicted Ratings.csv',index=False)