In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

# **MOVIE RATING PREDICTION**

## **Import Libraries**

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error,r2_score,accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor

## **Load Dataset**

In [3]:
df = pd.read_csv('/kaggle/input/imdb-india-movies/IMDb Movies India.csv',na_values=(' '),encoding='latin-1')
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


## **EDA**

In [4]:
print("1. Columns :\n",df.columns)
df.drop_duplicates(inplace=True)
print("\n2. Shape : ",df.shape)
print(df.info)

1. Columns :
 Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')

2. Shape :  (15503, 10)
<bound method DataFrame.info of                                      Name    Year Duration            Genre  \
0                                     NaN     NaN      NaN            Drama   
1      #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                             #Homecoming  (2021)   90 min   Drama, Musical   
3                                 #Yaaram  (2019)  110 min  Comedy, Romance   
4                       ...And Once Again  (2010)  105 min            Drama   
...                                   ...     ...      ...              ...   
15504                 Zulm Ko Jala Doonga  (1988)      NaN           Action   
15505                               Zulmi  (1999)  129 min    Action, Drama   
15506                           Zulmi Raj  (2005)      NaN           Action   
1

In [5]:
df.drop(columns=['Name','Duration'],axis=1,inplace=True)
df.head()

Unnamed: 0,Year,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,(2019),Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,(2021),"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,(2019),"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,(2010),Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [6]:
df.nunique()

Year         102
Genre        485
Rating        84
Votes       2034
Director    5938
Actor 1     4718
Actor 2     4891
Actor 3     4820
dtype: int64

## **Fill empty spaces with mean and mode values**

In [7]:
df['Rating'].fillna(df['Rating'].mean(),inplace=True)

In [8]:
cat = list(df.columns[df.dtypes=='object'])
print(cat)

['Year', 'Genre', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']


In [9]:
for i in cat:
    df[i].fillna(df[i].mode()[0],inplace=True)

In [10]:
df.isna().sum()

Year        0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

## **Converting "Year" and "Votes" to integer**

In [11]:
df['Year'] = df['Year'].astype(str)
df['Votes'] = df['Votes'].astype(str)
df['Year'] = df['Year'].str.strip('()').astype(int)
df['Votes'] = df['Votes'].str.replace(',','')
df['Votes'] = df['Votes'].str.replace('.','')
df['Votes'] = df['Votes'].str.strip('$')
df['Votes'] = df['Votes'].str.strip('M')
df['Votes'] = df['Votes'].astype(int)

## **Categorical Fields**

In [12]:
cat = list(df.columns[df.dtypes=='object'])
print(cat)

['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']


## **Final Encoding**

In [13]:
df['Genre encoded'] = round(df.groupby('Genre')['Rating'].transform('mean'),1)
df['Votes encoded'] = round(df.groupby('Votes')['Rating'].transform('mean'),1)
df['Director encoded'] = round(df.groupby('Director')['Rating'].transform('mean'),1)
df['Actor 1 encoded'] = round(df.groupby('Actor 1')['Rating'].transform('mean'),1)
df['Actor 2 encoded'] = round(df.groupby('Actor 2')['Rating'].transform('mean'),1)
df['Actor 3 encoded'] = round(df.groupby('Actor 3')['Rating'].transform('mean'),1)

df.drop(columns=['Genre','Votes','Director','Actor 1','Actor 2','Actor 3'],inplace=True)
df['Rating'] = round(df['Rating'],1)
df.head()

Unnamed: 0,Year,Rating,Genre encoded,Votes encoded,Director encoded,Actor 1 encoded,Actor 2 encoded,Actor 3 encoded
0,2019,5.8,6.0,5.8,5.8,5.8,5.5,5.8
1,2019,7.0,6.0,5.8,7.0,6.8,7.0,7.0
2,2021,5.8,6.3,5.8,5.8,6.2,6.8,5.8
3,2019,4.4,5.7,5.9,4.4,5.4,4.4,4.4
4,2010,5.8,6.0,5.8,6.3,6.8,5.8,5.5


## **Split the Dataset for Training and Testing**

In [14]:
x = df.drop(columns='Rating')
y = df[['Rating']]

In [15]:
xtrain, xtest,ytrain, ytest = train_test_split(x,y,test_size=0.25,random_state=42,shuffle=True)

In [16]:
print("X Train : ",xtrain.shape,"|  Y Train : ",ytrain.shape)
print("X Test  : ",xtest.shape," |  Y Test  : ",ytest.shape)

X Train :  (11627, 7) |  Y Train :  (11627, 1)
X Test  :  (3876, 7)  |  Y Test  :  (3876, 1)


## **Models**

In [17]:
models={'DecisionTree':DecisionTreeRegressor(),'RandomForest':RandomForestRegressor(),'KNeighbors':KNeighborsRegressor()}
print(models.items())

dict_items([('DecisionTree', DecisionTreeRegressor()), ('RandomForest', RandomForestRegressor()), ('KNeighbors', KNeighborsRegressor())])


## **Training, Testing and Results of the Models**

In [18]:
for name,model in models.items():
    m = model.fit(xtrain,ytrain)
    ypred = m.predict(xtest)
    msetest = mean_squared_error(ytest,ypred)
    r2test = (r2_score(ytest,ypred))
    cv = cross_val_score(m,xtest,ytest,cv=5,scoring='r2')
    scores = (cv.mean())*100

    print(f'Scores for {name}')
    print(f'     MSE : {msetest}')
    print(f'      R2 : {r2test}')
    print(f'      CV : {scores}')
    print("===============================")

Scores for DecisionTree
     MSE : 0.42115874899667466
      R2 : 0.5759772467013854
      CV : 55.570478069446324
Scores for RandomForest
     MSE : 0.2016404368968769
      R2 : 0.7969883483768625
      CV : 77.65003095855639
Scores for KNeighbors
     MSE : 0.28137512899896805
      R2 : 0.7167114367394103
      CV : 67.73174626853063


## **Conclusion**
**Based on the evaluation, the RandomForest model stands out with the highest cross-validated R² of 77.67, indicating it generalizes best to unseen data and offers consistent performance. While KNeighbors also shows good performance with a cross-validated R² of 67.73, it's less robust than RandomForest but still better than DecisionTree, which, despite being easier to interpret, has a lower cross-validated R² of 55.95 and is more prone to overfitting. Therefore, RandomForest is the recommended model for this dataset due to its superior predictive power and reliability.**