# import libraries

In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset

In [38]:
data = pd.read_csv("C:\\Users\\PMLS\\.anaconda\\IMDb Movies India.csv", encoding="latin1")

# Explore the data

In [39]:
print(data.head())

                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

In [40]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB
None


#  Handle missing values

In [41]:
data.fillna(data.mean(numeric_only=True), inplace=True)
data.dropna(inplace=True)

In [42]:
data['Votes'] = data['Votes'].astype(str).str.replace(',', '').astype(float)


# Select relevant features

In [43]:
features=['Genre','Votes','Director','Actor 1','Actor 2','Actor 3']
target='Rating'

In [44]:
x=data[features]
y=data[target]

# Preprocess categorical and numerical features

In [45]:
categorical_features=['Genre','Votes','Director','Actor 1','Actor 2','Actor 3']
numerical_features=['Votes']

# One-hot encode categorical features

In [46]:
encoder=OneHotEncoder(handle_unknown='ignore',sparse=False)
x_encoded=pd.DataFrame(encoder.fit_transform(x[categorical_features]),index=x.index)



# Scale numerical features

In [47]:

scaler = StandardScaler()
x_scaled = pd.DataFrame(scaler.fit_transform(x[numerical_features]), index=x.index)

# Combine processed features

In [48]:
x_processed=pd.concat([x_encoded,x_scaled],axis=1)

# Train-test split

In [49]:
x_train,x_test,y_train,y_test=train_test_split(x_processed,y,test_size=0.2,random_state=42)

# Train a Random Forest Regressor

In [50]:
model=RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(x_train,y_train)

# Predictions

In [51]:
y_pred=model.predict(x_test)

In [52]:
rmse=np.sqrt(mean_squared_error(y_test,y_pred))
r2=r2_score(y_test,y_pred)
print(f"RMSE: {rmse}")
print(f"R^2:{r2}")

RMSE: 1.2220742937748508
R^2:0.19348146131049193


In [53]:
y_pred

array([5.688, 6.607, 5.946, ..., 6.117, 6.751, 6.241])