In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import joblib

In [19]:
# Load the dataset
file_path = "IMDb Movies India.csv"
df = pd.read_csv(file_path, encoding='latin1')

In [20]:
# Display basic information
display(df.info())
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


None

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [31]:
if df['Year'].dtype == 'O':  # Check if 'Year' is an object (string)
    df['Year'] = df['Year'].str.extract('(\\d{4})').astype(float)

if df['Duration'].dtype == 'O':  # Check if 'Duration' is an object (string)
    df['Duration'] = df['Duration'].str.extract('(\d+)').astype(float)

In [32]:
# Clean 'Votes' column
df['Votes'] = df['Votes'].astype(str).str.replace(',', '', regex=True)  # Remove commas
df['Votes'] = df['Votes'].apply(lambda x: x if x.replace('.', '', 1).isdigit() else np.nan)  # Keep only numeric values
df['Votes'] = df['Votes'].astype(float)

In [33]:
# Handling missing values
df.fillna({'Rating': df['Rating'].median(), 'Votes': df['Votes'].median(), 'Year': df['Year'].median(), 'Duration': df['Duration'].median()}, inplace=True)
df.dropna(subset=['Genre', 'Director'], inplace=True)

In [34]:
# Feature Engineering: Director Success Rate
director_avg_rating = df.groupby('Director')['Rating'].mean().to_dict()
df['Director_Success'] = df['Director'].map(director_avg_rating)
df['Director_Success'].fillna(df['Director_Success'].median(), inplace=True)

In [35]:
# Encoding categorical variables
le = LabelEncoder()
df['Genre'] = le.fit_transform(df['Genre'])
df['Director'] = le.fit_transform(df['Director'])

In [36]:
# Selecting features
features = ['Year', 'Duration', 'Votes', 'Genre', 'Director_Success']
X = df[features]
y = df['Rating']


In [37]:
# Remove rows where y is NaN
X = X[~y.isna()]
y = y.dropna()


In [38]:
# Ensure no NaN values using an imputer
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(X)


In [39]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [40]:
# Scaling the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [41]:
# Train a predictive model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [42]:
# Predictions
y_pred = model.predict(X_test)


In [43]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 0.35742680641745467
R-squared: 0.6324318966479696


In [44]:
# Save the model
joblib.dump(model, 'movie_rating_model.pkl')


['movie_rating_model.pkl']