In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import re
from sklearn.preprocessing import LabelEncoder


In [2]:
df = pd.read_csv("IMDb Movies India.csv", encoding="latin1")
print("Shape:", df.shape)
print("Columns:", df.columns)

Shape: (15509, 10)
Columns: Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')


In [3]:
df = df.dropna(subset=['Rating'])  # drop movies with no rating
df['Genre'] = df['Genre'].fillna("Unknown")
df['Director'] = df['Director'].fillna("Unknown")
df['Actor 1'] = df['Actor 1'].fillna("Unknown")
df['Actor 2'] = df['Actor 2'].fillna("Unknown")
df['Actor 3'] = df['Actor 3'].fillna("Unknown")

In [4]:
df['Year'] = df['Title'].str.extract(r'\((\d{4})\)').astype(float)

KeyError: 'Title'

In [None]:
label_cols = ["Genre", "Director", "Actor 1", "Actor 2", "Actor 3"]
encoder = LabelEncoder()
for col in label_cols:
    df[col] = encoder.fit_transform(df[col])

In [None]:
X = df[['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3', 'Year']]
y = df['Rating']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 Score:", r2_score(y_test, y_pred))