In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

In [9]:
# Load the dataset
data = pd.read_csv('IMDb Movies India.csv', encoding='ISO-8859-1')

# Data Preprocessing

In [10]:
# Convert 'Year' to numeric and handle missing values
data['Year'] = pd.to_numeric(data['Year'], errors='coerce')

In [11]:
# Convert 'Votes' to numeric and handle missing values
data['Votes'] = pd.to_numeric(data['Votes'].str.replace(',', ''), errors='coerce')

In [12]:
# Encode categorical variables
label_encoders = {}
for column in ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column].astype(str))

In [13]:
# Handle missing values
numeric_features = ['Year', 'Votes']
categorical_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numeric_features),
        ('cat', SimpleImputer(strategy='most_frequent'), categorical_features)
    ])

# Feature Engineering

In [14]:
# Model Selection
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [15]:
# Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

In [16]:
# Train/Test Split
X = data.drop(columns=['Name', 'Rating', 'Duration'])
y = data['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Identify missing values in the target variable
missing_target_indices = y_train.isnull()

# Impute missing values using mean
imputer = SimpleImputer(strategy='mean')
y_train_imputed = imputer.fit_transform(y_train.values.reshape(-1, 1))

# Convert back to 1D array
y_train_imputed = y_train_imputed.flatten()

# Update y_train with imputed values
y_train.loc[missing_target_indices] = y_train_imputed[missing_target_indices]


In [20]:
# Model Training
pipeline.fit(X_train, y_train)



In [22]:
# Identify missing values in the test set
missing_test_indices = y_test.isnull()

# Impute missing values using mean (assuming you've used mean imputation for training set)
y_test_imputed = imputer.transform(y_test.values.reshape(-1, 1))

# Convert back to 1D array
y_test_imputed = y_test_imputed.flatten()

# Update y_test with imputed values
y_test.loc[missing_test_indices] = y_test_imputed[missing_test_indices]

# Now evaluate your model
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 0.8007063129565821




In [29]:
# Assuming y_test and y_pred are your actual and predicted ratings, respectively
r_squared = r2_score(y_test, y_pred)
print('R-squared:', r_squared)

R-squared: 0.16833196461363387
