In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
file_path = r"C:\Users\abeer\Downloads\New folder\imdb-movies-dataset.csv"
df = pd.read_csv(file_path)

# Data Preprocessing
# Numeric columns are imputed with their own median; categorical columns get
# an explicit placeholder label.
df['Year'] = df['Year'].fillna(df['Year'].median()).astype(int)
df['Certificate'] = df['Certificate'].fillna('Not Rated')
df['Duration (min)'] = df['Duration (min)'].fillna(df['Duration (min)'].median())

# Keep only the first comma-separated genre; rows without one become 'Unknown'.
df['Genre'] = df['Genre'].str.split(',').str[0].fillna('Unknown')

# Votes is expected as text with thousands separators (e.g. "1,234"). If
# read_csv already parsed the column as numbers there are no commas to strip,
# and the .str accessor would raise AttributeError, so only clean text columns.
votes = df['Votes']
if not pd.api.types.is_numeric_dtype(votes):
    votes = votes.str.replace(',', '', regex=False)
df['Votes'] = votes.astype(float)
df['Votes'] = df['Votes'].fillna(df['Votes'].median())

# NOTE(review): 'Rating' is used as the prediction target further down in this
# script; median-filling it means some rows are trained/evaluated on a
# fabricated label. Consider dropping those rows instead - confirm intent.
df['Rating'] = df['Rating'].fillna(df['Rating'].median())
df['Metascore'] = df['Metascore'].fillna(df['Metascore'].median())

# Encoding categorical features
# Unlike Certificate and Genre, Director and Cast receive no fillna earlier in
# this script. Give missing entries an explicit label so the encoder never
# sees NaN (depending on the scikit-learn version NaN is either rejected or
# silently turned into its own class).
df['Director'] = df['Director'].fillna('Unknown')
# Only the first comma-separated cast entry is kept as the feature.
df['Cast'] = df['Cast'].str.split(',').str[0].fillna('Unknown')

# One encoder per column, so every fitted mapping (classes_) stays available
# for inverse_transform instead of being overwritten by the next fit.
label_encoders = {}
for column in ['Certificate', 'Genre', 'Director', 'Cast']:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Backward-compatible name: previously a single `label_encoder` was reused and
# ended up fitted on the Cast column.
label_encoder = label_encoders['Cast']

# Feature Selection
target = 'Rating'
features = [
    'Year', 'Certificate', 'Duration (min)', 'Genre',
    'Director', 'Cast', 'Votes', 'Metascore',
]

# Response vector and design matrix taken from the prepared DataFrame.
y = df[target]
X = df[features]

# Train-test split: 20% of the rows are held out, with a fixed seed so the
# partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
)

# Standardize the data: the scaler is fitted on the training rows only and
# the same fitted transform is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Model Building
# Baseline random forest with default hyperparameters and a fixed seed.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
# RMSE is the square root of the held-out mean squared error.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Hyperparameter Tuning
# max_features: the legacy 'auto' option was deprecated in scikit-learn 1.1
# and removed in 1.3. For RandomForestRegressor it meant "use all features",
# which is spelled 1.0 here and is accepted by old and new releases alike.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [10, 20, 30, None]
}

# 5-fold cross-validated search over the grid, scored by negative MSE
# (higher is better), using all available CPU cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# With GridSearchCV's default refit=True this is the best parameter
# combination refitted on the whole training set.
best_model = grid_search.best_estimator_

# Final Model Evaluation
y_pred_best = best_model.predict(X_test)
best_mse = mean_squared_error(y_test, y_pred_best)
best_rmse = np.sqrt(best_mse)
best_r2 = r2_score(y_test, y_pred_best)

print(f'Best RMSE: {best_rmse}')
print(f'Best R^2: {best_r2}')

RMSE: 0.6930421585300566
R^2: 0.5569706416491684
