In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# Read the raw movie table; the file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    'IMDb_Movies_India.csv',
    encoding='ISO-8859-1',
)


In [3]:
# Peek at five randomly chosen rows to see the raw column formats.
df.sample(n=5)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
15141,Yaan,(2014),153 min,"Action, Thriller",3.5,341.0,Ravi K. Chandran,Jiiva,Thulasi Nair,Nassar
6133,Inspector,(1957),,Action,5.6,15.0,Shakti Samanta,Ashok Kumar,Geeta Bali,Mehmood
11330,Q,(2017),96 min,"Drama, Thriller",6.0,29.0,Sanjeev Gupta,Heeba Shah,Ashwin Shukla,
8341,Love Scope,(2020),,"Comedy, Mystery, Romance",,,Sunny Kumar,Shakti Kapoor,Smita Jaykar,Swati Kapoor
7066,Kachche Rishtey,(2000),,Thriller,,,Abid,Dilip Advani,Sushil Chowdhary,Chetana Gupta


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Rating
count,7919.0
mean,5.841621
std,1.381777
min,1.1
25%,4.9
50%,6.0
75%,6.8
max,10.0


In [5]:
# Count the missing entries in each column
print(df.isna().sum())

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64


In [6]:
# Clean the raw columns, then fill the remaining missing feature values.

# Rows without a rating have no label to learn from. Mean-filling the target
# (7,590 rows per the null counts above) would fabricate labels and distort
# both training and evaluation, so those rows are dropped instead.
# .copy() gives an independent frame so the assignments below are unambiguous.
df = df.dropna(subset=['Rating']).copy()

# Year is stored like "(2014)" and Duration like "153 min"; left as text they
# would be one-hot encoded as unrelated categories. Extract the digits and
# convert to numbers; anything unparseable becomes NaN and is imputed below.
# The dtype guard makes the cell safe to re-run on already-converted columns.
text_number_patterns = {'Year': r'(\d{4})', 'Duration': r'(\d+)'}
for col, pattern in text_number_patterns.items():
    if not pd.api.types.is_numeric_dtype(df[col]):
        digits = df[col].astype(str).str.extract(pattern, expand=False)
        df[col] = pd.to_numeric(digits, errors='coerce')

# Votes is not numeric according to describe() above -- presumably because of
# thousands separators or other stray text; TODO verify against the raw CSV.
if not pd.api.types.is_numeric_dtype(df['Votes']):
    votes_text = df['Votes'].astype(str).str.replace(',', '', regex=False)
    df['Votes'] = pd.to_numeric(votes_text, errors='coerce')

# Impute numerical feature columns with the mean.
# The target is excluded so the fitted imputer only ever expects feature
# columns, which is what new, unlabeled data will provide.
numerical_features = df.select_dtypes(include='number').columns.drop('Rating')
imputer_num = SimpleImputer(strategy='mean')
df[numerical_features] = imputer_num.fit_transform(df[numerical_features])

# Impute categorical (non-numeric) columns with the most frequent value.
categorical_features = df.select_dtypes(exclude='number').columns
imputer_cat = SimpleImputer(strategy='most_frequent')
df[categorical_features] = imputer_cat.fit_transform(df[categorical_features])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
# Re-count missing entries per column after imputation
print(df.isna().sum())

Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64


In [8]:
# 'Rating' is the prediction target; every remaining column is a feature.
y = df['Rating']
X = df.drop(columns=['Rating'])

In [9]:
# One-hot encode the categorical columns, dropping the first level of each
# so the indicator columns are not redundant.
X = pd.get_dummies(data=X, drop_first=True)

In [10]:
# Standardise each feature column (zero mean, unit variance).
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [11]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# Split the unscaled features first, then refit `scaler` on the training rows
# only: the scaler from the previous cell was fit on every row, which lets
# test-set means/variances leak into the training features. Reusing the same
# `scaler` object means later cells transform new data with these statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [None]:
# K-Nearest Neighbors Regression
# Choose the number of neighbours by 5-fold cross-validated R^2 on the
# training split, then score the winning model on the held-out test split.
param_grid_knn = {'n_neighbors': [3, 5, 7, 9]}
knn = KNeighborsRegressor()
grid_knn = GridSearchCV(estimator=knn, param_grid=param_grid_knn, scoring='r2', cv=5)
grid_knn.fit(X_train, y_train)

best_knn = grid_knn.best_estimator_
y_pred_knn = best_knn.predict(X_test)

rmse_knn = np.sqrt(mean_squared_error(y_test, y_pred_knn))
r2_knn = r2_score(y_test, y_pred_knn)

print('K-Nearest Neighbors Regression:')
print(f'Best Params: {grid_knn.best_params_}')
print(f'RMSE: {rmse_knn}')
print(f'R2 Score: {r2_knn}')

In [None]:
# XGBoost Regression
# Search tree count, learning rate and depth by 5-fold cross-validated R^2
# on the training split, then score the winning model on the test split.
param_grid_xgb = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
)
xgb = XGBRegressor()
grid_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, scoring='r2', cv=5)
grid_xgb.fit(X_train, y_train)

best_xgb = grid_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2_xgb = r2_score(y_test, y_pred_xgb)

print('XGBoost Regression:')
print(f'Best Params: {grid_xgb.best_params_}')
print(f'RMSE: {rmse_xgb}')
print(f'R2 Score: {r2_xgb}')

In [None]:
# K-Nearest Neighbors plot: predicted vs. actual ratings on the test split.
# With the inline backend each cell renders its own figure, so claiming slot 2
# of a shared 1x3 grid (plt.subplot(1, 3, 2)) draws a lone, squeezed panel.
# Use a dedicated figure with an explicit Axes instead.
fig_knn, ax_knn = plt.subplots(figsize=(6, 6))
ax_knn.scatter(y_test, y_pred_knn, alpha=0.3)

# Dashed diagonal = perfect prediction, spanning the full range of the target.
rating_range = [y.min(), y.max()]
ax_knn.plot(rating_range, rating_range, 'r--')

ax_knn.set_xlabel('Actual')
ax_knn.set_ylabel('Predicted')
ax_knn.set_title('K-Nearest Neighbors')

fig_knn.tight_layout()
plt.show()

In [None]:
# XGBoost plot: predicted vs. actual ratings on the test split.
# With the inline backend each cell renders its own figure, so claiming slot 3
# of a shared 1x3 grid (plt.subplot(1, 3, 3)) draws a lone, squeezed panel.
# Use a dedicated figure with an explicit Axes instead.
fig_xgb, ax_xgb = plt.subplots(figsize=(6, 6))
ax_xgb.scatter(y_test, y_pred_xgb, alpha=0.3)

# Dashed diagonal = perfect prediction, spanning the full range of the target.
rating_range = [y.min(), y.max()]
ax_xgb.plot(rating_range, rating_range, 'r--')

ax_xgb.set_xlabel('Actual')
ax_xgb.set_ylabel('Predicted')
ax_xgb.set_title('XGBoost')

fig_xgb.tight_layout()
plt.show()


In [None]:
# Predicting new data
# Creating a new sample input (replace with actual new data)
new_data = pd.DataFrame({
    'Name': ['New Movie'],
    'Year': [2024],
    'Duration': [120],
    'Genre': ['Action'],
    'Votes': [5000],
    'Director': ['John Doe'],
    'Actor 1': ['Actor A'],
    'Actor 2': ['Actor B'],
    'Actor 3': ['Actor C']
})

# Handling missing values in new data.
# The numeric imputer must receive exactly the columns it was fitted on. Any
# fitted column that the new sample lacks (e.g. the target, which is unknown
# for new data) is padded with NaN for the transform and discarded afterwards,
# so only genuine feature columns are written back.
num_cols = list(numerical_features)
present_num_cols = [col for col in num_cols if col in new_data.columns]
if present_num_cols:
    num_imputed = pd.DataFrame(
        imputer_num.transform(new_data.reindex(columns=num_cols)),
        columns=num_cols,
        index=new_data.index,
    )
    new_data[present_num_cols] = num_imputed[present_num_cols]

cat_cols = list(categorical_features)
new_data[cat_cols] = imputer_cat.transform(new_data[cat_cols])

# Convert categorical features to dummy variables.
# Keep every level here: on a single row, drop_first=True would discard the
# only level of each categorical column, leaving no indicator to match the
# training columns and silently turning every categorical feature into zeros.
new_data = pd.get_dummies(new_data, drop_first=False)

# Align the new data with the training data to ensure all columns are present.
# Levels the training encoding dropped or never saw are removed by the reindex;
# training columns absent from this sample are filled with 0.
new_data = new_data.reindex(columns=X.columns, fill_value=0)

# Scaling the new data
new_data_scaled = scaler.transform(new_data)

# Making predictions.
# No SVR model is fitted anywhere in this notebook (`best_svr` is never
# defined), so only the KNN and XGBoost models are used here.
new_pred_knn = best_knn.predict(new_data_scaled)
new_pred_xgb = best_xgb.predict(new_data_scaled)

# Print predictions
print('New Data Predictions:')
print(f'KNN Prediction: {new_pred_knn[0]}')
print(f'XGBoost Prediction: {new_pred_xgb[0]}')