# Loading the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Load the data

In [None]:
# Location of the raw IMDb India movies CSV.
csv_path = '/content/IMDb Movies India.csv'
# Read the file with the ISO-8859-1 codec into the working DataFrame MP.
MP = pd.read_csv(csv_path, encoding='ISO-8859-1')

# Data Pre-Processing


**Handling Missing Values**

In [None]:
# Count the missing entries in every column and print the per-column totals.
missing_counts = MP.isna().sum()
print(missing_counts)

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64


In [None]:
# Drop rows with missing ratings, since Rating is the target variable.
# The explicit .copy() makes MP an independent DataFrame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
MP = MP.dropna(subset=['Rating']).copy()

In [None]:
# Fill the remaining gaps: categorical columns get an 'Unknown' placeholder,
# Duration is parsed to a number and imputed with its median.

# Work on an explicit copy so the assignments below never target a possible
# view of another frame (the cause of SettingWithCopyWarning).
MP = MP.copy()

MP['Genre'] = MP['Genre'].fillna('Unknown')
MP['Director'] = MP['Director'].fillna('Unknown')

# Parse the first run of digits out of Duration.
# - The raw string avoids the invalid "\d" escape sequence.
# - expand=False returns a Series directly, so no .squeeze() is needed
#   (squeeze would collapse a one-row frame to a scalar).
# - The dtype guard keeps the cell re-runnable: .str raises AttributeError
#   once the column has already been converted to a numeric dtype.
if not pd.api.types.is_numeric_dtype(MP['Duration']):
    MP['Duration'] = pd.to_numeric(
        MP['Duration'].str.extract(r'(\d+)', expand=False), errors='coerce')

# Impute unparseable / missing durations with the median of the parsed values.
MP['Duration'] = MP['Duration'].fillna(MP['Duration'].median())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MP['Genre'] = MP['Genre'].fillna('Unknown')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MP['Director'] = MP['Director'].fillna('Unknown')


# Data Cleaning

In [None]:
# Remove 'min' from 'Duration' and convert to numeric.
# Only parse when the column is still non-numeric; an earlier cell may already
# have converted it. The raw string avoids the invalid "\d" escape, and
# expand=False yields a Series without needing .squeeze().
if not pd.api.types.is_numeric_dtype(MP['Duration']):
    MP['Duration'] = pd.to_numeric(
        MP['Duration'].str.extract(r'(\d+)', expand=False), errors='coerce')

# Remove commas from 'Votes' and convert to numeric.
# The dtype guard makes the cell idempotent: on a second run 'Votes' is already
# float and the .str accessor would raise AttributeError.
# regex=False treats ',' as a literal character.
if not pd.api.types.is_numeric_dtype(MP['Votes']):
    MP['Votes'] = MP['Votes'].str.replace(',', '', regex=False).astype(float)

# Feature Engineering

In [None]:
# Data preprocessing: Rating is the regression target, so rows without it
# cannot be used. Reassigning (instead of inplace=True) avoids mutating a
# frame in place and keeps the cell safely re-runnable.
MP = MP.dropna(subset=['Rating'])

# One-hot encode Genre.
# The guard makes the cell idempotent: get_dummies replaces the 'Genre'
# column with indicator columns, so a second run would raise KeyError.
# NOTE(review): get_dummies treats every distinct Genre string as one
# category. If Genre holds comma-separated lists (e.g. 'Drama, Musical'),
# 'Genre_Drama' only flags rows whose genre is exactly 'Drama' -- confirm
# against the data and consider MP['Genre'].str.get_dummies(sep=', ').
if 'Genre' in MP.columns:
    MP = pd.get_dummies(MP, columns=['Genre'], prefix=['Genre'])

# Select features and target variable
feature_columns = ['Genre_Action', 'Genre_Comedy', 'Genre_Drama',
                   # Add other relevant features here...
                   ]
X = MP[feature_columns]
y = MP['Rating']

# Split data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: fit on the training split only, then apply the same
# transform to the test split so no test information leaks into the scaler.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model Building (Regression)

# 1. Linear Regression with Cross-Validation
model_lr = LinearRegression()
cv_scores = cross_val_score(model_lr, X_train, y_train, cv=5, scoring='r2')
print(f"Linear Regression Cross-Validation R-squared: {np.mean(cv_scores)}")

# cross_val_score fits clones, so model_lr itself must still be fitted here.
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print(f"Linear Regression - Mean Squared Error: {mse_lr}")
print(f"Linear Regression - R-squared: {r2_lr}")


# 2. Random Forest Regression with Hyperparameter Tuning
# random_state pins the forest's bootstrap/feature sampling so the reported
# metrics are reproducible across runs.
model_rf = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(estimator=model_rf, param_grid=param_grid,
                           cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)
# best_estimator_ is the fitted model for the best parameter combination;
# model_rf itself is only the unfitted template handed to the search.
best_rf_model = grid_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest - Mean Squared Error: {mse_rf}")
print(f"Random Forest - R-squared: {r2_rf}")

Linear Regression Cross-Validation R-squared: 0.029748968311637626
Linear Regression - Mean Squared Error: 1.7563576495646047
Linear Regression - R-squared: 0.05528750221319856
Random Forest - Mean Squared Error: 1.7560802094257766
Random Forest - R-squared: 0.055436732164514946


In [None]:
# Compare the two models on their held-out test metrics and pick the better one.

# A model is declared the winner only when it is better on both metrics:
# higher R-squared and lower mean squared error.
if r2_lr > r2_rf and mse_lr < mse_rf:
  print("Linear Regression is the best performing model.")
elif r2_rf > r2_lr and mse_rf < mse_lr:
  print("Random Forest is the best performing model.")
else:
  print("The models have comparable performance. Further analysis might be needed.")

# You can also consider other metrics like MAE or adjusted R-squared
# for a more comprehensive evaluation.

# Choose the best model based on the comparison.
# The random-forest candidate must be best_rf_model (the fitted estimator
# returned by the grid search). model_rf is the unfitted template that was
# passed to GridSearchCV and would fail on predict().
best_model = model_lr if r2_lr > r2_rf else best_rf_model

Random Forest is the best performing model.


In [None]:

# Predict the rating of a new movie with the better of the two fitted models.

# Create a new data point for prediction (replace with your actual data).
# The columns must match the features the scaler and models were fitted on.
new_data_point = pd.DataFrame({
    'Genre_Action': [0],
    'Genre_Comedy': [1],
    'Genre_Drama': [0],
    # Add other relevant features with values for your new data point
})


# Scale the new data point using the same scaler you used for training
new_data_point_scaled = scaler.transform(new_data_point)


# Compare models based on R-squared and MSE. best_model is assigned in every
# branch so it always refers to a fitted estimator.
if r2_lr > r2_rf and mse_lr < mse_rf:
  print("Linear Regression is the best performing model.")
  best_model = model_lr  # Assign the fitted model_lr
elif r2_rf > r2_lr and mse_rf < mse_lr:
  print("Random Forest is the best performing model.")
  best_model = best_rf_model # Assign the fitted best_rf_model
else:
  print("The models have comparable performance. Further analysis might be needed.")
  # No clear winner: fall back to the higher test R-squared (ties go to the
  # simpler linear model) instead of leaving best_model unset or stale.
  best_model = model_lr if r2_lr >= r2_rf else best_rf_model

# Use the selected model on the scaled data point; predict() returns an
# array with one value per input row, so take the single element.
predicted_rating = best_model.predict(new_data_point_scaled)[0]
print(f"Predicted rating for the new data point: {predicted_rating:.2f}")

Random Forest is the best performing model.
