In [None]:
# Import necessary libraries
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler


In [None]:
# Load the dataset
# The CSV location can be overridden with the IMDB_DATA_PATH environment
# variable so the notebook also runs on machines other than the author's.
# The original absolute path is kept as the fallback, so existing setups
# keep working unchanged.
DEFAULT_DATA_PATH = r"C:\Users\subra\Desktop\Projects\Codsoft\Movie_Rating_Prediction\data\IMDb Movies India.csv"
data_path = os.environ.get("IMDB_DATA_PATH", DEFAULT_DATA_PATH)

# Fail early with an actionable message instead of a bare read_csv traceback
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f"Dataset not found at {data_path!r}. "
        "Set the IMDB_DATA_PATH environment variable to the CSV location."
    )

# NOTE(review): if read_csv raises UnicodeDecodeError for this file, pass an
# explicit encoding= argument — the file's encoding is not visible from here.
df = pd.read_csv(data_path)


In [None]:
# Display basic information about the dataset
print("First five rows of the dataset:")
print(df.head())

print("\nSummary statistics of the dataset:")
print(df.describe())

print("\nDataset information:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output
df.info()


In [None]:
# Report how many missing entries each column contains
print("\nMissing values in the dataset:", df.isna().sum(), sep="\n")


In [None]:
# Handle missing values (fill with median for numerical columns)
# Assign the filled column back to df rather than calling
# df[col].fillna(..., inplace=True): the chained in-place form emits a
# FutureWarning in pandas 2.x and leaves df unchanged once Copy-on-Write
# is active, so the missing values would silently survive.
# NOTE(review): 'rating' is the prediction target and the medians are computed
# on the full dataset before the train/test split — confirm this is intended.
for column in ('rating', 'budget', 'revenue', 'runtime'):
    df[column] = df[column].fillna(df[column].median())


In [None]:
# Exploratory Data Analysis (EDA)
# Histogram of the 'rating' column (30 bins) with a KDE curve overlaid
plt.figure(figsize=(8, 6))
ax = sns.histplot(df['rating'], kde=True, bins=30, color='blue')
# Label through the Axes returned by seaborn instead of the pyplot state machine
ax.set(
    title="Distribution of Movie Ratings",
    xlabel="Rating",
    ylabel="Frequency",
)
plt.show()


In [None]:
# Correlation heatmap
# Restrict to numeric columns first: in pandas >= 2.0 DataFrame.corr()
# defaults to numeric_only=False and raises on non-numeric columns.
# select_dtypes keeps this working on older pandas versions as well.
numeric_df = df.select_dtypes(include='number')

plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Heatmap")
plt.show()


In [None]:
# Split the frame into the model inputs (X) and the value to predict (y)
X = df.loc[:, ['budget', 'revenue', 'runtime']]  # predictor columns
y = df.loc[:, 'rating']  # prediction target


In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [None]:
# Standardise the features
# The scaler is fitted on the training split only, and the same fitted
# transformation is then applied to both the training and the test split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Fit a Random Forest regressor on the scaled training data (seeded for repeatability)
model = RandomForestRegressor(random_state=42).fit(X_train_scaled, y_train)
# Keep the fitted estimator as the cell's last expression so it is still displayed
model


In [None]:
# Predict ratings for the scaled held-out test features
y_pred = model.predict(X=X_test_scaled)


In [None]:
# Score the predictions against the held-out targets
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Same report text as a single print call, split across lines for readability
print(
    "\nModel Evaluation Metrics:\n"
    f"Mean Squared Error: {mse:.2f}\n"
    f"Mean Absolute Error: {mae:.2f}\n"
    f"R^2 Score: {r2:.2f}"
)


In [None]:
# Scatter the model's predictions against the true test ratings
plt.figure(figsize=(8, 6))
ax = sns.scatterplot(x=y_test, y=y_pred, alpha=0.7, color='green')
# Label through the Axes returned by seaborn instead of the pyplot state machine
ax.set(
    title="Predicted vs Actual Ratings",
    xlabel="Actual Ratings",
    ylabel="Predicted Ratings",
)
plt.show()
