<a href="https://colab.research.google.com/github/AnilSharma09/CODSOFT/blob/main/Task_2_Movie_Rating_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""
Movie Rating Prediction - Working Solution for IMDb India Dataset
"""

# Step 1: Install required packages
!pip install pandas scikit-learn seaborn

# Step 2: Import libraries
from google.colab import files
import zipfile
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Step 3: Upload and extract dataset
# Prompts for an upload in Colab and resolves `csv_file`, the path of the CSV
# to load. Accepts either a bare CSV or a zip archive containing one.
print("Please upload the IMDb Movies India.csv file")
uploaded = files.upload()
if not uploaded:
    # A cancelled upload returns an empty dict; fail with a clear message
    # instead of an IndexError on the first key.
    raise ValueError("No file was uploaded")
filename = next(iter(uploaded))

# Handle both zip and direct CSV upload
if filename.lower().endswith('.zip'):
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        # Locate the CSV by inspecting the archive rather than assuming its
        # member name; skip macOS resource-fork entries.
        csv_members = [
            member for member in zip_ref.namelist()
            if member.lower().endswith('.csv')
            and not member.startswith('__MACOSX/')
        ]
        if not csv_members:
            raise ValueError(f"No CSV file found inside {filename!r}")
        zip_ref.extractall('.')
    csv_file = csv_members[0]
else:
    csv_file = filename

# Step 4: Load data with robust encoding
# Try each candidate encoding in order and keep the first that decodes.
# Only decoding failures trigger a retry; any other problem (missing file,
# malformed CSV) propagates instead of being silently swallowed.
encodings = ['utf-8', 'latin1', 'iso-8859-1']
for encoding in encodings:
    try:
        movies = pd.read_csv(csv_file, encoding=encoding)
    except UnicodeError:
        continue
    print(f"Successfully loaded with {encoding} encoding")
    break
else:
    # Without this, `movies` would be undefined and the script would fail
    # later with an unrelated NameError.
    raise ValueError(
        f"Could not decode {csv_file!r} with any of: {', '.join(encodings)}")

# Step 5: Data Cleaning with Debugging
# Narrows `movies` to the modelling columns, converts the text columns to
# numbers, and drops rows that lack any value the model needs.
print("\nOriginal shape:", movies.shape)

# Select and rename relevant columns
cols = ['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director']
movies = movies[cols].copy()
movies.columns = ['title', 'year', 'duration', 'genre', 'rating', 'votes', 'director']

# Clean year column
# Raw values are wrapped in parentheses, e.g. '(2019)', so a digits-only
# match rejects every row. Extract the four-digit year instead and drop
# rows where none is present.
print("\nYear values before cleaning:", movies['year'].unique()[:10])
movies['year'] = pd.to_numeric(
    movies['year'].astype(str).str.extract(r'(\d{4})')[0], errors='coerce')
# .copy() makes the filtered frame independent so the assignments below
# do not raise SettingWithCopyWarning.
movies = movies.dropna(subset=['year']).copy()
movies['year'] = movies['year'].astype(int)

# Clean duration column
# Keep the first run of digits; astype(str) keeps the .str accessor valid
# even if pandas inferred a non-string dtype for the column.
print("\nDuration values before cleaning:", movies['duration'].unique()[:10])
movies['duration'] = pd.to_numeric(
    movies['duration'].astype(str).str.extract(r'(\d+)')[0], errors='coerce')

# Clean rating column
movies['rating'] = pd.to_numeric(movies['rating'], errors='coerce')

# Clean votes column
# Strip thousands separators; anything still non-numeric becomes NaN.
movies['votes'] = pd.to_numeric(
    movies['votes'].astype(str).str.replace(',', '', regex=False),
    errors='coerce')

# Remove rows with missing essential values
movies = movies.dropna(subset=['rating', 'duration', 'votes', 'genre', 'director'])
print("\nShape after cleaning:", movies.shape)

# Step 6: Feature Engineering
# Derives the first listed genre, 0/1 genre flags and a 0/1 director flag
# as new columns on `movies`.
genre_text = movies['genre']

# First entry of the comma-separated genre list, whitespace trimmed
first_genre = genre_text.str.split(',').str[0]
movies['primary_genre'] = first_genre.str.strip()

# One 0/1 indicator column per selected genre
top_genres = ['Drama', 'Comedy', 'Action', 'Thriller', 'Romance']
for genre in top_genres:
    flag_column = f'has_{genre.lower()}'
    genre_present = genre_text.str.contains(genre)
    movies[flag_column] = genre_present.astype(int)

# 0/1 indicator for membership in the hand-picked director list
top_directors = ['Rajkumar Hirani', 'S.S. Rajamouli', 'Sanjay Leela Bhansali', 'Karan Johar']
directed_by_top = movies['director'].isin(top_directors)
movies['has_top_director'] = directed_by_top.astype(int)

# Step 7: Prepare final dataset
# Builds the feature matrix X and target y from `movies`, then prints a
# short preview of the features.
numeric_features = ['year', 'duration', 'votes']
flag_features = ['has_drama', 'has_comedy', 'has_action', 'has_top_director']
features = numeric_features + flag_features

y = movies['rating']
X = movies[features]

print("\nFinal features shape:", X.shape)
preview = X.head()
print("Sample features:\n", preview)

# Step 8: Train-test split
# When rows survived preprocessing: split 80/20, fit a random forest, report
# RMSE and R2, plot feature importances and print a few sample predictions.
# Otherwise print diagnostics about the (empty) cleaned frame.
if len(X) > 0:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Step 9: Train and evaluate model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print("\n=== Model Evaluation ===")
    print(f"RMSE: {rmse:.2f}")
    print(f"R2 Score: {r2:.2f}")

    # Feature Importance
    plt.figure(figsize=(10,5))
    pd.Series(model.feature_importances_, index=X.columns
             ).sort_values().plot(kind='barh')
    plt.title("Feature Importance")
    plt.show()

    # Sample predictions
    print("\n=== Sample Predictions ===")
    # Cap the sample size so a tiny test split does not make sample() raise.
    sample_data = X_test.sample(min(3, len(X_test)), random_state=42)
    # Predict on the DataFrame itself: this keeps the column names the model
    # was fitted with and scores all sampled rows in a single call.
    sample_preds = model.predict(sample_data)
    for (idx, row), pred in zip(sample_data.iterrows(), sample_preds):
        actual = y_test.loc[idx]
        print(f"\nMovie Features: {row.to_dict()}")
        print(f"Actual Rating: {actual:.1f} | Predicted: {pred:.1f}")
else:
    print("\nError: No valid data remaining after preprocessing!")
    print("Debugging info:")
    # `movies` has already been cleaned and extended with feature columns at
    # this point, so label the shape accordingly.
    print("- Shape after cleaning:", movies.shape)
    print("- Missing values:\n", movies.isna().sum())
    print("- Unique genres:", movies['genre'].unique()[:10])
    print("- Unique directors:", movies['director'].unique()[:10])

Please upload the IMDb Movies India.csv file


Saving IMDb Movies India.csv.zip to IMDb Movies India.csv (8).zip
Successfully loaded with latin1 encoding

Original shape: (15509, 10)

Year values before cleaning: [nan '(2019)' '(2021)' '(2010)' '(1997)' '(2005)' '(2008)' '(2012)'
 '(2014)' '(2004)']

Duration values before cleaning: []

Shape after cleaning: (0, 7)

Final features shape: (0, 7)
Sample features:
 Empty DataFrame
Columns: [year, duration, votes, has_drama, has_comedy, has_action, has_top_director]
Index: []

Error: No valid data remaining after preprocessing!
Debugging info:
- Original shape: (0, 14)
- Missing values:
 title               0
year                0
duration            0
genre               0
rating              0
votes               0
director            0
primary_genre       0
has_drama           0
has_comedy          0
has_action          0
has_thriller        0
has_romance         0
has_top_director    0
dtype: int64
- Unique genres: []
- Unique directors: []
