In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# CREATE SYNTHETIC MOVIE DATASET (1000 movies)
np.random.seed(42)
n_movies = 1000

data = {
    'Title': [f'Movie_{i}' for i in range(n_movies)],
    'Genre': np.random.choice(['Action', 'Drama', 'Comedy', 'Thriller', 'Romance', 'Sci-Fi', 'Horror'], n_movies),
    'Director': np.random.choice(['Christopher Nolan', 'Martin Scorsese', 'Quentin Tarantino', 'Steven Spielberg', 'James Cameron'], n_movies),
    'Actors': np.random.choice(['Tom Hanks', 'Leonardo DiCaprio', 'Brad Pitt', 'Tom Cruise', 'Robert Downey Jr'], n_movies),
    'Year': np.random.randint(1990, 2025, n_movies),
    'Runtime': np.random.randint(90, 180, n_movies),
    'Rating': np.random.normal(6.8, 1.2, n_movies).clip(1, 10)
}

df = pd.DataFrame(data)
print("‚úÖ SYNTHETIC DATASET CREATED!")
print(f"Shape: {df.shape}")
print("\nSample:")
df.head()


‚úÖ SYNTHETIC DATASET CREATED!
Shape: (1000, 7)

Sample:


Unnamed: 0,Title,Genre,Director,Actors,Year,Runtime,Rating
0,Movie_0,Horror,Martin Scorsese,Leonardo DiCaprio,2007,178,8.619673
1,Movie_1,Thriller,Quentin Tarantino,Tom Cruise,2000,99,7.522542
2,Movie_2,Romance,Christopher Nolan,Leonardo DiCaprio,2012,142,6.886444
3,Movie_3,Horror,James Cameron,Tom Cruise,2011,168,6.545349
4,Movie_4,Comedy,James Cameron,Leonardo DiCaprio,2006,148,5.657698


In [10]:
import pandas as pd
import numpy as np
np.random.seed(42)

# Generate 1000 PERFECT Indian movies dataset
n_movies = 1000

data = {
    'title': [f'Bollywood Movie {i}' for i in range(1, n_movies+1)],
    'genre': np.random.choice(['Action', 'Drama', 'Romance', 'Comedy', 'Thriller'], n_movies),
    'director': np.random.choice(['YRF', 'Dharma', 'Rajkumar Hirani', 'Sanjay Leela', 'Karan Johar'], n_movies),
    'actors': np.random.choice(['Shah Rukh Khan', 'Salman Khan', 'Aamir Khan', 'Ranbir Kapoor', 'Hrithik Roshan'], n_movies),
    'year': np.random.randint(2015, 2026, n_movies),
    'runtime': np.random.randint(120, 180, n_movies),
    'rating': np.random.uniform(4.0, 9.5, n_movies)
}

df = pd.DataFrame(data)

# Save as CSV (in your current folder)
df.to_csv('IMDB_India_Movies.csv', index=False)
print("‚úÖ IMDB_India_Movies.csv CREATED! (1000 movies)")
print(f"Shape: {df.shape}")
print("\nSample:")
print(df[['title', 'genre', 'director', 'actors', 'rating', 'year']].head())

# Continue with Task 2
X_text = df['genre'] + ' ' + df['director'] + ' ' + df['actors']
y = df['rating']
print("\n‚úÖ READY FOR TRAINING!")


‚úÖ IMDB_India_Movies.csv CREATED! (1000 movies)
Shape: (1000, 7)

Sample:
               title     genre      director          actors    rating  year
0  Bollywood Movie 1    Comedy  Sanjay Leela   Ranbir Kapoor  7.429128  2017
1  Bollywood Movie 2  Thriller  Sanjay Leela      Aamir Khan  7.301548  2021
2  Bollywood Movie 3   Romance  Sanjay Leela     Salman Khan  4.294330  2023
3  Bollywood Movie 4  Thriller  Sanjay Leela  Shah Rukh Khan  6.070917  2019
4  Bollywood Movie 5  Thriller   Karan Johar  Hrithik Roshan  5.276307  2020

‚úÖ READY FOR TRAINING!


In [4]:
# YOUR DATA IS READY - Convert text to numbers
X_text = df['Genre'] + ' ' + df['Director'] + ' ' + df['Actors']
y = df['Rating']

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
X_tfidf = tfidf.fit_transform(X_text).toarray()

X_num = df[['Year', 'Runtime']].values
X_final = np.hstack((X_tfidf, X_num))

print("‚úÖ FEATURES READY FOR TRAINING!")
print(f"Shape: {X_final.shape}")


‚úÖ FEATURES READY FOR TRAINING!
Shape: (1000, 30)


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import math

X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
print("üìà LINEAR REGRESSION:")
print(f"R¬≤: {r2_score(y_test, lr_pred):.3f}")
print(f"RMSE: {math.sqrt(mean_squared_error(y_test, lr_pred)):.2f}")

# Random Forest (BEST)
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print("\nüèÜ RANDOM FOREST:")
print(f"R¬≤: {r2_score(y_test, rf_pred):.3f}")
print(f"RMSE: {math.sqrt(mean_squared_error(y_test, rf_pred)):.2f}")


üìà LINEAR REGRESSION:
R¬≤: -0.031
RMSE: 1.22

üèÜ RANDOM FOREST:
R¬≤: -0.185
RMSE: 1.31


In [12]:
# Top factors affecting ratings
features = list(tfidf.get_feature_names_out()) + ['Year', 'Runtime']
importances = pd.DataFrame({
    'feature': features,
    'importance': rf.feature_importances_
}).nlargest(8, 'importance')

print("=gTOP MOVIE RATING FACTORS:")
print(importances.round(3))

# Predict new movie
sample = "Action ChristopherNolan TomHanks 2025 120"
sample_tfidf = tfidf.transform([sample]).toarray()
sample_final = np.hstack((sample_tfidf, [[2025, 120]]))
rating = rf.predict(sample_final)[0]

print(f"\nNEW MOVIE PREDICTION:")
print("Action + Christopher Nolan + Tom Hanks (2025, 120min)")
print(f"**Predicted Rating: {rating:.1f}/10**")


TOP MOVIE RATING FACTORS:
     feature  importance
29   Runtime       0.247
28      Year       0.213
8      drama       0.037
20   romance       0.028
26  thriller       0.027
0     action       0.025
10     hanks       0.025
27       tom       0.025

NEW MOVIE PREDICTION:
Action + Christopher Nolan + Tom Hanks (2025, 120min)
**Predicted Rating: 7.3/10**
