# 🎬 IMDb 1000 Linear Regression Lab
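This notebook fits two linear regression models that predict a movie's IMDb rating:
- Strategy 1: use all available features
- Strategy 2: use a selected subset of features (`Gross`, `No_of_Votes`, `Meta_score`, `Runtime`)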

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score

## 📥 Load Dataset and Clean

In [None]:
# Load the CSV and turn the two text-formatted columns into real numbers.
# The frame is built in a single chain, so re-running this cell always starts
# again from the file instead of mutating a frame left over from an earlier run.
# NOTE(review): absolute path; consider a configurable data directory.
DATA_PATH = '/mnt/data/imdb_top_1000.csv'

df = (
    pd.read_csv(DATA_PATH)
    # Drop every row that has a missing value in any column.
    .dropna()
    .assign(
        # Strip '$' and ',' characters, then parse as float. The pattern is a
        # raw string: '\$' in a normal string literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        Gross=lambda frame: frame['Gross'].replace(r'[\$,]', '', regex=True).astype(float),
        # Remove the literal ' min' suffix (plain text, not a regex), then parse as int.
        Runtime=lambda frame: frame['Runtime'].str.replace(' min', '', regex=False).astype(int),
    )
)

## 🔤 Encode Categorical Columns

In [None]:
# Replace each categorical column with integer codes.
# One LabelEncoder is kept per column in `encoders`: refitting a single shared
# encoder would overwrite its fitted classes on every iteration, leaving no way
# to map the codes of earlier columns back to their labels, e.g.
# encoders['Genre'].inverse_transform(...).
# NOTE(review): label codes are arbitrary integers, but a linear model treats
# them as ordered magnitudes; confirm this is acceptable for the lab.
categorical_cols = ['Genre', 'Certificate', 'Director', 'Star1', 'Star2', 'Star3', 'Star4']
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

## 🚀 Strategy 1: All Features

In [None]:
# Strategy 1: regress IMDB_Rating on every remaining column.
y1 = df['IMDB_Rating']

# Remove the target and the free-text columns.
candidate_features = df.drop(columns=['Series_Title', 'Overview', 'IMDB_Rating'])

# LinearRegression needs a purely numeric matrix. Any column that is still
# non-numeric at this point would make fit() raise, so keep numeric columns
# only and report what was left out rather than dropping it silently.
X1 = candidate_features.select_dtypes(include='number')
skipped_cols = candidate_features.columns.difference(X1.columns)
if len(skipped_cols) > 0:
    print("Skipped non-numeric columns:", list(skipped_cols))

# Score on a held-out split with the same test_size and seed as Strategy 2.
# Scoring on the rows the model was fitted on would give an optimistic R² that
# cannot be compared with Strategy 2's held-out score.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.25, random_state=42
)

lr1 = LinearRegression()
lr1.fit(X1_train, y1_train)
print("R² Score (All Features):", lr1.score(X1_test, y1_test))

## ✂️ Strategy 2: Selected Features

In [None]:
# Strategy 2: fit on a hand-picked subset of columns and score on a hold-out set.
features = ['Gross', 'No_of_Votes', 'Meta_score', 'Runtime']
X2, y2 = df[features], df['IMDB_Rating']

# Hold out 25% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X2,
    y2,
    test_size=0.25,
    random_state=42,
)

# Fit on the training rows, then compute R² from predictions on the test rows.
lr2 = LinearRegression().fit(X_train, y_train)
selected_r2 = r2_score(y_test, lr2.predict(X_test))
print("R² Score (Selected Features):", selected_r2)

## 📊 Plot: Number of Votes vs IMDb Rating

In [None]:
# Scatter plot of vote count (x, logarithmic axis) against rating (y).
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['No_of_Votes'], df['IMDB_Rating'], alpha=0.5)  # semi-transparent markers
ax.set_xscale('log')
ax.set_xlabel('Number of Votes')
ax.set_ylabel('IMDB Rating')
ax.set_title('Votes vs IMDb Rating')
ax.grid(True)
plt.show()