In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [5]:
df = pd.read_csv("imdb_top_1000.csv")

In [6]:
df.drop(columns=["Poster_Link", "Overview", "Series_Title"], inplace=True)


In [7]:
df["Runtime"] = df["Runtime"].str.replace(" min", "", regex=False).astype(float)


In [8]:
df.head(5)

Unnamed: 0,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,1994,A,142.0,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,1972,A,175.0,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,2008,UA,152.0,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,1974,A,202.0,"Crime, Drama",9.0,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,1957,U,96.0,"Crime, Drama",9.0,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [9]:
for col in df.select_dtypes(include=np.number).columns:
    df[col] = df[col].fillna(df[col].median())


In [10]:
for col in df.select_dtypes(exclude=np.number).columns:
    df[col] = df[col].fillna("Unknown")


In [11]:
df.head(30)

Unnamed: 0,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,1994,A,142.0,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,1972,A,175.0,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,2008,UA,152.0,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,1974,A,202.0,"Crime, Drama",9.0,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,1957,U,96.0,"Crime, Drama",9.0,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000
5,2003,U,201.0,"Action, Adventure, Drama",8.9,94.0,Peter Jackson,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,1642758,377845905
6,1994,A,154.0,"Crime, Drama",8.9,94.0,Quentin Tarantino,John Travolta,Uma Thurman,Samuel L. Jackson,Bruce Willis,1826188,107928762
7,1993,A,195.0,"Biography, Drama, History",8.9,94.0,Steven Spielberg,Liam Neeson,Ralph Fiennes,Ben Kingsley,Caroline Goodall,1213505,96898818
8,2010,UA,148.0,"Action, Adventure, Sci-Fi",8.8,74.0,Christopher Nolan,Leonardo DiCaprio,Joseph Gordon-Levitt,Elliot Page,Ken Watanabe,2067042,292576195
9,1999,A,139.0,Drama,8.8,66.0,David Fincher,Brad Pitt,Edward Norton,Meat Loaf,Zach Grenier,1854740,37030102


In [12]:
X = df.drop("IMDB_Rating", axis=1)

y = df["IMDB_Rating"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
df.tail()

Unnamed: 0,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
995,1961,A,115.0,"Comedy, Drama, Romance",7.6,76.0,Blake Edwards,Audrey Hepburn,George Peppard,Patricia Neal,Buddy Ebsen,166544,Unknown
996,1956,G,201.0,"Drama, Western",7.6,84.0,George Stevens,Elizabeth Taylor,Rock Hudson,James Dean,Carroll Baker,34075,Unknown
997,1953,Passed,118.0,"Drama, Romance, War",7.6,85.0,Fred Zinnemann,Burt Lancaster,Montgomery Clift,Deborah Kerr,Donna Reed,43374,30500000
998,1944,Unknown,97.0,"Drama, War",7.6,78.0,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471,Unknown
999,1935,Unknown,86.0,"Crime, Mystery, Thriller",7.6,93.0,Alfred Hitchcock,Robert Donat,Madeleine Carroll,Lucie Mannheim,Godfrey Tearle,51853,Unknown


In [15]:
numeric_features = X_train.select_dtypes(include=np.number).columns
categorical_features = X_train.select_dtypes(exclude=np.number).columns


In [16]:
preprocessor = ColumnTransformer([("num", StandardScaler(), numeric_features),("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),

])

In [17]:
model = RandomForestRegressor(n_estimators=150, max_depth=20, random_state=42)


In [18]:
pipeline = Pipeline([("preprocess", preprocessor), ("model", model)])


In [21]:
pipeline.fit(X_train, y_train)


In [24]:
y_pred = pipeline.predict(X_test)


In [26]:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)



In [27]:
print(r2, mae, rmse)


0.3786364409641103 0.15734187707455552 0.19957928146086878


In [28]:
preprocess = pipeline.named_steps["preprocess"]
num_features = numeric_features
cat_features = preprocess.named_transformers_["cat"].get_feature_names_out(categorical_features)
feature_names = np.concatenate([num_features, cat_features])


In [29]:
importances = pipeline.named_steps["model"].feature_importances_
feature_importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importances})
feature_importance_df.sort_values(by="Importance", ascending=False).head(10)


Unnamed: 0,Feature,Importance
2,No_of_Votes,0.426537
1,Meta_score,0.150792
0,Runtime,0.084155
115,Certificate_UA,0.012508
113,Certificate_U,0.006549
100,Released_Year_2020,0.005991
103,Certificate_A,0.00574
37,Released_Year_1957,0.004813
4180,Gross_Unknown,0.004403
883,Star1_Charles Chaplin,0.004062
