# Decision Tree Classifier

In [6]:
import pandas as pd

# load the movie metadata table that the rest of the notebook works on
csv_path = 'data/movie_metadata_final.csv'
df = pd.read_csv(csv_path)

In [7]:
# show the column labels of the loaded frame (displayed as the cell output)
df.columns

Index(['imdbid', 'title', 'year', 'age_rating', 'genre', 'description',
       'director', 'runtime_minutes', 'production_budget', 'domestic_gross',
       'worldwide_gross', 'financial_success', 'ROI', 'age_rating_number',
       'genre_action', 'genre_adventure', 'genre_animation', 'genre_biography',
       'genre_comedy', 'genre_crime', 'genre_drama', 'genre_family',
       'genre_fantasy', 'genre_film-noir', 'genre_history', 'genre_horror',
       'genre_music', 'genre_musical', 'genre_mystery', 'genre_romance',
       'genre_sci-fi', 'genre_sport', 'genre_thriller', 'genre_war',
       'genre_western'],
      dtype='object')

In [8]:
# count missing values per column; the columns are listed explicitly so the
# report comes out in this order
nan_check_columns = [
    'imdbid', 'title', 'year', 'age_rating', 'genre', 'description',
    'director', 'runtime_minutes', 'production_budget', 'domestic_gross',
    'worldwide_gross', 'age_rating_number',
    'genre_action', 'genre_adventure', 'genre_animation', 'genre_biography',
    'genre_comedy', 'genre_crime', 'genre_drama', 'genre_family',
    'genre_fantasy', 'genre_film-noir', 'genre_history', 'genre_horror',
    'genre_music', 'genre_musical', 'genre_mystery', 'genre_romance',
    'genre_sci-fi', 'genre_sport', 'genre_thriller', 'genre_war',
    'genre_western',
    'financial_success', 'ROI',
]
nan_counts = df[nan_check_columns].isna().sum()
print(nan_counts)

imdbid               0
title                0
year                 0
age_rating           0
genre                0
description          0
director             0
runtime_minutes      0
production_budget    0
domestic_gross       0
worldwide_gross      0
age_rating_number    0
genre_action         0
genre_adventure      0
genre_animation      0
genre_biography      0
genre_comedy         0
genre_crime          0
genre_drama          0
genre_family         0
genre_fantasy        0
genre_film-noir      0
genre_history        0
genre_horror         0
genre_music          0
genre_musical        0
genre_mystery        0
genre_romance        0
genre_sci-fi         0
genre_sport          0
genre_thriller       0
genre_war            0
genre_western        0
financial_success    0
ROI                  0
dtype: int64


In [9]:
# The isna() check above reports no missing values, yet fitting the model
# still raised a NaN error. Presumably some entries in these feature columns
# are not valid numbers (TODO confirm against the raw CSV), so each column is
# converted explicitly. With errors='coerce', any value that cannot be parsed
# becomes NaN instead of raising.
for col in ['runtime_minutes', 'production_budget', 'age_rating_number']:
    nan_before = df[col].isna().sum()
    df[col] = pd.to_numeric(df[col], errors='coerce')
    newly_coerced = df[col].isna().sum() - nan_before
    # Report silent coercions here so data-quality problems are visible at
    # the conversion step rather than as an opaque error at fit time.
    if newly_coerced > 0:
        print(f"{col}: {newly_coerced} non-numeric value(s) coerced to NaN")

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# model inputs: runtime, budget, numeric age rating and the genre_* columns
feature_columns = [
    'runtime_minutes', 'production_budget', 'age_rating_number',
    'genre_action', 'genre_adventure', 'genre_animation', 'genre_biography',
    'genre_comedy', 'genre_crime', 'genre_drama', 'genre_family',
    'genre_fantasy', 'genre_film-noir', 'genre_history', 'genre_horror',
    'genre_music', 'genre_musical', 'genre_mystery', 'genre_romance',
    'genre_sci-fi', 'genre_sport', 'genre_thriller', 'genre_war',
    'genre_western',
]
target_column = 'financial_success'

X = df[feature_columns]
# list-based selection keeps y as a single-column DataFrame
y = df[[target_column]]

In [11]:
# hold out 20% of the rows as a test set; random_state is fixed at 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# create the decision tree classifier with library-default settings,
# passing only a fixed random_state (42)
model = DecisionTreeClassifier(random_state=42)

In [13]:
# fit the classifier on the training features and training target
model.fit(X_train, y_train)

In [14]:
# predict the target for the held-out test features
y_pred = model.predict(X_test)

In [15]:
# compare test-set predictions with the true labels and report accuracy
# rounded to two decimals
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.56
