In [1]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [2]:
# Open csv file and view it
file_path = "../Resources/cleaned.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Title,Rated,Released,Runtime,Genre,Director,Actors,Language,Country,Awards,...,SPORT,WAR,MUSICAL,WESTERN,NEWS,REALITY-TV,Unnamed: 44,GAME-SHOW,TALK-SHOW,BoxOffice_Inf_Adj
0,The Other Side of the Wind,R,2018-11-02,122,Drama,Orson Welles,"John Huston, Oja Kodar, Peter Bogdanovich","English, German","France, Iran, United States",9 wins & 8 nominations,...,,,,,,,,,,
1,Grizzly II: Revenge,Not Rated,2021-01-08,74,"Horror, Music, Thriller",André Szöts,"George Clooney, Laura Dern, Charlie Sheen",English,United States,,...,,,,,,,,,,
2,Mariette in Ecstasy,PG-13,2019-11-13,101,Drama,John Bailey,"Geraldine O'Rawe, Eva Marie Saint, Alex Appel",English,United States,,...,,,,,,,,,,
3,Cooper and Hemingway: The True Gen,Unrated,2013-09-27,120,Documentary,John Mulholland,"Sam Waterston, Len Cariou, Nancy Crawford",English,United States,,...,,,,,,,,,,
4,Heaven & Hell,TV-MA,2018-11-06,104,Drama,Stuart Paul,"Cheryl M. Lynn, Chase Nash, Burt Ward",English,United States,,...,,,,,,,,,,


In [3]:
# Change the "Released" column from a string to a datetime format
df['Released'] = pd.to_datetime(df['Released'], errors='coerce')
df['Year'] = df['Released'].dt.year

In [4]:
# Check to make sure the "Released" column is not a string anymore
print(df['Released'].dtype)
print(df['Year'].dtype)

datetime64[ns]
int32


In [5]:
# Count how many times each director appears in the dataset
director_counts = df['Director'].value_counts()

# Display the top 15
top_directors = director_counts.head(15).index
top_directors

Index(['Jeremy Norrie', 'Chris Stokes', 'Tyler Perry', 'Jared Cohn',
       'Michael Feifer', 'David DeCoteau', 'Alex Gibney', 'Charles Band',
       'Sean McNamara', 'Timothy Woodward Jr.', 'Steven Soderbergh',
       'Ron Howard', 'Philip Gardiner', 'Mark Polonia', 'J. Horton'],
      dtype='object', name='Director')

In [6]:
# Filter the dataset to only include rows where the director is in the top 15
df['Top_Director'] = df['Director'].apply(lambda x: x if x in top_directors else None)

# Create the dummy columns for Top_Director
director_dummies = pd.get_dummies(df['Top_Director'], prefix='Director')

# Ensure all top directors are represented as columns, even if they have no movies
director_dummies = director_dummies.reindex(columns=[f'Director_{director}' for director in top_directors] + ['Director_Other'], fill_value=0)

# Convert boolean values to 0 and 1
director_dummies = director_dummies.astype(int)
director_dummies

Unnamed: 0,Director_Jeremy Norrie,Director_Chris Stokes,Director_Tyler Perry,Director_Jared Cohn,Director_Michael Feifer,Director_David DeCoteau,Director_Alex Gibney,Director_Charles Band,Director_Sean McNamara,Director_Timothy Woodward Jr.,Director_Steven Soderbergh,Director_Ron Howard,Director_Philip Gardiner,Director_Mark Polonia,Director_J. Horton,Director_Other
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17973,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17975,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17976,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
# Split the 'Actors' column into individual actors and count how many times each actor appears
actor_list = df['Actors'].str.split(', ').explode()
actor_counts = actor_list.value_counts()

# Filter actors that appear at least 3 times
top_actors = actor_counts[actor_counts >= 3].head(28).index
print(top_actors)

# Create a 'Top_Actors' column where only top actors are kept
df['Top_Actors'] = df['Actors'].apply(
    lambda x: [actor.strip() for actor in x.split(', ') if actor.strip() in top_actors] if isinstance(x, str) else []
)

# Check the result 
df[['Title', 'Top_Actors']].head()

Index(['Eric Roberts', 'Tom Sizemore', 'Nicolas Cage', 'Bruce Willis',
       'James Franco', 'Mark Wahlberg', 'Danny Trejo', 'Kevin Sorbo',
       'Kevin Murphy', 'Samuel L. Jackson', 'Bill Corbett', 'Kevin Hart',
       'Liam Neeson', 'Michael J. Nelson', 'Dwayne Johnson', 'J.K. Simmons',
       'Ryan Reynolds', 'Frank Grillo', 'Dean Cain', 'Robert De Niro',
       'Matt Damon', 'Tom Hanks', 'Michael Madsen', 'Dolph Lundgren',
       'Woody Harrelson', 'Michael Shannon', 'Ethan Hawke', 'Anna Kendrick'],
      dtype='object', name='Actors')


Unnamed: 0,Title,Top_Actors
0,The Other Side of the Wind,[]
1,Grizzly II: Revenge,[]
2,Mariette in Ecstasy,[]
3,Cooper and Hemingway: The True Gen,[]
4,Heaven & Hell,[]


In [8]:
# Exploding the list to create dummies for each actor
actor_dummies = pd.get_dummies(df['Top_Actors'].explode(), prefix='Actor')

# Ensure all top actors are represented as columns, even if they have no movies
actor_dummies = actor_dummies.reindex(columns=[f'Actor_{actor}' for actor in top_actors], fill_value=0)

# Add an 'Actor_Other' column for those movies where the actors aren't in the top actors list 
actor_dummies['Actor_Other'] = df['Top_Actors'].apply(lambda x: 1 if len(x) == 0 else 0)

# Convert boolean values to 0 and 1
actor_dummies = actor_dummies.astype(int)
actor_dummies

Unnamed: 0,Actor_Eric Roberts,Actor_Tom Sizemore,Actor_Nicolas Cage,Actor_Bruce Willis,Actor_James Franco,Actor_Mark Wahlberg,Actor_Danny Trejo,Actor_Kevin Sorbo,Actor_Kevin Murphy,Actor_Samuel L. Jackson,...,Actor_Robert De Niro,Actor_Matt Damon,Actor_Tom Hanks,Actor_Michael Madsen,Actor_Dolph Lundgren,Actor_Woody Harrelson,Actor_Michael Shannon,Actor_Ethan Hawke,Actor_Anna Kendrick,Actor_Other
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17973,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
17974,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
17975,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
17976,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [9]:
# Split 'Genre' column into individual genres using get_dummies
genre_dummies = df['Genre'].str.get_dummies(sep=', ')
genre_dummies

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Mystery,News,Reality-TV,Romance,Sci-Fi,Sport,Talk-Show,Thriller,War,Western
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17973,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17974,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17975,0,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
17976,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
df.columns

Index(['Title', 'Rated', 'Released', 'Runtime', 'Genre', 'Director', 'Actors',
       'Language', 'Country', 'Awards', 'Metascore', 'imdbRating', 'imdbVotes',
       'imdbID', 'BoxOffice', 'Rotten_Tomatoes', 'R_Year', 'R_Month', 'R_Day',
       'Rounded Rating', 'Good_Movie', 'DRAMA', 'HORROR', 'MUSIC', 'THRILLER',
       'DOCUMENTARY', 'ANIMATION', 'ACTION', 'ADVENTURE', 'BIOGRAPHY',
       'COMEDY', 'FANTASY', 'MYSTERY', 'ROMANCE', 'SCI-FI', 'CRIME', 'FAMILY',
       'HISTORY', 'SPORT', 'WAR', 'MUSICAL', 'WESTERN', 'NEWS', 'REALITY-TV',
       'Unnamed: 44', 'GAME-SHOW', 'TALK-SHOW', 'BoxOffice_Inf_Adj', 'Year',
       'Top_Director', 'Top_Actors'],
      dtype='object')

In [11]:
columns_to_drop = ['Rated', 'Genre', 'Actors', 'Director', 'Released', 'Awards', 'R_Year', 'R_Month', 'R_Day', 'Language', 'Country', 'imdbID', 'DRAMA', 'HORROR', 'MUSIC', 'THRILLER',
                   'DOCUMENTARY', 'ANIMATION', 'ACTION', 'ADVENTURE', 'BIOGRAPHY', 'COMEDY', 'FANTASY', 'MYSTERY', 'ROMANCE', 'SCI-FI', 'CRIME', 'FAMILY',
                   'HISTORY', 'SPORT', 'WAR', 'MUSICAL', 'WESTERN', 'NEWS', 'REALITY-TV', 'Unnamed: 44', 'GAME-SHOW', 'TALK-SHOW', 'Top_Director', 'Top_Actors']
df = df.drop(columns=columns_to_drop)
df.columns

Index(['Title', 'Runtime', 'Metascore', 'imdbRating', 'imdbVotes', 'BoxOffice',
       'Rotten_Tomatoes', 'Rounded Rating', 'Good_Movie', 'BoxOffice_Inf_Adj',
       'Year'],
      dtype='object')

In [12]:
df.dtypes

Title                 object
Runtime                int64
Metascore            float64
imdbRating           float64
imdbVotes             object
BoxOffice            float64
Rotten_Tomatoes      float64
Rounded Rating       float64
Good_Movie              bool
BoxOffice_Inf_Adj    float64
Year                   int32
dtype: object

In [13]:
# Clean 'imdbVotes' column to remove commas and convert to numeric
df['imdbVotes'] = df['imdbVotes'].replace({',': ''}, regex=True)  # Remove commas
df['imdbVotes'] = pd.to_numeric(df['imdbVotes'], errors='coerce')  # Convert to numeric, coercing errors to NaN
# Convert 'Good_Movie' from boolean to numeric (True becomes 1, False becomes 0)
df['Good_Movie'] = df['Good_Movie'].astype(int)

In [14]:
df.dtypes

Title                 object
Runtime                int64
Metascore            float64
imdbRating           float64
imdbVotes            float64
BoxOffice            float64
Rotten_Tomatoes      float64
Rounded Rating       float64
Good_Movie             int32
BoxOffice_Inf_Adj    float64
Year                   int32
dtype: object

In [15]:
# Combine the get_dummies created with the original dataframe
df_updated = pd.concat([df, genre_dummies, director_dummies, actor_dummies], axis=1)
df_updated

Unnamed: 0,Title,Runtime,Metascore,imdbRating,imdbVotes,BoxOffice,Rotten_Tomatoes,Rounded Rating,Good_Movie,BoxOffice_Inf_Adj,...,Actor_Robert De Niro,Actor_Matt Damon,Actor_Tom Hanks,Actor_Michael Madsen,Actor_Dolph Lundgren,Actor_Woody Harrelson,Actor_Michael Shannon,Actor_Ethan Hawke,Actor_Anna Kendrick,Actor_Other
0,The Other Side of the Wind,122,79.0,6.7,8188.0,,82.0,7.0,0,,...,0,0,0,0,0,0,0,0,0,1
1,Grizzly II: Revenge,74,7.0,2.7,1801.0,,9.0,3.0,0,,...,0,0,0,0,0,0,0,0,0,1
2,Mariette in Ecstasy,101,,7.4,72.0,,,7.0,1,,...,0,0,0,0,0,0,0,0,0,1
3,Cooper and Hemingway: The True Gen,120,54.0,7.3,100.0,,,7.0,0,,...,0,0,0,0,0,0,0,0,0,1
4,Heaven & Hell,104,,3.1,151.0,,,3.0,0,,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17973,VFW,92,72.0,6.1,9831.0,,85.0,6.0,0,,...,0,0,0,0,0,0,0,0,0,1
17974,India Sweets and Spices,101,57.0,6.1,893.0,288714.0,83.0,6.0,0,3.094647e+05,...,0,0,0,0,0,0,0,0,0,1
17975,Pilgrim's Progress,108,,6.3,921.0,1294596.0,60.0,6.0,0,1.588458e+06,...,0,0,0,0,0,0,0,0,0,1
17976,Coffee & Kareem,88,35.0,5.2,14780.0,,21.0,5.0,0,,...,0,0,0,0,0,0,0,0,0,1


In [16]:
df_updated.columns

Index(['Title', 'Runtime', 'Metascore', 'imdbRating', 'imdbVotes', 'BoxOffice',
       'Rotten_Tomatoes', 'Rounded Rating', 'Good_Movie', 'BoxOffice_Inf_Adj',
       'Year', 'Action', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Game-Show',
       'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
       'Reality-TV', 'Romance', 'Sci-Fi', 'Sport', 'Talk-Show', 'Thriller',
       'War', 'Western', 'Director_Jeremy Norrie', 'Director_Chris Stokes',
       'Director_Tyler Perry', 'Director_Jared Cohn',
       'Director_Michael Feifer', 'Director_David DeCoteau',
       'Director_Alex Gibney', 'Director_Charles Band',
       'Director_Sean McNamara', 'Director_Timothy Woodward Jr.',
       'Director_Steven Soderbergh', 'Director_Ron Howard',
       'Director_Philip Gardiner', 'Director_Mark Polonia',
       'Director_J. Horton', 'Director_Other', 'Actor_Eric Roberts',
       'Actor_Tom Sizemore', 'Actor_Nicolas Cage

In [17]:
# Now add the 'Year' column back to df_updated
df_updated['Year'] = df['Year']  # Ensure 'Year' is still in df_updated
df_updated[['Title', 'Year']].head()

Unnamed: 0,Title,Year
0,The Other Side of the Wind,2018
1,Grizzly II: Revenge,2021
2,Mariette in Ecstasy,2019
3,Cooper and Hemingway: The True Gen,2013
4,Heaven & Hell,2018


In [18]:
# Check for missing values in each column
missing_values = df_updated.isnull().sum()

# Display the columns with missing values
missing_values[missing_values > 0]

Metascore            11460
imdbRating             727
imdbVotes              173
BoxOffice            13210
Rotten_Tomatoes      10030
Rounded Rating         727
BoxOffice_Inf_Adj    13453
dtype: int64

In [19]:
# Fill missing values in the 'Rotten_Tomatoes' column with the mean
df_updated['Rotten_Tomatoes'] = df_updated['Rotten_Tomatoes'].fillna(df_updated['Rotten_Tomatoes'].mean())

In [20]:
# Check again for missing values
missing_values_after = df_updated.isnull().sum()
missing_values_after[missing_values_after > 0]

Metascore            11460
imdbRating             727
imdbVotes              173
BoxOffice            13210
Rounded Rating         727
BoxOffice_Inf_Adj    13453
dtype: int64

In [21]:
# Fill missing values in all other columns with 0
df_updated = df_updated.fillna(0)

In [22]:
# Check again for missing values
missing_values_after = df_updated.isnull().sum()
missing_values_after[missing_values_after > 0]

Series([], dtype: int64)

In [23]:
# Define success based on Rotten Tomatoes score
threshold = 70 
df_updated['Success'] = (df_updated['Rotten_Tomatoes'] >= threshold).astype(int)

In [24]:
# Count the total number of movies using the 'Title' column
total_movies_count = df['Title'].count()
total_movies_count 

17978

In [25]:
# Count the total number of movies with a Rotten Tomatoes score of 70 or more
successful_movies_count = df[df['Rotten_Tomatoes'] >= 70]['Rotten_Tomatoes'].count()
successful_movies_count 

3796

In [26]:
df_updated = df_updated.drop(columns=['Rotten_Tomatoes'])
df_updated.columns

Index(['Title', 'Runtime', 'Metascore', 'imdbRating', 'imdbVotes', 'BoxOffice',
       'Rounded Rating', 'Good_Movie', 'BoxOffice_Inf_Adj', 'Year', 'Action',
       'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary',
       'Drama', 'Family', 'Fantasy', 'Game-Show', 'History', 'Horror', 'Music',
       'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi',
       'Sport', 'Talk-Show', 'Thriller', 'War', 'Western',
       'Director_Jeremy Norrie', 'Director_Chris Stokes',
       'Director_Tyler Perry', 'Director_Jared Cohn',
       'Director_Michael Feifer', 'Director_David DeCoteau',
       'Director_Alex Gibney', 'Director_Charles Band',
       'Director_Sean McNamara', 'Director_Timothy Woodward Jr.',
       'Director_Steven Soderbergh', 'Director_Ron Howard',
       'Director_Philip Gardiner', 'Director_Mark Polonia',
       'Director_J. Horton', 'Director_Other', 'Actor_Eric Roberts',
       'Actor_Tom Sizemore', 'Actor_Nicolas Cage', 'Actor_Bruce Wil

In [27]:
# Save 'Title' column separately to preserve it later for predictions
titles = df_updated['Title']

In [28]:
# Prepare features and target for model
X = df_updated.drop(columns=['Success', 'Title'])  # Drop 'Title' and 'Success' from features
y = df_updated['Success']

In [29]:
# Create a separate dataframe for movies before the year of 2025
train_data = df_updated[df_updated['Year'] < 2025]
train_data.head()

Unnamed: 0,Title,Runtime,Metascore,imdbRating,imdbVotes,BoxOffice,Rounded Rating,Good_Movie,BoxOffice_Inf_Adj,Year,...,Actor_Matt Damon,Actor_Tom Hanks,Actor_Michael Madsen,Actor_Dolph Lundgren,Actor_Woody Harrelson,Actor_Michael Shannon,Actor_Ethan Hawke,Actor_Anna Kendrick,Actor_Other,Success
0,The Other Side of the Wind,122,79.0,6.7,8188.0,0.0,7.0,0,0.0,2018,...,0,0,0,0,0,0,0,0,1,1
1,Grizzly II: Revenge,74,7.0,2.7,1801.0,0.0,3.0,0,0.0,2021,...,0,0,0,0,0,0,0,0,1,0
2,Mariette in Ecstasy,101,0.0,7.4,72.0,0.0,7.0,1,0.0,2019,...,0,0,0,0,0,0,0,0,1,0
3,Cooper and Hemingway: The True Gen,120,54.0,7.3,100.0,0.0,7.0,0,0.0,2013,...,0,0,0,0,0,0,0,0,1,0
4,Heaven & Hell,104,0.0,3.1,151.0,0.0,3.0,0,0.0,2018,...,0,0,0,0,0,0,0,0,1,0


In [30]:
# Save dataframe to a csv file
train_data.to_csv('train_data.csv', index=False)

In [31]:
# Create a separate dataframe for movies in the year of 2025
test_data = df_updated[df_updated['Year'] == 2025]
test_data.head()

Unnamed: 0,Title,Runtime,Metascore,imdbRating,imdbVotes,BoxOffice,Rounded Rating,Good_Movie,BoxOffice_Inf_Adj,Year,...,Actor_Matt Damon,Actor_Tom Hanks,Actor_Michael Madsen,Actor_Dolph Lundgren,Actor_Woody Harrelson,Actor_Michael Shannon,Actor_Ethan Hawke,Actor_Anna Kendrick,Actor_Other,Success
339,Flight Risk,91,38.0,5.5,5632.0,20907918.0,6.0,0,0.0,2025,...,0,0,0,0,0,0,0,0,0,0
524,Becoming Led Zeppelin,121,57.0,0.0,43.0,0.0,0.0,0,0.0,2025,...,0,0,0,0,0,0,0,0,1,1
829,Dog Man,89,66.0,6.5,6663.0,90317665.0,6.0,0,0.0,2025,...,0,0,0,0,0,0,0,0,1,1
1496,Mickey 17,139,75.0,7.3,2466.0,0.0,7.0,0,0.0,2025,...,0,0,0,0,0,0,0,0,1,1
2494,The Parenting,94,40.0,5.7,2016.0,0.0,6.0,0,0.0,2025,...,0,0,0,0,0,0,0,0,1,0


In [32]:
# Save dataframe to a csv file
test_data.to_csv('test_data.csv', index=False)

In [33]:
# Define features and target 
X_train = train_data.drop(columns=['Success', 'Title'])  
y_train = train_data['Success']
X_test = test_data.drop(columns=['Success', 'Title'])  
y_test = test_data['Success']

In [34]:
# Standardize numeric features
numeric_features = [
    'Runtime', 'Metascore', 'imdbRating', 'imdbVotes', 'BoxOffice', 
    'Rounded Rating', 'Good_Movie', 'BoxOffice_Inf_Adj', 'Year'
]
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

In [35]:
# Define Random Forest with constraints to prevent overfitting
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42
)

In [36]:
# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
print(f"Cross-validation Accuracy: {cross_val_scores.mean():.4f} ± {cross_val_scores.std():.4f}")

Cross-validation Accuracy: 0.9067 ± 0.0049


In [37]:
# Train model
model.fit(X_train, y_train)

In [38]:
# Evaluate on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))

Test Accuracy: 0.8226
              precision    recall  f1-score   support

           0       0.91      0.80      0.85        40
           1       0.70      0.86      0.78        22

    accuracy                           0.82        62
   macro avg       0.81      0.83      0.81        62
weighted avg       0.84      0.82      0.83        62



In [39]:
# Feature importance analysis
feature_importances = pd.DataFrame({'Feature': X_train.columns, 'Importance': model.feature_importances_})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
feature_importances.head(10)

Unnamed: 0,Feature,Importance
1,Metascore,0.338733
2,imdbRating,0.136921
3,imdbVotes,0.131218
5,Rounded Rating,0.092306
4,BoxOffice,0.06469
7,BoxOffice_Inf_Adj,0.053241
8,Year,0.043925
0,Runtime,0.041011
15,Documentary,0.022029
16,Drama,0.007454


In [40]:
# Linking predictions back to movie titles
results = pd.DataFrame({'Title': test_data['Title'], 'Predicted_Success': y_pred})
results.head()

Unnamed: 0,Title,Predicted_Success
339,Flight Risk,0
524,Becoming Led Zeppelin,0
829,Dog Man,1
1496,Mickey 17,1
2494,The Parenting,0
