In [1]:
%matplotlib inline

import os
import platform, sys
print(f"Python: {platform.platform()}, version: {sys.version[0:6]}")

import numpy as np
import pandas as pd
import seaborn as sns

print(f"numpy: {np.__version__}")
print(f"pandas: {pd.__version__}")
print(f"seaborn: {sns.__version__}")

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# plotting library
import matplotlib
import matplotlib.pyplot as plt 
print(f"matplotlib: {matplotlib.__version__}")

# ML-models library
import sklearn as sk
print(f"scikit-learn: {sk.__version__}")

# statistics library
import statsmodels
print(f"statsmodels: {statsmodels.__version__}")

import ast

Python: Windows-11-10.0.26100-SP0, version: 3.12.7
numpy: 1.26.4
pandas: 2.2.2
seaborn: 0.13.2
matplotlib: 3.9.2
scikit-learn: 1.5.1
statsmodels: 0.14.2


In [2]:
LANGUAGES = ['aa', 'ab', 'af', 'ak', 'am', 'an', 'ar', 'as', 'av', 'ay', 'az', 'ba', 'be', 'bg', 'bi', 'bm', 'bn', 'bo', 'br', 'bs', 'ca', 'ce', 'ch', 'cn', 'co', 'cr', 'cs', 'cv', 'cy', 'da', 'de', 'dv', 'dz', 'ee', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fj', 'fo', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'gv', 'ha', 'he', 'hi', 'ho', 'hr', 'ht', 'hu', 'hy', 'hz', 'ia', 'id', 'ie', 'ig', 'ik', 'io', 'is', 'it', 'iu', 'ja', 'jv', 'ka', 'kg', 'ki', 'kj', 'kk', 'kl', 'km', 'kn', 'ko', 'ks', 'ku', 'kv', 'kw', 'ky', 'la', 'lb', 'lg', 'li', 'ln', 'lo', 'lt', 'lv', 'mg', 'mh', 'mi', 'mk', 'ml', 'mn', 'mo', 'mr', 'ms', 'mt', 'my', 'nb', 'nd', 'ne', 'ng', 'nl', 'nn', 'no', 'nr', 'nv', 'ny', 'oc', 'oj', 'om', 'or', 'os', 'pa', 'pl', 'ps', 'pt', 'qu', 'rm', 'rn', 'ro', 'ru', 'rw', 'sa', 'sc', 'sd', 'se', 'sg', 'sh', 'si', 'sk', 'sl', 'sm', 'sn', 'so', 'sq', 'sr', 'ss', 'st', 'su', 'sv', 'sw', 'ta', 'te', 'tg', 'th', 'ti', 'tk', 'tl', 'tn', 'to', 'tr', 'ts', 'tt', 'tw', 'ty', 'ug', 'uk', 'ur', 'uz', 've', 'vi', 'wo', 'xh', 'xx', 'yi', 'yo', 'za', 'zh', 'zu']
GENRES = {'History', 'Action', 'Thriller', 'Science Fiction', 'Documentary', 'Western', 'Family', 'Drama', 'Fantasy', 'Comedy', 'Romance', 'TV Movie', 'Mystery', 'Adventure', 'Crime', 'Animation', 'Horror', 'Music', 'War'}


def load_csv_pd_data(path, filename):
    '''
    load_csv_pd_data() - construct a pandas DataFrame object 
           from data in a Excel csv-file `filename`,
           stored in a folder `path`.
    @returns: a pandas DataFrame
    '''
    csv_path = os.path.join(path, filename)
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"{csv_path} does not exist")

    return pd.read_csv(csv_path, sep=';')


def load_movies():
    '''
    load_movies() - construct a pandas DataFrame object of the movies
            with all the normal numeric fields formatted to be
            accepted by matplotlib, with a weighted average field for the votes,
            and with unused fields already removed.  
    @returns: a pandas DataFrame
    '''
    movies = load_csv_pd_data("input", "data_movies_clean.csv")

    # Format fields.
    movies['vote_average'] = movies['vote_average'].str.replace(',', '.').astype(float)
    movies['budget'] = movies['budget'].str.replace(',', '.').astype(float)
    movies['revenue'] = movies['revenue'].str.replace(',', '.').astype(float)
    movies['release_date'] = pd.to_datetime(movies['release_date'], format='mixed')

    # Drop unused fields.
    movies = movies.drop(['title', 'adult', 'popularity'], axis=1)

    # Merge vote_average and vote_count -> weighted_vote.
    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(0.80)

    v = movies['vote_count']
    R = movies['vote_average']

    movies['weighted_vote'] = np.where(
        (v > 0) & (R > 0), (v / (v + m)) * R + (m / (v + m)) * C, 0
    )

    # Convert non-number values through hashes or binary values.
    # Genres.
    genres_values = {genre: 1 << i for i, genre in enumerate(sorted(GENRES))}

    movies["genre_names"] = movies["genre_names"].apply(ast.literal_eval)
    movies["genre_numeric"] = movies["genre_names"].apply(lambda row: sum(genres_values[key] for key in row))

    # Countries.
    movies["origin_country"] = movies["origin_country"].apply(ast.literal_eval)
    movies["country_hash"] = movies["origin_country"].apply(lambda row: hash(tuple(sorted(row))))

    # Languages.
    movies["original_language_numeric"] = movies["original_language"].apply(lambda row: LANGUAGES.index(row))


    # Verwijder records waar runtime 0 of negatief is, of meer dan 300 minuten is.
    movies.drop(movies[(movies['runtime'] <= 1) | (movies['runtime'] >= 500)].index, inplace=True)

    # Verwijder records waar budget 1 of minder is.
    movies.drop(movies[(movies['budget'] <= 1)].index, inplace=True)

    # Je kan zien hoe revenue nog veel verschil heeft tussen de indie films, met outliers in de miljarden. 
    # De grafiek is hierom biased. Echter is dit onze target dus is het minder relevant voor ons onderzoek.

    # Dit gooit een heleboel records weg, van 750k naar 333k.
    movies.drop(movies[(movies['vote_count'] < 1)].index, inplace=True)

    return movies



movies = load_movies()
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 333686 entries, 85 to 944320
Data columns (total 15 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   id                         333686 non-null  int64         
 1   original_language          333686 non-null  object        
 2   origin_country             333686 non-null  object        
 3   release_date               333686 non-null  datetime64[ns]
 4   genre_names                333686 non-null  object        
 5   production_company_names   333686 non-null  object        
 6   budget                     333686 non-null  float64       
 7   revenue                    333686 non-null  float64       
 8   runtime                    333686 non-null  int64         
 9   vote_average               333686 non-null  float64       
 10  vote_count                 333686 non-null  int64         
 11  weighted_vote              333686 non-null  float64     

## Pearson correlation between all pairs
The Pearson correlation measures the relationship between numeric features, showing how closely each variable, like revenue, moves in relation to others. High positive or negative values indicate strong associations, while values near zero suggest little linear connection.

In [3]:
# Get all columns that are numeric (int or floats)
num_cols = movies.select_dtypes(include=['int64', 'float64'])
correlation = num_cols.corr()

correlation['revenue'].sort_values(ascending=False)
features = ["id", "budget", "runtime", "vote_count", "vote_average", "weighted_vote", "genre_numeric", "country_hash", "original_language_numeric" ]


## Test/Train split
The dataset is split 80/20 with most data being used to train

In [4]:
X = movies[features]
y = movies['revenue']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Length of train set: {len(X_train)}")
print(f"Length of test set: {len(X_test)}")


Length of train set: 266948
Length of test set: 66738


## Train the random forest regressor 

In [5]:
from sklearn.ensemble import RandomForestRegressor

random_forest = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)

random_forest.fit(X_train, y_train)

### Feature importance
Feature importance signals how much each feature contributes to a model’s predictions, highlighting which features have the most influence on the target. Higher values indicate greater predictive power.

In [6]:
feature_importance = pd.Series(random_forest.feature_importances_, index=X.columns).sort_values(ascending=False)
feature_importance

budget                       0.453813
vote_count                   0.342460
id                           0.052608
runtime                      0.042532
vote_average                 0.036199
weighted_vote                0.032649
genre_numeric                0.026397
original_language_numeric    0.006716
country_hash                 0.006626
dtype: float64

### Permutation importance
Permutation importance measures a feature’s effect on model performance by randomly shuffling its values and observing the drop in accuracy. Larger decreases indicate features that the model relies on more heavily for making predictions.

In [7]:
from sklearn.inspection import permutation_importance

perm = permutation_importance(random_forest, X_test, y_test, n_repeats=5, random_state=42, n_jobs=4)
permutation_feature_importance = pd.Series(perm.importances_mean, index=X.columns).sort_values(ascending=False)
print(permutation_feature_importance)


budget                       0.638642
vote_count                   0.494655
runtime                      0.057561
id                           0.054569
vote_average                 0.040255
genre_numeric                0.014626
original_language_numeric    0.009059
country_hash                 0.001016
weighted_vote               -0.003652
dtype: float64
