python version 3.13.9

In [None]:
%matplotlib inline

import os
import platform, sys
# Report the OS platform and the interpreter version.
# platform.python_version() is used instead of slicing sys.version, because a
# fixed-width slice truncates versions with a two-digit patch level
# (e.g. "3.13.10" would be shown as "3.13.1").
print(f"Python: {platform.platform()}, version: {platform.python_version()}")

import numpy as np
import pandas as pd
import seaborn as sns

print(f"numpy: {np.__version__}")
print(f"pandas: {pd.__version__}")
print(f"seaborn: {sns.__version__}")

from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

# plotting library
import matplotlib
import matplotlib.pyplot as plt 
print(f"matplotlib: {matplotlib.__version__}")

# ML-models library
import sklearn as sk
print(f"scikit-learn: {sk.__version__}")

# statistics library
import statsmodels
print(f"statsmodels: {statsmodels.__version__}")

import ast
import pickle

Python: Windows-11-10.0.26100-SP0, version: 3.13.9
numpy: 2.3.5
pandas: 2.3.3
seaborn: 0.13.2
matplotlib: 3.10.7
scikit-learn: 1.8.0
statsmodels: 0.14.6


## 1. Laad de data

In [None]:
# Two-letter language codes. load_movies() encodes `original_language` as the
# position of the code in this list, so the ORDER of the entries matters:
# reordering or inserting codes changes `original_language_numeric`.
# NOTE(review): looks like ISO 639-1 plus a few non-standard codes ('cn', 'xx') - confirm against the dataset.
LANGUAGES = ['aa', 'ab', 'af', 'ak', 'am', 'an', 'ar', 'as', 'av', 'ay', 'az', 'ba', 'be', 'bg', 'bi', 'bm', 'bn', 'bo', 'br', 'bs', 'ca', 'ce', 'ch', 'cn', 'co', 'cr', 'cs', 'cv', 'cy', 'da', 'de', 'dv', 'dz', 'ee', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fj', 'fo', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'gv', 'ha', 'he', 'hi', 'ho', 'hr', 'ht', 'hu', 'hy', 'hz', 'ia', 'id', 'ie', 'ig', 'ik', 'io', 'is', 'it', 'iu', 'ja', 'jv', 'ka', 'kg', 'ki', 'kj', 'kk', 'kl', 'km', 'kn', 'ko', 'ks', 'ku', 'kv', 'kw', 'ky', 'la', 'lb', 'lg', 'li', 'ln', 'lo', 'lt', 'lv', 'mg', 'mh', 'mi', 'mk', 'ml', 'mn', 'mo', 'mr', 'ms', 'mt', 'my', 'nb', 'nd', 'ne', 'ng', 'nl', 'nn', 'no', 'nr', 'nv', 'ny', 'oc', 'oj', 'om', 'or', 'os', 'pa', 'pl', 'ps', 'pt', 'qu', 'rm', 'rn', 'ro', 'ru', 'rw', 'sa', 'sc', 'sd', 'se', 'sg', 'sh', 'si', 'sk', 'sl', 'sm', 'sn', 'so', 'sq', 'sr', 'ss', 'st', 'su', 'sv', 'sw', 'ta', 'te', 'tg', 'th', 'ti', 'tk', 'tl', 'tn', 'to', 'tr', 'ts', 'tt', 'tw', 'ty', 'ug', 'uk', 'ur', 'uz', 've', 'vi', 'wo', 'xh', 'xx', 'yi', 'yo', 'za', 'zh', 'zu']
# Genre names. load_movies() sorts this set and assigns each genre one bit,
# so a movie's list of genres can be stored as a single integer bitmask.
GENRES = {'History', 'Action', 'Thriller', 'Science Fiction', 'Documentary', 'Western', 'Family', 'Drama', 'Fantasy', 'Comedy', 'Romance', 'TV Movie', 'Mystery', 'Adventure', 'Crime', 'Animation', 'Horror', 'Music', 'War'}


def load_csv_pd_data(path, filename, sep=';'):
    '''
    load_csv_pd_data() - read the csv-file `filename` located in the
           folder `path` into a pandas DataFrame.
    @param path: folder that contains the file
    @param filename: name of the csv-file inside `path`
    @param sep: column separator used in the file (default ';')
    @returns: a pandas DataFrame
    @raises FileNotFoundError: when the file does not exist
    '''
    full_path = os.path.join(path, filename)

    # Happy path first: the file is there, so hand it straight to pandas.
    if os.path.exists(full_path):
        return pd.read_csv(full_path, sep=sep)

    raise FileNotFoundError(f"{full_path} does not exist")


def _stable_country_code(countries):
    '''
    _stable_country_code() - map a collection of country codes to an integer
            that is the same in every Python process.
            The built-in hash() is salted per process for strings, so it would
            yield a different feature value after every kernel restart.
            The order of `countries` does not matter. Distinct combinations
            can in principle collide on the same 32-bit value.
    @param countries: iterable of country codes
    @returns: an unsigned 32-bit integer (0 for an empty collection)
    '''
    # Local import: zlib is only needed here and is not part of the import cell.
    import zlib

    joined = "|".join(map(str, sorted(countries)))
    return zlib.crc32(joined.encode("utf-8"))


def load_movies():
    '''
    load_movies() - construct a pandas DataFrame object of the movies.
            If output/movies_cleaned.csv exists, that file is returned as read
            (comma separated) and nothing below is executed.
            Otherwise input/data_movies_clean.csv is loaded and cleaned:
            - vote_average, budget and revenue are parsed from decimal-comma
              text to float, release_date is parsed to datetime;
            - title, adult and popularity are dropped;
            - weighted_vote combines vote_average and vote_count;
            - genre_numeric, country_hash and original_language_numeric are
              integer encodings of the corresponding text fields;
            - records with an implausible runtime, budget or no votes are removed.
    @returns: a pandas DataFrame
    @raises FileNotFoundError: when the input csv-file does not exist
    @raises ValueError: when original_language holds a code not in LANGUAGES
    @raises KeyError: when genre_names holds a genre not in GENRES
    '''
    # First check if the output csv already exists.
    # If it does, use that data instead.
    # NOTE(review): this function never writes that file itself; it is assumed
    # to be produced elsewhere. Columns that are lists or datetimes on the
    # fresh path come back as plain text when read from this file.
    output_path = os.path.join("output", "movies_cleaned.csv")
    if os.path.exists(output_path):
        return load_csv_pd_data("output", "movies_cleaned.csv", ",")

    movies = load_csv_pd_data("input", "data_movies_clean.csv")

    # Format fields: these columns use a decimal comma in the source file.
    for column in ('vote_average', 'budget', 'revenue'):
        movies[column] = movies[column].str.replace(',', '.').astype(float)
    movies['release_date'] = pd.to_datetime(movies['release_date'], format='mixed')

    # Drop unused fields.
    movies = movies.drop(['title', 'adult', 'popularity'], axis=1)

    # Merge vote_average and vote_count -> weighted_vote.
    # The average is pulled towards the overall mean; the fewer votes a movie
    # has relative to `min_votes`, the stronger the pull.
    # Both statistics are computed before any records are removed below.
    mean_vote = movies['vote_average'].mean()
    min_votes = movies['vote_count'].quantile(0.80)

    vote_count = movies['vote_count']
    vote_average = movies['vote_average']
    total_weight = vote_count + min_votes

    # Movies without votes or with a zero average get weighted_vote 0.
    movies['weighted_vote'] = np.where(
        (vote_count > 0) & (vote_average > 0),
        (vote_count / total_weight) * vote_average + (min_votes / total_weight) * mean_vote,
        0,
    )

    # Convert non-number values to integer codes.
    # Genres: one bit per genre (sorted for a stable assignment), combined
    # into a bitmask. set() guards against a genre listed twice in one record,
    # which would otherwise carry over into another genre's bit.
    genres_values = {genre: 1 << i for i, genre in enumerate(sorted(GENRES))}

    movies["genre_names"] = movies["genre_names"].apply(ast.literal_eval)
    movies["genre_numeric"] = movies["genre_names"].apply(
        lambda names: sum(genres_values[name] for name in set(names))
    )

    # Countries: process-independent code of the sorted country list.
    movies["origin_country"] = movies["origin_country"].apply(ast.literal_eval)
    movies["country_hash"] = movies["origin_country"].apply(_stable_country_code)

    # Languages: position of the code in LANGUAGES, via a dict lookup instead
    # of a linear list search per record.
    language_index = {code: i for i, code in enumerate(LANGUAGES)}
    language_numeric = movies["original_language"].map(language_index)
    unknown = movies.loc[language_numeric.isna(), "original_language"].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unknown original_language codes: {sorted(map(str, unknown))}")
    movies["original_language_numeric"] = language_numeric.astype("int64")

    # Remove records that are unusable for the analysis:
    # - runtime of 1 minute or less, or of 500 minutes or more;
    # - budget of 1 or less;
    # - no votes at all (this discards a lot of records, from 750k to 333k).
    # Revenue still varies widely between indie films and outliers in the
    # billions, which biases plots; as it is the target, it is left untouched.
    # The mask is written as "not dropped" so records with missing values are
    # kept, exactly as with DataFrame.drop on the matching rows.
    drop_mask = (
        (movies['runtime'] <= 1)
        | (movies['runtime'] >= 500)
        | (movies['budget'] <= 1)
        | (movies['vote_count'] < 1)
    )
    movies = movies.loc[~drop_mask].copy()

    return movies



# Load the cleaned movies (from the cached csv when present) and print the
# column names, non-null counts and dtypes as a quick sanity check.
movies = load_movies()
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333686 entries, 0 to 333685
Data columns (total 15 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   id                         333686 non-null  int64  
 1   original_language          333686 non-null  object 
 2   origin_country             333686 non-null  object 
 3   release_date               333686 non-null  object 
 4   genre_names                333686 non-null  object 
 5   production_company_names   333686 non-null  object 
 6   budget                     333686 non-null  float64
 7   revenue                    333686 non-null  float64
 8   runtime                    333686 non-null  int64  
 9   vote_average               333686 non-null  float64
 10  vote_count                 333686 non-null  int64  
 11  weighted_vote              333686 non-null  float64
 12  genre_numeric              333686 non-null  int64  
 13  country_hash               33

## 2. Random Forest

Een Random Forest is een groep van meerdere beslissingsbomen die gezamenlijk een voorspelling doen. Het model bouwt meerdere bomen en maakt de uiteindelijke voorspelling op basis van het gemiddelde van alle bomen, wat helpt om overfitting te voorkomen en de nauwkeurigheid te verbeteren.

Wij gebruiken de Random Forest vooral om de feature importance te vinden. Hier hopen wij uit te krijgen welke features het meest relevant zijn, en welke genegeerd kunnen worden.

## 3. Pearson correlatie
De Pearson-correlatie meet de lineaire relatie tussen numerieke features en laat zien hoe nauw een variabele, zoals revenue, samenhangt met de andere features. Hoge positieve of negatieve waarden duiden op sterke verbanden, terwijl waarden dicht bij nul wijzen op weinig lineaire samenhang.


In [None]:
# Keep only the int64/float64 columns; text and list columns cannot be correlated.
num_cols = movies.select_dtypes(include=['int64', 'float64'])
# Pairwise correlation matrix (DataFrame.corr() defaults to Pearson).
correlation = num_cols.corr()

# Correlation of every numeric column with the target, strongest positive first.
correlation['revenue'].sort_values(ascending=False)

revenue                      1.000000
vote_count                   0.701763
budget                       0.644990
weighted_vote                0.142028
vote_average                 0.077651
runtime                      0.059932
country_hash                 0.023246
genre_numeric                0.010243
original_language_numeric   -0.001429
id                          -0.005083
Name: revenue, dtype: float64

## 4. Test/Train split

We splitsen de gegevens op in trainings- en testsets (80/20 split), zodat we in de volgende stap een Random Forest-regressiemodel kunnen trainen en evalueren.

Voor dit model richten we ons op numerieke kenmerken; een Random Forest vereist geen lineaire relatie tussen de features en de target. Omdat wij hopen de nuttige features te vinden, zullen wij ze allemaal testen, ook de features waarvan men zou denken dat ze niet relevant zijn, zoals id.

In [None]:
# All numeric candidate features, deliberately including ones expected to be
# irrelevant (such as id) so their importance can be measured.
features = ["id", "budget", "runtime", "vote_count", "vote_average", "weighted_vote", "genre_numeric", "country_hash", "original_language_numeric" ]

X = movies[features]
y = movies['revenue']  # target

# 80/20 split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Length of train set: {len(X_train)}")
print(f"Length of test set: {len(X_test)}")

Length of train set: 266948
Length of test set: 66738


## 5. Train de regressor 

In deze stap maken we een Random Forest Regressor en trainen we het model met de volgende hyperparameters:
- `n_estimators=200`: Het aantal bomen in het bos. 
- `max_depth=None`: Dit betekent dat de bomen niet beperkt worden in diepte, wat ze in staat stelt om volledig te groeien en complexe patronen te leren.
- `n_jobs=-1`: Dit zorgt ervoor dat het model gebruik maakt van alle beschikbare CPU-kernen om de training te versnellen.
- `random_state=42`: Dit zorgt ervoor dat de willekeurige processen in het model elke keer hetzelfde zijn voor reproductie van de resultaten.

We trainen het model op de trainingsdata (X_train en y_train), zodat het kan leren van de data.

In [None]:
# Random forest with 200 unrestricted-depth trees, trained on all CPU cores;
# the fixed random_state makes the fitted forest reproducible.
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)

# fit() returns the estimator, so this last expression also displays its parameters.
model.fit(X_train, y_train)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",200
,"criterion  criterion: {""squared_error"", ""absolute_error"", ""friedman_mse"", ""poisson""}, default=""squared_error"" The function to measure the quality of a split. Supported criteria are ""squared_error"" for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss using the mean of each terminal node, ""friedman_mse"", which uses mean squared error with Friedman's improvement score for potential splits, ""absolute_error"" for the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and ""poisson"" which uses reduction in Poisson deviance to find splits. Training using ""absolute_error"" is significantly slower than when using ""squared_error"". .. versionadded:: 0.18  Mean Absolute Error (MAE) criterion. .. versionadded:: 1.0  Poisson criterion.",'squared_error'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=1.0 The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None or 1.0, then `max_features=n_features`. .. note::  The default of 1.0 is equivalent to bagged trees and more  randomness can be achieved by setting smaller values, e.g. 0.3. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to 1.0. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",1.0
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


## 6A. Feature importance

De feature importance geeft aan hoeveel elke feature bijdraagt aan de voorspellingen van een model, en laat zien welke features de grootste invloed hebben op het doel. Hogere waarden duiden op een grotere voorspellende kracht.

In [None]:
# Importances of the fitted forest, labelled with the feature names
# and sorted from most to least important.
feature_importance = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
print(feature_importance)

budget                       0.453768
vote_count                   0.342372
id                           0.052566
runtime                      0.042978
vote_average                 0.035934
weighted_vote                0.032545
genre_numeric                0.026140
country_hash                 0.006869
original_language_numeric    0.006828
dtype: float64


## 6B. Permutation importance

Permutation importance meet het effect van een feature op de modelprestaties door de waarden ervan willekeurig te herschikken en de verandering in nauwkeurigheid te observeren. Grotere afnames duiden op kenmerken waarop het model sterker vertrouwt bij het maken van voorspellingen.


In [None]:
# `permutation_importance` comes from the notebook's import cell, so that all
# dependencies are declared in one place at the top.
# Each feature is shuffled 5 times on the held-out test set; the mean drop in
# the model's score per feature is kept, sorted from largest to smallest.
perm = permutation_importance(model, X_test, y_test, n_repeats=5, random_state=42, n_jobs=4)
permutation_feature_importance = pd.Series(perm.importances_mean, index=X.columns).sort_values(ascending=False)

print(permutation_feature_importance)

budget                       0.638642
vote_count                   0.494655
runtime                      0.057561
id                           0.054569
vote_average                 0.040255
genre_numeric                0.014626
original_language_numeric    0.009059
country_hash                 0.001016
weighted_vote               -0.003652
dtype: float64


## 7. Sla het model op

In deze stap slaan we het getrainde model op in een bestand met de naam `random_forest_model.pkl`. Dit zorgt ervoor dat we het model later opnieuw kunnen gebruiken zonder het opnieuw te hoeven trainen, wat tijd en rekenkracht bespaart.

In [None]:
# Make sure the target folder exists, then serialize the fitted model.
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file back from a trusted location.
os.makedirs("output/models/", exist_ok=True)
with open('output/models/random_forest_model.pkl', 'wb') as f:
    pickle.dump(model, f)

## 8. Conclusie

De Random Forest heeft goed gewerkt bij het vinden van de relevante features. Deze features zijn budget, vote_count, id*, runtime en vote_average.
Deze features zullen gebruikt worden bij de andere modellen.

*Id zal echter niet worden meegenomen. Het model dacht dat dit relevant was, maar real-world ervaring toont dat dit niet het geval is.
