# Modeling for FlowMovieML

Implement an XGBoost model to predict the worldwide gross of a movie.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler

from xgboost import XGBRegressor

In [2]:
# Load the cleaned dataset produced upstream into a DataFrame.

df = pd.read_csv(filepath_or_buffer='../dataset/clean_dataset.csv')

In [3]:
# Separate the target column from the features, then hold out 40% of the rows
# as a test split (fixed random_state so the split is reproducible).

target_column = 'worldwide_gross'
y = df[target_column]
X = df.drop(columns=[target_column])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=36
)

In [4]:
# Normalize the dataset
#
# The scaler is fitted on the training split only and then applied, unchanged,
# to the test split, so no test-set statistics leak into the features.
# X_train / X_test are rebound to the scaled output from here on; the fitted
# `scaler` object is kept because it is persisted at the end of the notebook.

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
# Pipeline: mean-impute missing values, then fit the XGBoost regressor.

pipeline_steps = [
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', XGBRegressor()),
]
model = Pipeline(steps=pipeline_steps)

# 10-fold cross-validation on the training split; train scores are kept as
# well so the train/validation gap can be inspected afterwards.
results = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=10,
    return_train_score=True,
)

In [6]:
# Results: dump the raw cross_validate output as a plain dict -- per-fold
# fit/score times plus the per-fold 'train_score' and 'test_score' arrays.

print(results)

{'fit_time': array([0.05416131, 0.0451839 , 0.0578258 , 0.04241371, 0.04804778,
       0.08524489, 0.08053565, 0.04709196, 0.04548883, 0.04931593]), 'score_time': array([0.00103378, 0.00064802, 0.00070214, 0.00069094, 0.00077224,
       0.0027442 , 0.00088239, 0.00056291, 0.00056314, 0.00052595]), 'test_score': array([0.90683573, 0.93228614, 0.90016562, 0.93130755, 0.89380807,
       0.86274135, 0.72076213, 0.85153157, 0.90454036, 0.90079343]), 'train_score': array([0.99984723, 0.99978495, 0.99976867, 0.99982655, 0.99980801,
       0.99981391, 0.99990463, 0.99974018, 0.99980903, 0.99983758])}


In [8]:
# Average the per-fold cross-validation scores (the estimator's default
# scorer is used, which is R^2 for scikit-learn style regressors).
train_score = np.mean(results['train_score'])
test_score = np.mean(results['test_score'])

# Report first, so the numbers are visible even when a check below fails.
print('Train Score:', train_score)
print('Test Score:', test_score)

# Sanity thresholds: fail loudly, with the offending value, if the model
# underperforms on either the training folds or the held-out folds.
assert train_score > 0.6, f'Mean CV train score too low: {train_score:.4f} (expected > 0.6)'
assert test_score > 0.6, f'Mean CV validation score too low: {test_score:.4f} (expected > 0.6)'

Train Score: 0.9998140752315521
Test Score: 0.8804771959781647


In [10]:
# Fit the full pipeline (imputer + regressor) on the entire training split.
model.fit(X=X_train, y=y_train)

In [11]:
# Evaluate the fitted pipeline on the held-out test split.
model.score(X=X_test, y=y_test)

0.8623724579811096

In [13]:
# Save the model
#
# The output directory is created on demand so the notebook also runs on a
# fresh checkout where '../model/' does not exist yet (dump would otherwise
# raise FileNotFoundError).

model_path = '../model/model.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

dump(model, model_path)

['../model/model.pkl']

In [18]:
# Save the scaler
#
# The fitted scaler is persisted separately from the model pipeline, so any
# consumer must apply it to raw features before calling the model.
# The output directory is created on demand so this cell does not depend on
# '../model/' already existing.

scaler_path = '../model/scaler.pkl'
Path(scaler_path).parent.mkdir(parents=True, exist_ok=True)

dump(scaler, scaler_path)

['../model/scaler.pkl']