# Modeling for FlowMovieML

Implement an XGBoost model to predict the worldwide gross of a movie.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from joblib import dump
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from xgboost import XGBRegressor

In [2]:
# Read the cleaned dataset CSV into a DataFrame

df = pd.read_csv(filepath_or_buffer='../dataset/clean_dataset.csv')

In [3]:
# Separate the regression target from the feature columns, then hold out
# 40% of the rows as a test split (seeded so the partition is reproducible).

y = df['worldwide_gross']
X = df.drop(columns='worldwide_gross')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=36,
)

In [4]:
# Normalize the dataset
#
# Standardize the features with statistics learned from the training split
# only; the test split is transformed with those same statistics.
# StandardScaler is imported in the notebook's top import cell.
#
# NOTE: this cell overwrites X_train / X_test with their scaled versions, so
# re-run the split cell before re-running this one; otherwise the scaler is
# re-fit on data that has already been scaled.

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
# Pipeline: fill missing values with the column mean, then fit an XGBoost
# regressor. The pipeline is evaluated with 10-fold cross-validation, keeping
# the training-fold scores alongside the validation-fold scores.

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
regressor = XGBRegressor()

model = Pipeline(steps=[
    ('imputer', imputer),
    ('core_model', regressor),
])

results = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=10,
    return_train_score=True,
)

In [6]:
# Results
#
# Show the cross-validation output as a table (one row per fold, one column
# per timing/score array) instead of printing the raw dict of arrays.

pd.DataFrame(results)

In [8]:
# Average the per-fold scores returned by cross-validation
train_score = np.mean(results['train_score'])
test_score = np.mean(results['test_score'])

# Report the scores before checking them, so the values are still visible
# when one of the checks below fails.
print('Train Score:', train_score)
print('Test Score:', test_score)

# Both mean scores must clear the 0.6 threshold for the notebook to continue
assert train_score > 0.6, f'mean train score {train_score:.4f} is not above 0.6'
assert test_score > 0.6, f'mean test score {test_score:.4f} is not above 0.6'

In [10]:
# Fit the pipeline on the whole training split
model.fit(X=X_train, y=y_train)

In [11]:
# Score the fitted pipeline on the held-out test split
model.score(X=X_test, y=y_test)

In [13]:
# Save the model
#
# `dump` and `Path` are imported in the notebook's top import cell.
# Create the output directory first so the dump does not fail when the
# directory is missing (e.g. on a fresh checkout).

Path('../model').mkdir(parents=True, exist_ok=True)

dump(model, '../model/model.pkl')

In [18]:
# Persist the fitted scaler in the same directory as the model

dump(value=scaler, filename='../model/scaler.pkl')