In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Pretty display for notebooks
%matplotlib inline
from pprint import pprint

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.model_selection import train_test_split, learning_curve, cross_validate, GridSearchCV
from sklearn.metrics import mean_squared_log_error, make_scorer

from sklearn.preprocessing import RobustScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import f_regression, SelectKBest

#from common.CommonFunctions import train_and_eval

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
flo_columns = [
    'id',
    'Age',
    'hasStar',
    'NumStar',
    'hashomepage',
    'isinCollection',
    'Action',
    'Family',
    'Adventure',
    'History',
    'Drama',
    'Science Fiction',
    'Romance',
    'Fantasy',
    'Horror',
    'Animation',
    'numberGenres']

data = pd.read_csv('../data/train.csv')
flo_features = pd.read_csv('./exported_features/all_features.csv')[flo_columns]
marcel_keywords = pd.read_csv('../data/marcel_features.csv').drop('Unnamed: 0', axis=1)

data['popularity'] = np.log1p(data.popularity)
data['budget'] = np.log1p(data.budget)
data = data.merge(marcel_keywords, on=['id'], how='left').merge(flo_features, on=['id'], how='left')
data.runtime = data.runtime.fillna(np.mean(data.runtime))

features = list(flo_features.columns) + ['popularity', 'budget', 'has_top_keyword', 'runtime']
features.remove('id')

features_df = data[features]
target = data.revenue

X_train, X_test, y_train, y_test = train_test_split(features_df, target, test_size=0.2, random_state=42)

In [3]:
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_trainPoly = poly.fit_transform(X_train)
#poly.get_feature_names()

In [28]:
test = SelectKBest(score_func=f_regression, k=10)
fit = test.fit(X_trainPoly, np.log1p(y_train))
cols = test.get_support(indices=True)
cols

  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


numpy.ndarray

In [18]:
gbr = GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='ls', max_depth=5, max_features=None,
              max_leaf_nodes=None, min_impurity_decrease=0.0,
              min_impurity_split=None, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=400, n_iter_no_change=None, presort='auto',
              random_state=None, subsample=1.0, tol=0.0001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [6]:
gbr.fit(X_trainPoly, np.log1p(y_train))

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=400, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [38]:
def train_and_eval(model, X_train, y_train, scoring):
    cv_results = cross_validate(model, 
                                X_train, 
                                np.log1p(y_train), 
                                cv=10, 
                                scoring=scoring, 
                                return_train_score=True)
    
    mean_train = np.mean(cv_results['train_score'])
    mean_cv = np.mean(cv_results['test_score'])
    std_train = np.std(cv_results['train_score'])
    std_cv = np.std(cv_results['test_score'])

    result_summary = {
        'model': model,
        'mean_train_score': mean_train,
        'mean_cv_score': mean_cv,
        'std_train_score': std_train,
        'std_cv_score': std_cv
    }
    
    return result_summary

def my_score(y_true, y_pred):
    '''
    Compute the RMSLE on the actual target values, not the log target
    '''
    if any(np.expm1(y_pred) < 0):
        return np.Infinity
    else:
        return np.sqrt(mean_squared_log_error(np.expm1(y_true), np.expm1(y_pred)))
    
scoring_fnc = make_scorer(my_score)

In [13]:
train_and_eval(gbr, X_trainPoly, y_train, scoring_fnc)

{'model': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='ls', max_depth=5, max_features=None,
              max_leaf_nodes=None, min_impurity_decrease=0.0,
              min_impurity_split=None, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=400, n_iter_no_change=None, presort='auto',
              random_state=None, subsample=1.0, tol=0.0001,
              validation_fraction=0.1, verbose=0, warm_start=False),
 'mean_train_score': 1.469543247660877,
 'mean_cv_score': 2.1878670019032027,
 'std_train_score': 0.03974074562636286,
 'std_cv_score': 0.1233845614224436}

In [35]:
train_and_eval(gbr, X_trainPoly[:,cols], y_train, scoring_fnc)

{'model': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='ls', max_depth=5, max_features=None,
              max_leaf_nodes=None, min_impurity_decrease=0.0,
              min_impurity_split=None, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=400, n_iter_no_change=None, presort='auto',
              random_state=None, subsample=1.0, tol=0.0001,
              validation_fraction=0.1, verbose=0, warm_start=False),
 'mean_train_score': 1.7346179224467193,
 'mean_cv_score': 2.3093188619619056,
 'std_train_score': 0.03286869507301123,
 'std_cv_score': 0.10146460111778262}

In [34]:
X_trainPoly[:,cols]

(2400, 10)

In [39]:
train_and_eval(gbr, X_train, y_train, scoring_fnc)

{'model': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='ls', max_depth=5, max_features=None,
              max_leaf_nodes=None, min_impurity_decrease=0.0,
              min_impurity_split=None, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=400, n_iter_no_change=None, presort='auto',
              random_state=None, subsample=1.0, tol=0.0001,
              validation_fraction=0.1, verbose=0, warm_start=False),
 'mean_train_score': 1.5902234888754823,
 'mean_cv_score': 2.11629460843733,
 'std_train_score': 0.024670699631280078,
 'std_cv_score': 0.2260639436159667}

In [None]:
data exploration categorical vs numeric