In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Normalizer

In [36]:
import warnings
warnings.filterwarnings("ignore")

In [37]:
data = pd.read_csv('../data/processed_dataset.csv')

In [38]:
data.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'production_companies', 'production_countries',
       'release_date', 'runtime', 'spoken_languages', 'status', 'tagline',
       'title', 'Keywords', 'cast', 'crew', 'revenue', 'collection_name',
       'has_collection', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History',
       'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie',
       'Thriller', 'War', 'Western', 'is_english', 'number_of_languages',
       'keyword_text', 'number_of_keywords', 'cast_size', 'crew_size',
       'has_homepage', 'budget_log', 'popularity_log', 'release_date_year',
       'release_date_weekday', 'release_date_month', 'release_date_weekofyear',
       'release_date_day', 'release_date_quarter'],
      dtype='object')

In [39]:
data = data[['budget','runtime','has_collection', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History',
       'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie',
       'Thriller', 'War', 'Western', 'is_english', 'number_of_languages', 'number_of_keywords', 'cast_size', 'crew_size',
       'has_homepage', 'budget_log', 'popularity_log','revenue']]

In [40]:
data['has_collection'] = data['has_collection'].fillna(0)

In [14]:
data  = data[data['budget'] > 0]

### And we need to scale the value for 
    - budget
    - runtime
    - number_of_languages
    - number_of_keywords
    - cast_size 
    - crew_size
    - budget_log 
    - popularity_log
    and other values are binary values so, binary feature we don't need to scale

In [41]:
non_binary_feature = ['budget','runtime', 'number_of_keywords',
                      'cast_size', 'crew_size','has_homepage', 'budget_log', 'popularity_log']
binary_feature = ['number_of_languages','has_collection', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
                  'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History',
                  'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie',
                  'Thriller', 'War', 'Western', 'is_english']

In [42]:
feature_1 = data[non_binary_feature].values
feature_2 = data[binary_feature].values

In [43]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_squared_log_error

In [44]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import GridSearchCV

In [45]:
feature = np.concatenate((feature_1,feature_2),axis=1)
label = np.log1p(data.revenue.values)

In [46]:
X_train, X_test, y_train, y_test = train_test_split(feature, label,random_state=24)

### Updated_feature

In [22]:
linear = LinearRegression()
linear.fit(X_train,y_train)

pred_linner = linear.predict(X_test)
print(pred_linner[:10])

[17.2776351  18.73135364 21.19109817 19.8370173  15.78764338 17.88428864
 18.01789614 18.54923087 16.76439857 18.19831119]


In [23]:
mean_squared_error(y_test, pred_linner)

3.451557704210658

In [24]:
mean_squared_log_error(y_test,pred_linner)

0.030942555390532304


## linear model

In [232]:
linear = LinearRegression()
linear.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [233]:
pred_linner = linear.predict(X_test)
print(pred_linner[:10])

[15.35279163 14.85232632 13.4544283  17.49679307 15.1431466  22.59744295
 16.91196859 13.18847791 17.02895059 18.44335373]


In [234]:
#here pred values are negative so we can't use mean_squared_log_error, we are using mean_squared_error

In [235]:
mean_squared_error(y_test, pred_linner)

5.207282162232285

In [236]:
mean_squared_log_error(y_test,pred_linner)

0.04344660960734203

### svm model

In [237]:
from sklearn.model_selection import GridSearchCV

In [259]:
model = SVR(C=10,kernel='linear')
parameters = {'kernel':['linear', 'poly', 'rbf'],
             'degree':[2,3],
             'C':[1,10,20]}

In [260]:
clf = GridSearchCV(model, parameters, cv=5, n_jobs=-1)

In [None]:
model.fit(X_train,y_train)
pred_svm = model.predict(X_test)

In [None]:
msle = mean_squared_log_error(y_test,pred_svm)
mse = mean_squared_error(y_test,pred_svm)
print(f"MSLE is : {msle}\nMSE is : {mse}")

### random forest

### updated_feature

In [28]:
model = RandomForestRegressor()
parameters = {'n_estimators':[10,20,30,40,50],
              'min_samples_leaf' : [5, 8,10],
              'max_depth' :[2,3,4,5]
         }
rf= GridSearchCV(model,parameters)

In [29]:
rf.fit(X_train,y_train)
pred_rf = rf.predict(X_test)

In [30]:
msle = mean_squared_log_error(y_test,pred_rf)
mse = mean_squared_error(y_test,pred_rf)
print(f"MSLE is : {msle}\nMSE is : {mse}")

MSLE is : 0.03212433660681026
MSE is : 3.598188776533107


In [246]:
model = RandomForestRegressor()
parameters = {'n_estimators':[10,20,30,40,50],
              'min_samples_leaf' : [5, 8,10],
              'max_depth' :[2,3,4,5]
         }
rf= GridSearchCV(model,parameters)

In [247]:
rf.fit(X_train,y_train)
pred_rf = rf.predict(X_test)

In [248]:
msle = mean_squared_log_error(y_test,pred_rf)
mse = mean_squared_error(y_test,pred_rf)
print(f"MSLE is : {msle}\nMSE is : {mse}")

MSLE is : 0.04124708459114953
MSE is : 4.852466563970675


In [31]:
msle = mean_squared_log_error(y_test,pred_rf)
mse = mean_squared_error(y_test,pred_rf)
print(f"MSLE is : {msle}\nMSE is : {mse}")

MSLE is : 0.03212433660681026
MSE is : 3.598188776533107


In [32]:
# tea pot

In [33]:
from tpot import TPOTRegressor

In [34]:
regressor_config_dict = {

    'sklearn.ensemble.ExtraTreesRegressor': {
        'n_estimators': [100],
        'max_features': np.arange(0.05, 1.01, 0.05),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21),
        'bootstrap': [True, False]
    },

    'sklearn.ensemble.GradientBoostingRegressor': {
        'n_estimators': [100],
        'loss': ["ls", "lad", "huber", "quantile"],
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        'max_depth': range(1, 11),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21),
        'subsample': np.arange(0.05, 1.01, 0.05),
        'max_features': np.arange(0.05, 1.01, 0.05),
        'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99]
    },

    'sklearn.ensemble.AdaBoostRegressor': {
        'n_estimators': [100],
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        'loss': ["linear", "square", "exponential"]
    },

    'sklearn.tree.DecisionTreeRegressor': {
        'max_depth': range(1, 11),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21)
    },

    'sklearn.neighbors.KNeighborsRegressor': {
        'n_neighbors': range(1, 101),
        'weights': ["uniform", "distance"],
        'p': [1, 2]
    },

    'sklearn.linear_model.LassoLarsCV': {
        'normalize': [True, False]
    },

    'sklearn.svm.LinearSVR': {
        'loss': ["epsilon_insensitive", "squared_epsilon_insensitive"],
        'dual': [True, False],
        'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
        'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.],
        'epsilon': [1e-4, 1e-3, 1e-2, 1e-1, 1.]
    },

    'sklearn.ensemble.RandomForestRegressor': {
        'n_estimators': [100],
        'max_features': np.arange(0.05, 1.01, 0.05),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21),
        'bootstrap': [True, False]
    },

    'xgboost.XGBRegressor': {
        'n_estimators': [100],
        'max_depth': range(1, 11),
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        'subsample': np.arange(0.05, 1.01, 0.05),
        'min_child_weight': range(1, 21),
        'nthread': [1],
        'objective': ['reg:squarederror']
    }
}


In [49]:
model = TPOTRegressor(generations=50, population_size=20, config_dict= regressor_config_dict)

In [50]:
model.fit(X_train,y_train)
pred = model.predict(X_test)

msle = mean_squared_log_error(y_test,pred)
mse = mean_squared_error(y_test,pred)
print(f"MSLE is : {msle}\nMSE is : {mse}")

MSLE is : 0.03901428074676565
MSE is : 4.61548006804557


In [51]:
model.export('test_logirized_2.py')

In [120]:
def prepare(df):
    global json_cols
    global train_dict

    df[['release_month','release_day','release_year']]=df['release_date'].str.split('/',expand=True).replace(np.nan, 0).astype(int)
    df['release_year'] = df['release_year']
    df.loc[ (df['release_year'] <= 19) & (df['release_year'] < 100), "release_year"] += 2000
    df.loc[ (df['release_year'] > 19)  & (df['release_year'] < 100), "release_year"] += 1900
    
    releaseDate = pd.to_datetime(df['release_date']) 
    df['release_dayofweek'] = releaseDate.dt.dayofweek 
    df['release_quarter'] = releaseDate.dt.quarter     
    
    df['originalBudget'] = df['budget']
    df['inflationBudget'] = df['budget'] + df['budget']*1.8/100*(2018-df['release_year']) #Inflation simple formula
    df['budget'] = np.log1p(df['budget']) 
    
    
    # Thanks to this Kernel for the next 7 features https://www.kaggle.com/artgor/eda-feature-engineering-and-model-interpretation
    df['genders_0_crew'] = df['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 0]))
    df['genders_1_crew'] = df['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 1]))
    df['genders_2_crew'] = df['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 2]))
    df['belongs_to_collection'] = [literal_eval(i) if not pd.isnull(i) else i for i in df['belongs_to_collection'] ]
    
    #df['_collection_name'] = df['belongs_to_collection'].apply(lambda x: x[0]['name'] if not pd.isnull(x) else x)
    df['_collection_name'] = [i[0]['name'] if not pd.isnull(i) else i for i in df['belongs_to_collection']]
    le = LabelEncoder()
    le.fit(list(df['_collection_name'].fillna('')))
    df['_collection_name'] = le.transform(df['_collection_name'].fillna('').astype(str))
    df['_num_Keywords'] = df['Keywords'].apply(lambda x: len(x) if x != {} else 0)
    df['_num_cast'] = df['cast'].apply(lambda x: len(x) if x != {} else 0)

    
    
    df['_popularity_mean_year'] = df['popularity'] / df.groupby("release_year")["popularity"].transform('mean')
    df['_budget_runtime_ratio'] = df['budget']/df['runtime'] 
    df['_budget_popularity_ratio'] = df['budget']/df['popularity']
    df['_budget_year_ratio'] = df['budget']/(df['release_year']*df['release_year'])
    df['_releaseYear_popularity_ratio'] = df['release_year']/df['popularity']
    df['_releaseYear_popularity_ratio2'] = df['popularity']/df['release_year']

    #df['_popularity_totalVotes_ratio'] = df['totalVotes']/df['popularity']
    #df['_rating_popularity_ratio'] = df['rating']/df['popularity']
    #df['_rating_totalVotes_ratio'] = df['totalVotes']/df['rating']
    #df['_totalVotes_releaseYear_ratio'] = df['totalVotes']/df['release_year']
    #df['_budget_rating_ratio'] = df['budget']/df['rating']
    #df['_runtime_rating_ratio'] = df['runtime']/df['rating']
    #df['_budget_totalVotes_ratio'] = df['budget']/df['totalVotes']
    
    df['has_homepage'] = 1
    df.loc[pd.isnull(df['homepage']) ,"has_homepage"] = 0
    
    df['isbelongs_to_collectionNA'] = 0
    df.loc[pd.isnull(df['belongs_to_collection']) ,"isbelongs_to_collectionNA"] = 1
    
    df['isTaglineNA'] = 0
    df.loc[df['tagline'] == 0 ,"isTaglineNA"] = 1 

    df['isOriginalLanguageEng'] = 0 
    df.loc[ df['original_language'] == "en" ,"isOriginalLanguageEng"] = 1
    
    df['isTitleDifferent'] = 1
    df.loc[ df['original_title'] == df['title'] ,"isTitleDifferent"] = 0 

    df['isMovieReleased'] = 1
    df.loc[ df['status'] != "Released" ,"isMovieReleased"] = 0 

    # get collection id
    #df['collection_id'] = df['belongs_to_collection'].apply(lambda x : np.nan if len(x)==0 else x[0]['id'])
    df['collection_id'] = [i[0]['id'] if not pd.isnull(i) else i for i in df['belongs_to_collection']]
    df['original_title_letter_count'] = df['original_title'].str.len() 
    df['original_title_word_count'] = df['original_title'].str.split().str.len() 


    df['title_word_count'] = df['title'].str.split().str.len()
    df['overview_word_count'] = df['overview'].str.split().str.len()
    df['tagline_word_count'] = df['tagline'].str.split().str.len()
    
    df['production_countries_count'] = df['production_countries'].apply(lambda x : len(x))
    df['production_companies_count'] = df['production_companies'].apply(lambda x : len(x))
    df['cast_count'] = df['cast'].apply(lambda x : len(x))
    df['crew_count'] = df['crew'].apply(lambda x : len(x))
    

    df['meanPopularityByYear'] = df.groupby("release_year")["popularity"].aggregate('mean')
    df['meanBudgetByYear'] = df.groupby("release_year")["budget"].aggregate('mean')
    df['medianBudgetByYear'] = df.groupby("release_year")["budget"].aggregate('median')

    for col in ['genres', 'production_countries', 'spoken_languages', 'production_companies'] :
        df[col] = df[col].map(lambda x: sorted(list(set([n if n in train_dict[col] else col+'_etc' for n in [d['name'] for d in x]])))).map(lambda x: ','.join(map(str, x)))
        temp = df[col].str.get_dummies(sep=',')
        df = pd.concat([df, temp], axis=1, sort=False)
    df.drop(['genres_etc'], axis = 1, inplace = True)
    
    df = df.drop(['id','belongs_to_collection','genres','homepage','imdb_id','overview','runtime'
    ,'poster_path','production_companies','production_countries','release_date','spoken_languages'
    ,'status','title','Keywords','cast','crew','original_language','original_title','tagline', 'collection_id'
    ],axis=1)
    
    df.fillna(value=0.0, inplace = True) 

    return df

In [121]:
json_cols = ['genres', 'production_companies', 'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']


In [122]:
from utility.visualization import dealing_null_value

data = pd.read_csv('../data/ML 1.csv')
train = data.iloc[:2100]
test = data.iloc[2100:]

for i in json_cols:
    train = dealing_null_value(train,i)
    test = dealing_null_value(test,i)

In [123]:


def get_dictionary(s):
    try:
        d = eval(s)
    except:
        d = {}
    return d
    
def get_json_dict(df) :
    global json_cols
    result = dict()
    for e_col in json_cols :
        d = dict()
        rows = df[e_col].values
        for row in rows :
            if row is None : continue
            for i in row :
                if i['name'] not in d :
                    d[i['name']] = 0
                d[i['name']] += 1
        result[e_col] = d
    return result

train_dict = get_json_dict(train)
test_dict = get_json_dict(test)

# remove cateogry with bias and low frequency
for col in json_cols :
    
    remove = []
    train_id = set(list(train_dict[col].keys()))
    test_id = set(list(test_dict[col].keys()))   
    
    remove += list(train_id - test_id) + list(test_id - train_id)
    for i in train_id.union(test_id) - set(remove) :
        if train_dict[col][i] < 10 or i == '' :
            remove += [i]
            
    for i in remove :
        if i in train_dict[col] :
            del train_dict[col][i]
        if i in test_dict[col] :
            del test_dict[col][i]
            


In [124]:
from tqdm import tqdm
from ast import literal_eval
from sklearn.preprocessing import LabelEncoder

In [125]:
all_data = prepare(pd.concat([train, test]).reset_index(drop = True))

In [149]:
all_data = all_data.fillna(0)
all_data = all_data.astype('float32')

In [150]:
feature_list = [i for i in (list(all_data.columns)) if i != 'revenue']

In [164]:
all_data['revenue_updated'] = np.log1p(all_data['revenue'].values)

In [166]:
X_train, X_test, y_train, y_test = train_test_split(all_data[feature_list], all_data[['revenue_updated']])

In [175]:
linner = xgb.XGBRegressor()
parameters = {'max_depth':[2,3,4,5,6], 
              'learning_rate':[0.1, 0.2, 0.15],
              'n_estimators':[10,15,18,20,25,30]}
clf_xgb = GridSearchCV(linner, parameters)

In [176]:
clf_xgb.fit(X_train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [2, 3, 4, 5, 6], 'learning_rate': [0.1, 0.2, 0.15], 'n_estimators': [10, 15, 18, 20, 25, 30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [178]:
pred_value = clf_xgb.predict(X_test)

In [179]:
mean_squared_error(y_test, pred_value)

3.8243978

In [180]:
mean_squared_log_error(y_test, pred_value)

0.029023834

In [185]:
clf_xgb.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.2, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=30, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [186]:
xgb_r = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.2, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=30, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [187]:
xgb_r.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.2, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=30, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [192]:
important_columns = {}
for col, score in zip(X_train.columns, xgb_r.feature_importances_):
    if score > 0.0:
        important_columns[col] = score

In [196]:
data = sorted(important_columns.items(), key = lambda x : x[1])

In [215]:
data = dict([('_num_cast', 0.018029619),
 ('budget', 0.019826835),
 ('_collection_name', 0.022449156),
 ('release_year', 0.025995579),
 ('genders_2_crew', 0.04345532),
 ('_releaseYear_popularity_ratio', 0.058784723),
 ('_popularity_mean_year', 0.07657793),
 ('popularity', 0.11947978),
 ('_budget_year_ratio', 0.13055435),
 ('inflationBudget', 0.2958483)])

In [216]:
important_feature= list(data.keys())

In [217]:
linner = xgb.XGBRegressor()
parameters = {'max_depth':[2,3,4,5,6], 
              'learning_rate':[0.1, 0.2, 0.15],
              'n_estimators':[10,15,18,20,25,30]}
clf_xgb1 = GridSearchCV(linner, parameters)
clf_xgb1.fit(X_train[important_feature], y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [2, 3, 4, 5, 6], 'learning_rate': [0.1, 0.2, 0.15], 'n_estimators': [10, 15, 18, 20, 25, 30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [218]:
pred = clf_xgb1.predict(X_test[important_feature])

In [219]:
mean_squared_error(y_test, pred)

3.7610962

In [220]:
mean_squared_log_error(y_test, pred)

0.0286957