## Import Packages

In [1]:
import calendar
import json
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sn
import xgboost as xgb
from joblib import dump, load
from matplotlib import pylab
from scipy import stats
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer, StandardScaler
from sklearn.svm import SVR

from Helper_Functions import degree_search
from Helper_Functions import polynomial_search
from Helper_Functions import json_to_series

pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore", category=DeprecationWarning)
pd.set_option('display.max_columns', 500)
%matplotlib inline

## Read Training Data

In [2]:
# Load the raw training sessions.
# fullVisitorId is an identifier, not a quantity: read it as text so that
# leading zeros and digits beyond integer/float precision are preserved and
# the column does not end up with mixed types.
training = pd.read_csv("Data/train.csv", dtype={"fullVisitorId": str})


  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Work on a random subsample of at most 200000 rows.
# A fixed random_state makes every re-run of the notebook use the same rows,
# and capping n at the frame length keeps sample() from raising on a smaller
# input file.
training = (
    training
    .sample(n=min(200000, len(training)), random_state=42)
    .reset_index(drop=True)
)

## Expand Json Columns

In [None]:
# Columns whose cells hold a JSON object; each one is replaced by one column
# per key. Order matters: 'adwordsClickInfo' is listed after 'trafficSource'
# (presumably because it is nested inside it and only exists once
# 'trafficSource' has been expanded - confirm against the raw data).
json_cols = ['device', 'geoNetwork', 'totals', 'trafficSource', 'adwordsClickInfo']


def _parse_json_cell(value):
    """Return *value* as a dict.

    JSON text is decoded with ``json.loads`` (no ``eval`` on file contents and
    no global 'true'/'false' string substitution that could corrupt values
    containing those substrings); cells that are already dicts pass through;
    anything else (e.g. a missing value) becomes an empty dict.
    """
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        return json.loads(value)
    return {}


for col in json_cols:
    parsed = training[col].map(_parse_json_cell)
    # Building one frame from the list of dicts avoids creating a pd.Series
    # per row, which is what apply(pd.Series) does.
    expanded = pd.DataFrame(parsed.tolist(), index=training.index)
    training = pd.concat([training.drop(columns=[col]), expanded], axis=1)

training.head(10)

## NA value recognition

In [None]:
# Treat the literal placeholder '(not set)' as a missing value in every column.
training = training.replace('(not set)', np.nan)

## Remove Unavailable Columns

In [None]:
# A column is discarded as soon as the placeholder text below occurs in it
# at least once.
placeholder = 'not available in demo dataset'
cols_to_drop = [
    name for name in training.columns
    if (training[name] == placeholder).any()
]
print(cols_to_drop)

training = training.drop(columns=cols_to_drop)

training.head(10)

## Remove columns without information

In [None]:
# Columns dropped because they carry no usable signal (author's notes):
#   socialEngagementType - a single constant value
#   deviceCategory       - perfectly collinear with isMobile
#   continent            - highly collinear with country / subContinent
#                          (a VIF test could confirm this)
#   visits               - always 1
#   keyword, adContent   - too sparse to be used
#   isVideoAd            - collinear with adNetworkType
#   targetingCriteria    - no values
# NOTE(review): isTrueDirect is dropped as well but no reason is recorded.
cols_to_drop2 = [
    'socialEngagementType',
    'deviceCategory',
    'continent',
    'visits',
    'isTrueDirect',
    'keyword',
    'adContent',
    'isVideoAd',
    'targetingCriteria',
]
# errors='ignore' keeps the original tolerance for names that are absent.
training = training.drop(columns=cols_to_drop2, errors='ignore')

training.head(10)

## Remove irrelevant ID columns

In [None]:
# Session-level identifiers that are not used as features.
cols_to_drop3 = ['sessionId', 'visitId', 'gclId']
# errors='ignore': a listed name that is not present is simply skipped.
training = training.drop(columns=cols_to_drop3, errors='ignore')

training.head(10)

In [None]:
# Column groups used by the preprocessing and modelling cells.

# Visitor identifier; kept in the frame but excluded from the features.
id_cols = ['fullVisitorId']

# Raw date / time columns.
date_cols = ['date', 'visitStartTime']

# Categorical columns that get one-hot encoded.
# Note: sparsity on network domain is very high.
cat_cols = [
    'channelGrouping',
    'browser',
    'operatingSystem',
    'subContinent',
    'country',
    'networkDomain',
    'referralPath',
    'campaign',
    'page',
    'slot',
    'adNetworkType',
    'medium',
]

# Boolean flag columns.
dummy_cols = ['isMobile']

# Numeric columns. For hits: there is a need to group everything beyond 10.
numeric_cols = ['pageviews', 'bounces', 'newVisits']

# Columns whose rare values are collapsed into a single 'Other' bucket.
cols_with_sparsity_issues = [
    'visitNumber',
    'hits',
    'referralPath',
    'campaign',
    'source',
    'page',
    'slot',
    'adNetworkType',
]

# Regression target.
dependent_var = ['logTransactionRevenue']

## Log Transformation of DV

In [None]:
# Replace the raw revenue by its log-transformed version (the dependent variable).
# log1p (= log(1 + x)) is used instead of log so that a revenue of 0 maps to
# 0 rather than -inf; missing revenue stays NaN here.
revenue = training['transactionRevenue'].astype(float)
training['logTransactionRevenue'] = np.log1p(revenue)
training = training.drop('transactionRevenue', axis=1)
training

## Fill NA (only when NAs reasonably mean the same thing as 0s)

In [None]:
# For these columns a missing entry is read as zero.
for name in ('bounces', 'newVisits', 'logTransactionRevenue'):
    training[name] = training[name].fillna(0)

## Sparse column value grouping into Other

In [None]:
# In every sparse or categorical column, any value that occurs fewer than
# 200 times is replaced by the single label 'Other'.
for name in set(cols_with_sparsity_issues + cat_cols):
    counts = training[name].value_counts()
    rare_values = counts.index[counts < 200]
    is_rare = training[name].isin(rare_values)
    training[name] = np.where(is_rare, 'Other', training[name])

## Datetime Object Breakdown

In [None]:
# NOTE(review): the disabled "hour" attempt at the bottom of this cell passes
# `date` (a YYYYMMDD value) to datetime.fromtimestamp, which is why the result
# lands in 1970. visitStartTime is presumably a POSIX timestamp in seconds -
# confirm, then derive the hour from that column instead.

# Parse the YYYYMMDD date once for the whole column instead of calling
# strptime three times per row, then derive the calendar features from it.
visit_date = pd.to_datetime(training['date'].astype(str), format="%Y%m%d")
training["weekday"] = visit_date.dt.day_name()
training["month"] = visit_date.dt.month_name()
training["year"] = visit_date.dt.year.astype(str)
training = training.drop('date',axis=1)
# training["datetime"] = training.date.apply(lambda dateString : str(datetime.fromtimestamp(dateString)))
# training["hour"] = training.datetime.apply(lambda x : x.split()[1].split(":")[0])

## Missing Value Analysis

In [None]:
# Visualise where values are missing across all columns of the training frame.
msno.matrix(training,figsize=(12,5))

## Outlier Removal

In [None]:
# Keep only the rows that lie within three standard deviations of the mean,
# one numeric column after another (each pass works on the rows that survived
# the previous one). A row whose value is NaN fails the comparison and is
# therefore removed as well.

print(f"before outlier removal, {training.shape[0]} rows")

for name in numeric_cols:
    training[name] = training[name].astype(float)
    values = training[name]
    within_range = (values - values.mean()).abs() <= 3 * values.std()
    training = training[within_range]

print(f"after outlier removal, {training.shape[0]} rows")

## Get Dummies & Bins

In [None]:
# Preview only: one-hot encode the categorical and calendar columns (dropping
# the first level of each) and display the result; nothing is assigned here.
pd.get_dummies(training[cat_cols+['weekday','month','year']], drop_first=True)

In [None]:
# One-hot encode the categorical and calendar columns (first level of each
# dropped) and append the indicator columns to the frame by index.
encoded = pd.get_dummies(training[cat_cols + ['weekday', 'month', 'year']], drop_first=True)
training = training.join(encoded)

In [None]:
# Remove the source columns now that indicator columns exist for them.
# Everything in cols_with_sparsity_issues is removed too, including the names
# that are not in cat_cols ('visitNumber', 'hits', 'source') and therefore
# have no indicator columns.
already_handled = set(cols_with_sparsity_issues + cat_cols + ['weekday', 'month', 'year'])
training = training[[name for name in training.columns if name not in already_handled]]
training

## Other individual column adjustments

In [None]:
# visitStartTime is not used as a feature; isMobile becomes an integer flag.
training = training.drop(columns='visitStartTime')
training['isMobile'] = training['isMobile'].astype(int)
training

## Numeric Feature Scaling - especially for SVM

In [None]:
# scaler = StandardScaler()
# training['humidity'] = scaler.fit_transform(training[['humidity']])
# training['atemp'] = scaler.fit_transform(training[['atemp']])

## Correlation Map

In [None]:
# Pairwise correlations of every column except the visitor identifier.
corrMatt = training[[col for col in training if col!='fullVisitorId']].corr()
# Boolean mask that hides the strict upper triangle (True = hidden), so each
# pair is drawn once. Building it from booleans rather than from the
# correlation values keeps the mask independent of the data: a correlation of
# exactly 0 in the upper triangle is hidden like every other cell there.
mask = np.triu(np.ones(corrMatt.shape, dtype=bool), k=1)
fig,ax= plt.subplots()
fig.set_size_inches(50,20)
# Draw on the axes created above rather than on the implicit current axes.
sn.heatmap(corrMatt, mask=mask,vmax=.8, square=True,annot=True, ax=ax)

## Degree Search - unneeded as we are using gradient boosting

In [None]:
# df, degree_dict = degree_search(df=training[['atemp', 'temp', 'windspeed', 'humidity','count']], dep='count')

## Dependent & Independent Split

In [None]:
# Features: every column that is neither the target nor an identifier.
non_features = set(dependent_var) | set(id_cols)
feature_names = [name for name in training.columns if name not in non_features]
X = training[feature_names]
# Target: the single dependent variable.
y = training[dependent_var[0]]

## Polynomial Degree Search - unneeded as we are using gradient boosting

In [None]:
# poly_dict = polynomial_search(X, y, highest_degree=2)
# poly_dict

## Generate Polynomial Features for Numeric Variables - unneeded as we are using gradient boosting

In [None]:
# poly = PolynomialFeatures(1)
# X = poly.fit_transform(X)

# X.shape

## Feature Selection - Variance Thresholding

In [None]:
# Drop near-constant features. The cut-off p * (1 - p) is the variance of a
# 0/1 feature that takes one of its values with probability p = 0.91.
dominant_share = .91
min_variance = dominant_share * (1 - dominant_share)
sel = VarianceThreshold(threshold=min_variance)
X = sel.fit_transform(X)

## Train Test Split

In [None]:
# Hold out 33% of the rows for evaluation; the seed keeps the split fixed.
split = train_test_split(X, y, test_size=0.33, random_state=1)
X_train, X_test, y_train, y_test = split

## Linear Regression

In [None]:
# Baseline: ordinary least squares on the training split.
lreg = LinearRegression()
lreg.fit(X_train, y_train)

predictions = lreg.predict(X_test)

# y_test already holds the log-transformed revenue, so the root mean squared
# error on it is the RMSLE of the revenue itself; mean_squared_log_error would
# take the logarithm a second time. Negative predictions are clipped to 0
# because the target is never negative.
rmsle = np.sqrt(np.mean((np.asarray(y_test) - np.maximum(predictions, 0)) ** 2))

# score() returns the plain coefficient of determination, not the adjusted one.
txt1 = "R squared: {AR}; \nRMSLE: {rmsle}".format(AR=lreg.score(X_test, y_test),
                                                  rmsle=rmsle)
print(txt1)

## Support Vector Regression

In [None]:
# Support vector regression; features are standardised inside the pipeline.
svreg = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
svreg.fit(X_train, y_train)

predictions = svreg.predict(X_test)

# y_test already holds the log-transformed revenue, so the root mean squared
# error on it is the RMSLE of the revenue itself; mean_squared_log_error would
# take the logarithm a second time. Negative predictions are clipped to 0
# because the target is never negative.
rmsle = np.sqrt(np.mean((np.asarray(y_test) - np.maximum(predictions, 0)) ** 2))

# score() returns the plain coefficient of determination, not the adjusted one.
txt1 = "R squared: {AR}; \nRMSLE: {rmsle}".format(AR=svreg.score(X_test, y_test),
                                                  rmsle=rmsle)
print(txt1)

## Random Forest Regressor

In [None]:
# Shallow random forest as a first tree-based reference.
rfreg = RandomForestRegressor(max_depth=2, random_state=0)
rfreg.fit(X_train, y_train)

predictions = rfreg.predict(X_test)

# y_test already holds the log-transformed revenue, so the root mean squared
# error on it is the RMSLE of the revenue itself; mean_squared_log_error would
# take the logarithm a second time. Negative predictions are clipped to 0
# because the target is never negative.
rmsle = np.sqrt(np.mean((np.asarray(y_test) - np.maximum(predictions, 0)) ** 2))

# score() returns the plain coefficient of determination, not the adjusted one.
txt1 = "R squared: {AR}; \nRMSLE: {rmsle}".format(AR=rfreg.score(X_test, y_test),
                                                  rmsle=rmsle)
print(txt1)

## Hyperparameter Tuning

In [None]:
# Randomised hyperparameter search for the random forest.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split.
# 1.0 (= all features) replaces the former 'auto' option, which meant the same
# thing for regressors and is no longer accepted by recent scikit-learn.
max_features = [1.0, 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Training with random_grid

rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)


# score() of the fitted search is the plain R squared of the best estimator.
ar = rf_random.score(X_test, y_test)

predictions = rf_random.predict(X_test)

# y_test already holds the log-transformed revenue, so the root mean squared
# error on it is the RMSLE of the revenue itself; mean_squared_log_error would
# take the logarithm a second time. Negative predictions are clipped to 0
# because the target is never negative.
rmsle = np.sqrt(np.mean((np.asarray(y_test) - np.maximum(predictions, 0)) ** 2))

txt1 = "R squared: {AR}; \nRMSLE: {rmsle}".format(AR=ar, rmsle=rmsle)
print(txt1)

## Gradient Boosting Regressor

In [None]:
# Gradient boosting with default hyperparameters.
gbreg = GradientBoostingRegressor(random_state=0)
gbreg.fit(X_train, y_train)

predictions = gbreg.predict(X_test)

# y_test already holds the log-transformed revenue, so the root mean squared
# error on it is the RMSLE of the revenue itself; mean_squared_log_error would
# take the logarithm a second time. Negative predictions are clipped to 0
# because the target is never negative.
rmsle = np.sqrt(np.mean((np.asarray(y_test) - np.maximum(predictions, 0)) ** 2))

# score() returns the plain coefficient of determination, not the adjusted one.
txt1 = "R squared: {AR}; \nRMSLE: {rmsle}".format(AR=gbreg.score(X_test, y_test),
                                                  rmsle=rmsle)
print(txt1)

## Hyperparameter Tuning

In [None]:
# Randomised hyperparameter search for gradient boosting.

# Number of trees in gbm
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split.
# None (= all features) replaces the former 'auto' option, which meant the
# same thing here and is no longer accepted by recent scikit-learn.
max_features = [None, 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# (No 'bootstrap' entry: it was never part of this grid.)

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

# Training with random_grid

gb = GradientBoostingRegressor(random_state=0)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
gb_random = RandomizedSearchCV(estimator = gb, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
gb_random.fit(X_train, y_train)


# score() of the fitted search is the plain R squared of the best estimator.
ar = gb_random.score(X_test, y_test)

predictions = gb_random.predict(X_test)

# y_test already holds the log-transformed revenue, so the root mean squared
# error on it is the RMSLE of the revenue itself; mean_squared_log_error would
# take the logarithm a second time. Negative predictions are clipped to 0
# because the target is never negative.
rmsle = np.sqrt(np.mean((np.asarray(y_test) - np.maximum(predictions, 0)) ** 2))

txt1 = "R squared: {AR}; \nRMSLE: {rmsle}".format(AR=ar, rmsle=rmsle)
print(txt1)

## XGBoost

In [None]:
# XGBoost regression on the log-transformed revenue.
train_data = xgb.DMatrix(X_train, y_train)
test_data = xgb.DMatrix(X_test, y_test)

# Only parameters XGBoost itself understands are kept; the former
# "is_unbalance", "num_threads" and "subsample_freq" entries are LightGBM
# names, and "n_estimators" / "num_boost_round" are not booster parameters
# (the number of rounds is passed to cv()/train() directly).
params = {"booster": 'gbtree',
          # The target is already log-transformed, so plain squared error is
          # the matching loss; a squared-log objective would log it again.
          "objective": "reg:squarederror",
          # AUC is a ranking/classification metric; use RMSE for regression.
          "eval_metric": "rmse",
          "max_depth": 5,
          "reg_alpha": 0.01,
          "reg_lambda": 0.01,
          "gamma": 5,
          "min_child_weight": 5,
          "learning_rate": 0.01,
          "seed": 42,
          "verbosity": 0}

# Cross-validate to find a sensible number of boosting rounds. Stratified
# folds only make sense for class labels, so they are switched off.
cv_result = xgb.cv(params,
                   train_data,
                   num_boost_round=1000,
                   early_stopping_rounds=100,
                   stratified=False,
                   nfold=3)

# With early stopping the returned history ends at the best iteration, so its
# length is the number of rounds to train the final model with.
best_rounds = len(cv_result)
xgboost = xgb.train(params, train_data, num_boost_round=best_rounds)

predictions = xgboost.predict(test_data)

# Plain R squared on the clipped predictions (the target is never negative).
ar = r2_score(y_test,np.maximum(predictions, 0))

# y_test already holds the log-transformed revenue, so the root mean squared
# error on it is the RMSLE of the revenue itself; mean_squared_log_error would
# take the logarithm a second time.
rmsle = np.sqrt(np.mean((np.asarray(y_test) - np.maximum(predictions, 0)) ** 2))

txt1 = "R squared: {AR}; \nRMSLE: {rmsle}".format(AR=ar, rmsle=rmsle)
print(txt1)

## Save Best Model

In [None]:
# Persist the fitted default gradient boosting model (gbreg) with joblib.
# NOTE(review): gbreg is saved unconditionally, whichever model scored best,
# and the 'Models' directory is assumed to exist already - confirm both.
dump(gbreg, 'Models/GradientBoosting.joblib') 

## Generate Kaggle Predictions

In [None]:
# Score the Kaggle test file and write the labelled rows to disk.
#
# NOTE(review): this cell does not match the rest of the visible notebook and
# looks like it was carried over from a different dataset - confirm and rewrite:
#   * it reads `datetime`, `season`, `weather`, `holiday` and `workingday`
#     columns, none of which are produced by the training preprocessing above;
#   * `numericVariables` is not defined anywhere in the visible notebook, and
#     `poly` is only created in commented-out code;
#   * none of the training-side steps (JSON expansion, '(not set)' handling,
#     rare-value grouping, one-hot encoding with drop_first) are applied to the
#     test frame, so its columns would not line up with what `sel` was fitted on;
#   * the prediction is stored as `count` and comes from rf_random, while the
#     model saved in the previous cell is gbreg.
test_data = pd.read_csv('Data/test.csv')



# Split the `datetime` text into a date part and an hour part, then derive the
# weekday and month names from the date (expects "YYYY-MM-DD ..." strings).
test_data["date"] = test_data.datetime.apply(lambda x : x.split()[0])
test_data["hour"] = test_data.datetime.apply(lambda x : x.split()[1].split(":")[0])
test_data["weekday"] = test_data.date.apply(lambda dateString : calendar.day_name[datetime.strptime(dateString,"%Y-%m-%d").weekday()])
test_data["month"] = test_data.date.apply(lambda dateString : calendar.month_name[datetime.strptime(dateString,"%Y-%m-%d").month])


# Mapping to strings: replace the integer codes 1-4 by descriptive labels.
test_data["season"] = test_data.season.map({1: "Spring", 2 : "Summer", 3 : "Fall", 4 :"Winter" })
test_data["weather"] = test_data.weather.map({1: " Clear + Few clouds + Partly cloudy + Partly cloudy",\
                                        2 : " Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist ", \
                                        3 : " Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds", \
                                        4 :" Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog " })





# Mapping to categories

categoryVariables = ["season","weather", "weekday"]
textCategories = ["season","weather", "weekday"]
alreadyCategories = ["holiday", "workingday"]
binVariables = ["hour"]


# Convert the text columns to pandas' categorical dtype.
for var in categoryVariables:
    test_data[var] = test_data[var].astype("category")
    
    
    
# Unrepeatable: the discretizer is fitted on the test data itself, so the bin
# edges are derived from this file rather than reused from training.
bins = KBinsDiscretizer(n_bins=7, encode='ordinal', strategy='uniform')
test_data[binVariables] = bins.fit_transform(test_data[binVariables])
test_data[binVariables] = test_data[binVariables].astype("category")


# One-hot encode the categorical and binned columns and append them to the frame.
dummie_cols = pd.get_dummies(test_data[textCategories+binVariables]).columns
test_data = test_data.join(pd.get_dummies(test_data[textCategories+binVariables]))

# Assemble the feature matrix (see the review note at the top of this cell
# about `numericVariables`).
X = test_data[numericVariables+list(dummie_cols)+alreadyCategories]


# Apply the transformers fitted earlier (see the review note about `poly`).
X = poly.transform(X)
X = sel.transform(X)

# Predict with the tuned random forest and store the result as `count`.
test_data['count'] = rf_random.predict(X)

# Write the full labelled frame; the index is written as the first column
# because index=False is not passed.
test_data.to_csv('Data/test_labelled.csv')

In [None]:
# Write the two submission columns to CSV without the index.
# NOTE(review): 'datetime' / 'count' do not look like columns of the dataset
# processed in the training cells - confirm the required submission format.
test_data[['datetime', 'count']].to_csv('Data/test_submission.csv', index=False)

In [None]:
# Display the submission columns for a quick visual check.
test_data[['datetime', 'count']]