# Predictive Model 'hits'

### Importing the Libraries

In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.externals import joblib
from collections import Counter
from bisect import bisect
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
import webbrowser
import os

# Set charts to view inline
%matplotlib inline

### Reading the Data

In [11]:
# Load the raw case-study export; fields in this file are separated by ';'.
df = pd.read_csv(
    'ML_Data_Scientist_Case_Study_Data.csv',
    delimiter=';',
)

### Cleaning the Data

In [12]:
# Turn every literal "\N" cell (in any column of the frame) into NaN.
df = df.replace(to_replace=r'\N', value=np.nan)

In [13]:
# Normalise entry_page, session_duration, agent_id, day_of_week and locale.
df['entry_page'] = df['entry_page'].astype(str)

# The raw file misspells the duration column name.
df = df.rename(columns={'session_durantion': 'session_duration'})

# One indicator column per agent_id, appended to the right of the frame.
agent_dummies = pd.get_dummies(df['agent_id'], prefix='agent')
df = pd.concat([df, agent_dummies], axis=1)

# Encode weekday names as 1 (Monday) .. 7 (Sunday).
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
df['day_of_week'] = df['day_of_week'].map(
    {name: number for number, name in enumerate(weekday_names, start=1)})

# Encode locales 'L1' .. 'L6' as 1 .. 6.
df['locale'] = df['locale'].map(
    {'L{}'.format(number): number for number in range(1, 7)})

In [14]:
# Convert the duration and the target column to numeric dtypes in one pass.
df[['session_duration', 'hits']] = df[['session_duration', 'hits']].apply(pd.to_numeric)

In [15]:
# Rows with a known 'hits' value are used for training; rows where it is
# missing are the ones to be predicted.
hits_missing = df['hits'].isnull()
df_train = df.loc[~hits_missing]
df_test = df.loc[hits_missing]

### Let's check the path_id

In [16]:
# len_path = number of ';'-separated ids in path_id_set.
# Rows whose path_id_set is missing yield NaN here and are counted as length 1.
df_train = df_train.assign(
    len_path=lambda r: r.path_id_set.str.split(';').str.len().fillna(1))
# Fix: derive the unlabeled set from df_test itself. It was previously built
# from df_train, which replaced the unlabeled rows with a copy of the training rows.
df_test = df_test.assign(
    len_path=lambda r: r.path_id_set.str.split(';').str.len().fillna(1))

In [17]:
# Join every session's path ids into one ';'-separated string, then count
# how often each id occurs across the whole data set.
all_avengers = df.path_id_set.str.cat(sep=';')
path_tokens = all_avengers.split(';')
path_id_freq = Counter(path_tokens)
# Number of distinct path ids.
path_id_size = len(path_id_freq)

In [18]:
# Sessions without any path ids get the placeholder id '0'.
df_train.path_id_set = df_train.path_id_set.fillna('0')
df_test.path_id_set = df_test.path_id_set.fillna('0')

# Eight frequency buckets, 'path0' .. 'path7'; `thresh` holds the bucket
# boundaries and bisect() picks the bucket for a given frequency.
new_columns = ['path' + str(i) for i in range(8)]
thresh = [2, 4, 15, 41, 500, 13000, 100000]
page_to_category = {
    page: new_columns[bisect(thresh, freq)]
    for page, freq in path_id_freq.items()
}
values = list(page_to_category.values())

def infinity_war(x, col_name, dic):
    """Flag which categories the path ids of one session fall into.

    Parameters
    ----------
    x : str
        ';'-separated path ids of a single session.
    col_name : iterable of str
        Category names; each becomes an entry of the result, initialised to 0.
    dic : dict
        Maps a path id to its category name.

    Returns
    -------
    pandas.Series
        One entry per name in ``col_name`` (in that order): 1 if at least one
        id in ``x`` maps to that category, otherwise 0.

    Ids that are not present in ``dic`` are ignored instead of raising
    KeyError, so a placeholder id that never appears in the lookup simply
    produces an all-zero row.
    """
    ans = dict.fromkeys(col_name, 0)
    for f in x.split(';'):
        category = dic.get(f)
        if category is not None:
            ans[category] = 1
    return pd.Series(ans)

# Compute the per-session category flags, then attach them by index.
train_path_flags = df_train.path_id_set.apply(
    infinity_war, args=(new_columns, page_to_category))
df_train = df_train.merge(train_path_flags, left_index=True, right_index=True)

test_path_flags = df_test.path_id_set.apply(
    infinity_war, args=(new_columns, page_to_category))
df_test = df_test.merge(test_path_flags, left_index=True, right_index=True)

### Session_Duration

In [19]:
def fill_session_duration(frame, keys=('day_of_week', 'hour_of_day')):
    """Return ``frame.session_duration`` with gaps filled by the group median.

    Each missing session_duration is replaced by the median session_duration
    of the rows sharing the same values in ``keys`` within ``frame``. The
    median is taken over the session_duration column only, so non-numeric
    columns in ``frame`` never take part in the aggregation. Values that are
    already present are left unchanged.
    """
    group_median = frame.groupby(list(keys))['session_duration'].transform('median')
    return frame['session_duration'].fillna(group_median)


# Impute each set from its own (day_of_week, hour_of_day) medians.
df_train.session_duration = fill_session_duration(df_train)
df_test.session_duration = fill_session_duration(df_test)

### It's time for the Entry_Page

In [20]:
# Occurrences per entry page, most frequent first; the expression below is
# the share of all sessions covered by the 20 most frequent pages.
entry_page_count = df.entry_page.value_counts()
entry_page_count.head(20).sum() / entry_page_count.sum()

0.9936784463340552

In [21]:
# Print the count of the 10th most frequent entry page.
# entry_page_count is indexed by page label (strings), so the 10th row is
# selected by position with .iloc rather than with [] (which is label-based).
print(entry_page_count.iloc[9])

3332


In [22]:
# 3332 is the count of the 10th most frequent entry page (printed above);
# every page seen fewer times than that is collapsed into 'misc'.
df_train.entry_page = df_train.entry_page.apply(
    lambda page: page if entry_page_count.loc[page] >= 3332 else 'misc')
df_test.entry_page = df_test.entry_page.apply(
    lambda page: page if entry_page_count.loc[page] >= 3332 else 'misc')

In [23]:
# Features_df
def get_ready_df(df_frame):
    """Build the model-ready feature frame from a cleaned session frame.

    One-hot encodes traffic_type, entry_page, hour_of_day and day_of_week and
    appends the indicator columns (in that order) to the right of the existing
    columns. Each group gets its own prefix ('traffic', 'entry', 'hour',
    'dow'), so the resulting labels are strings and do not collide between
    groups (e.g. hour 1 vs. day 1).

    Afterwards the raw columns row_num, day_of_week, locale, traffic_type,
    hour_of_day, entry_page and path_id_set are dropped.

    Parameters
    ----------
    df_frame : pandas.DataFrame
        Must contain the columns listed above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the remaining original columns plus the indicators.
    """
    dummy_prefixes = (
        ('traffic_type', 'traffic'),
        ('entry_page', 'entry'),
        ('hour_of_day', 'hour'),
        ('day_of_week', 'dow'),
    )
    dummies = [pd.get_dummies(df_frame[column], prefix=prefix)
               for column, prefix in dummy_prefixes]
    # pd.concat builds a new frame, so the caller's frame stays untouched.
    features_df = pd.concat([df_frame] + dummies, axis=1)
    return features_df.drop(
        ['row_num', 'day_of_week', 'locale', 'traffic_type',
         'hour_of_day', 'entry_page', 'path_id_set'],
        axis=1)

### Split the data set into a training set (70%) and a test set (30%)

In [24]:
# Build the feature frame from the labeled rows, separate the target column,
# and hold out 30% of the rows for evaluation (fixed seed for repeatability).
wolverin = get_ready_df(df_train)
feature_columns = wolverin.drop('hits', axis=1)
target_column = wolverin.hits
X_train, X_test, y_train, y_test = train_test_split(
    feature_columns, target_column, test_size=0.3, random_state=0)

### Create the Model

In [27]:
# Seeded random state so the boosting run is repeatable.
deadpool_2 = np.random.RandomState(69)

# AdaBoost over depth-10 regression trees.
base_tree = DecisionTreeRegressor(max_depth=10)
model = AdaBoostRegressor(
    base_tree,
    n_estimators=20,
    learning_rate=0.5,
    loss='square',
    random_state=deadpool_2,
)
model.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
         learning_rate=0.5, loss='square', n_estimators=20,
         random_state=<mtrand.RandomState object at 0x00000236C432FF30>)

### Create the Model (alternative)
If you have time, you can skip the model above and try the next five (commented-out) steps instead.
Be prepared, though: the grid search takes a long time.

In [28]:
# model = ensemble.GradientBoostingRegressor()

### Parameters we want to try

In [29]:
# param_grid = {
#    'n_estimators': [500, 1000, 3000],
#    'max_depth': [4, 6],
#    'min_samples_leaf': [3, 5, 9, 17],
#    'learning_rate': [0.1, 0.05, 0.02, 0.01],
#    'max_features': [1.0, 0.3, 0.1],
#    'loss': ['ls', 'lad', 'huber']
#}

### Define the grid search we want to run. Run it with four cpus in parallel.

In [30]:
# gs_cv = GridSearchCV(model, param_grid, n_jobs=4)

### Run the grid search - on only the training data!

In [31]:
# gs_cv.fit(X_train, y_train)

### Print the parameters that gave us the best result!

In [32]:
# print(gs_cv.best_params_)

### Save the trained model to a file so we can use it in other programs

In [33]:
# Persist the fitted model to disk so it can be loaded elsewhere.
joblib.dump(value=model, filename='trained_hits_model.pkl')

['trained_hits_model.pkl']

### Root Mean Squared Error (RMSE)

In [34]:
# Root of the mean squared error on the held-out 30% split.
holdout_predictions = model.predict(X_test)
np.sqrt(mean_squared_error(y_test, holdout_predictions))

32.816781657520359

### Results/Conclusion

In [35]:
# Build the feature frame for the rows whose 'hits' must be predicted.
df_test = get_ready_df(df_test)
test_features = df_test.drop(columns='hits')

# The indicator columns are derived separately for each frame, so a category
# that occurs in only one of them changes the column set. Align the features
# with the columns the model was fitted on; indicators absent here become 0.
if not test_features.columns.equals(X_train.columns):
    if not (test_features.columns.is_unique and X_train.columns.is_unique):
        raise ValueError(
            'Feature columns differ from the training columns and contain '
            'duplicate labels, so they cannot be aligned automatically.')
    test_features = test_features.reindex(columns=X_train.columns, fill_value=0)

# This uses the AdaBoost `model` (seeded by deadpool_2). To use the grid-search
# model instead, replace `model` with `gs_cv` in the next line.
df_test['hits'] = model.predict(test_features)
df_test['hits'] = df_test['hits'].round()

### Export .csv

In [36]:
# row_num of every row whose 'hits' is missing in the full data set.
unlabeled_row_num = df.loc[df['hits'].isnull()].row_num
df_test['row_num'] = unlabeled_row_num

# Put row_num and the predicted hits side by side and write them out.
final_csv = pd.concat([unlabeled_row_num, df_test['hits']], axis=1)
final_csv.to_csv('submit_and_party.csv', index=False)

### View the Data in a web page

In [37]:
# Render the first 100 rows of the data set as an HTML table.
html = df.iloc[0:100].to_html()

### Save HTML

In [38]:
# Write the rendered table to 'data_new.html' in the working directory.
with open('data_new.html', mode='w') as html_file:
    html_file.write(html)

### On our Web Browser

In [41]:
# Resolve the saved page to an absolute path and open it in the default browser.
full_file_name = os.path.abspath('data_new.html')
page_url = 'file://' + full_file_name
webbrowser.open(page_url)

True