# Ensemble methods for Classification Modeling of Click-Stream data

##### Import the BigQuery library for BigQuery SQL functions

In [1]:
import gcp.bigquery as bq

##### Create a SQL module called bq_omn_requests to access the data

In [2]:
%%sql --module bq_table
select * 
from ClickADS2.ADS_ensemble1
##where order = 1
order by post_visid, datetime


##### Explore the result set - view some collection of records

In [3]:
%%bigquery sample --count 10 --query bq_table

post_visid,VISIT_PAGE_NUM,DATETIME,order,browse_plp_cnt,browse_scat_cnt,int_srch_pip_cnt,pd_cmpgn_content_cnt,pip_cnt,plp_cnt,content,spc_buy,acct_sgn_in,thumbnail_vw_cnt,refind_srch_cnt,search_cnt,appliances,segment_1,segment_2,segment_3,segment_4,segment_5,segment_6,segment_7,segment_8,segment_9,segment_10,segment_1_duration,segment_2_duration,segment_3_duration,segment_4_duration,segment_5_duration,segment_6_duration,segment_7_duration,segment_8_duration,segment_9_duration,segment_10_duration,visit_duration,segment_path,page_view_cnt
115942699612058706125806576234,25,01Dec2015 10:57:54.000,0,5,0,0,0,5,5,0,0,0,2,0,0,0,pip,,plp,pip,plp,,pip,plp,pip,,0.55,0.22,0.32,0.63,2.17,0.85,0.4,0.12,2.25,0.0,10.08,pip_ _plp_pip_plp_ _pip_plp_pip_,25
115943730511915251440592659287,11,01Dec2015 19:39:39.000,0,0,0,0,0,3,0,0,0,0,1,0,0,0,pip,,,,,,,,,,2.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.18,pip_ _ _ _ _ _ _ _ _,11
115944770271694836181020243372,1,01Dec2015 16:54:10.000,0,0,0,0,0,1,0,0,0,0,0,0,0,0,pip,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pip_ _ _ _ _ _ _ _ _,1
115944791447135714035661097526,5,01Dec2015 10:18:13.000,0,0,0,0,0,1,0,0,0,0,0,0,1,0,homepage,,search results,pip,,,,,,,5.52,0.05,1.37,2.43,0.0,0.0,0.0,0.0,0.0,0.0,9.37,homepage_ _search results_pip_ _ _ _ _ _,5
115944857671556848147187079880,1,01Dec2015 17:24:50.000,0,0,0,0,0,0,1,0,0,0,0,0,0,0,plp,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,plp_ _ _ _ _ _ _ _ _,1
115947904789336752873600801594,1,01Dec2015 19:28:10.000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,homepage,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,homepage_ _ _ _ _ _ _ _ _,1
115953721947431862839367797473,42,01Dec2015 23:08:02.000,0,6,1,0,0,4,7,1,0,0,0,0,7,2,homepage,content,,search results,,search results,,pip,,search results,0.12,0.9,1.3,1.02,0.28,0.02,3.2,0.62,1.42,0.53,22.78,homepage_content_ _search results_ _search results_ _pip_ _search results,42
115954432248227738911320062156,1,01Dec2015 15:17:54.000,0,0,0,0,0,0,0,3,0,0,0,0,0,0,content,,content,,,,,,,,2.77,0.5,0.12,87.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,content_ _content_ _ _ _ _ _ _,1
115955382771450620354140327513,16,01Dec2015 18:07:00.000,0,0,0,1,0,6,0,0,0,0,1,0,0,0,pip,,pip,,pip,,,,,,10.8,6.47,1.2,6.0,3.93,2.28,0.0,0.0,0.0,0.0,30.68,pip_ _pip_ _pip_ _ _ _ _,16
115958973171644213891269919966,1,01Dec2015 10:29:21.000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,homepage,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,homepage_ _ _ _ _ _ _ _ _,1


##### The code below constructs a BigQuery Query instance, executes the query, and converts the results into a DataFrame.  
##### The len statement counts the number of records

In [4]:
df = bq.Query(bq_table).to_dataframe()
len(df)

1600

##### Examine or Describe records from the DataFrame created above

In [5]:
##df.head(10)
#df.describe(include = 'all')
df['order'].value_counts()

0    819
1    781
Name: order, dtype: int64

##### -----------------------------------------------------------------------------------------------------------------
#####      Start the processing of the various ensemble methods                                                
##### -----------------------------------------------------------------------------------------------------------------

##### Install the SQL package (does not come pre-installed)

In [56]:
%%bash
##pip install pandasql  ## Already installed
pip install graphviz
pip install pydot    ## Already installed

Cleaning up...
Downloading/unpacking pydot
  Downloading pydot-1.0.28.tar.gz
  Running setup.py (path:/tmp/pip-build-Xz1R_K/pydot/setup.py) egg_info for package pydot
    Couldn't import dot_parser, loading of dot files will not be possible.
    
Installing collected packages: pydot
  Running setup.py install for pydot
    Couldn't import dot_parser, loading of dot files will not be possible.
    
Successfully installed pydot
Cleaning up...


##### Import some additional libraries

In [57]:
import numpy as np
import pandas as pd
from pandasql import PandaSQL
from patsy import dmatrices
import statsmodels.api as sm
#from sklearn.model_selection import cross_val_score    ## This is a module found in sklearn v18. We are running v16
from sklearn.cross_validation import cross_val_score    ## Same functionality but in v16
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
#from sklearn.tree import DecisionTreeClassifier  
from sklearn import tree        # Can call DecisionTreeClassifier from tree.DecisionTreeClassifier
from sklearn import metrics
from sklearn.externals.six import StringIO
import graphviz as gv
import pydot

##### Check the SciKit Learn version

In [8]:
#import sklearn as sklearn
#print(sklearn, sklearn.__version__)

##### Create the data structure

In [9]:
y, X = dmatrices('order ~ browse_scat_cnt + browse_plp_cnt + page_view_cnt + pd_cmpgn_content_cnt \
 + pip_cnt + plp_cnt + spc_buy + acct_sgn_in + thumbnail_vw_cnt + refind_srch_cnt + int_srch_pip_cnt + content  \
 + search_cnt + appliances + visit_duration' ,df, return_type="dataframe")

##### Display the predictors in the functional form

In [30]:
##print(X.columns)
type(X)

pandas.core.frame.DataFrame

##### Structure the Dependent Variable into an arrary

In [31]:
y = np.ravel(y)
#y
type(y)

numpy.ndarray

##### Create the Classifer object (i.e., clf)  using Decision Tree

In [15]:
clf = tree.DecisionTreeClassifier(max_depth=None, min_samples_split=1, random_state=0)
scores = cross_val_score(clf, X, y)
scores.mean()                      

0.8287483047691323

In [16]:
scores

array([ 0.83146067,  0.81988743,  0.83489681])

In [17]:
clf_tree = clf.fit(X, y)

##### Some learning.  Note that the "bunch" data structure is just a dictionary with some syntactical sugar on top. It's 
#####  not a readily available object.

In [32]:
#from sklearn.datasets import load_iris
#from sklearn import tree
#iris = load_iris()
##type(iris)   ## This is a sklearn.datasets.base.Bunch
##iris         ## Very rich data structure


In [36]:
DescDict = {"feature_names": ["browse_scat_cnt","browse_plp_cnt","page_view_cnt","pd_cmpgn_content_cnt",
  "pip_cnt","plp_cnt","spc_buy","acct_sgn_in","thumbnail_vw_cnt","refind_srch_cnt","int_srch_pip_cnt","content", 
  "search_cnt","appliances","visit_duration"], "target_names": ["0","1"], "data": [[]], "description": "some description"}

##### This is to render inline

In [60]:
from IPython.display import Image  
dot_data = StringIO()  
tree.export_graphviz(clf_tree, out_file=dot_data)  
                         #feature_names=DescDict['feature_names'],  
                        # class_names=DescDict['target_names'],  
                         #filled=True, rounded=True,  
                         #special_characters=True)  
##graph = pydot.graph_from_dot_data(dot_data.getvalue())  
##graph.write_pdf("clf_tree.pdf") 

pydot.graph_from_dot_data(dot_data.getvalue()).write_pdf("clf_tree2.pdf")  ## Same error as the prev 2 statements

#type(graph)
#vars(graph)   ## get the attributes of an object "write_pdf" is an attribut in a long list
##vars(gv)
#print dot_data.getvalue()
#Image(graph.create_png())  

InvocationException: GraphViz's executables not found

##### This is to render into a PDF or other file format for local viewing

In [None]:
from sklearn.externals.six import StringIO  
>>> import pydot 
>>> dot_data = StringIO() 
>>> tree.export_graphviz(clf, out_file=dot_data) 
>>> graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
>>> graph.write_pdf("iris.pdf") 

##### Create the Classifer object (i.e., clf) using Random Forest 

In [65]:
## I increased the n_estimators from 10 to 30 and min_samples_split from 1 to 5 and got a couple hundreths of 
##  improvement
clf = RandomForestClassifier(n_estimators=30, max_depth=None, min_samples_split=5, random_state=0)
scores = cross_val_score(clf, X, y)
scores.mean()    

0.8706237278448844

In [55]:
clf = ExtraTreesClassifier(n_estimators=30, max_depth=None, min_samples_split=5, random_state=0)
scores = cross_val_score(clf, X, y)
scores.mean()   ### > 0.999  this is only a test that will display True or False

0.86250301569567123

In [66]:
clf = clf.fit(X, y)

In [67]:
probs = clf.predict_proba(X)

In [68]:
print(metrics.roc_auc_score(y, probs[:, 1]))

0.998471794246


In [69]:
predicted = clf.predict(X)
predicted

array([ 0.,  0.,  0., ...,  1.,  1.,  1.])

In [70]:
print(metrics.confusion_matrix(y, predicted))

[[794  25]
 [ 13 768]]


In [76]:
importances = clf.feature_importances_
importances

array([ 0.        ,  0.01379033,  0.02441829,  0.32694096,  0.00711811,
        0.08864295,  0.03533441,  0.0038637 ,  0.11158693,  0.02986293,
        0.02192076,  0.01593716,  0.02695996,  0.05461138,  0.01443287,
        0.22457926])

In [81]:
pd.DataFrame(list(zip(X.columns, np.transpose(importances))))    ## adds to 1
#params = clf.get_params(deep=True)   ## This is a list of the parameters for the classifier function
#params

Unnamed: 0,0,1
0,Intercept,0.0
1,browse_scat_cnt,0.01379
2,browse_plp_cnt,0.024418
3,page_view_cnt,0.326941
4,pd_cmpgn_content_cnt,0.007118
5,pip_cnt,0.088643
6,plp_cnt,0.035334
7,spc_buy,0.003864
8,acct_sgn_in,0.111587
9,thumbnail_vw_cnt,0.029863


### <u>Output Options (In Development)</u>
#####  <hr>  </hr> 

In [None]:
import gcp
import gcp.storage as storage
import gcp.bigquery as bq
import pandas as pd

##### Create a BigQuery table for the ADS (this works) 

In [None]:
ads = bq.DataSet('ClickADS2')  # First, create the dataset.... this is not the table !
ads.create(friendly_name = 'ClickStream ADS', description = 'ADS created from Sample Omniture data')
ads.exists()

bigquery_dataset_name = 'ClickADS2'
bigquery_table_name = 'ADS_Logit1'

# Define BigQuery dataset and table
dataset = bq.DataSet(bigquery_dataset_name)
table = bq.Table(bigquery_dataset_name + '.' + bigquery_table_name)

# Create or overwrite the existing table if it exists
table_schema = bq.Schema.from_dataframe(out)
table.create(schema = table_schema, overwrite = True)

# Write the DataFrame to a BigQuery table
table.insert_data(out)

In [None]:
print(table_schema)

-------------------------------
### In Development
-------------------------------

### Different write functions into the VM Files System or GCS for audit and/or persistent storage 

##### Create a bucket in GCS and either write from the Python DataFrame or write to this bucket from the VM (see below)
#####  (This works)

In [None]:
import gcp
import gcp.storage as storage
from StringIO import StringIO

In [None]:
project = gcp.Context.default().project_id   # correct Project ID is found
bucket_name = 'steve-temp2'           ## .... or can create a new bucket using the bucket.create() below
bucket_path  = 'gs://' + bucket_name   
bucket_object = bucket_path + '/out.csv'
#bucket_object = bucket_path + '/out2.csv'
bucket = storage.Bucket(bucket_name)

# Create the bucket if it doesn't exist
if not bucket.exists():
  bucket.create()

bucket  ## This command will display the bucket name  
# The name of the bucket is:  gs://steve-temp2

bucket.exists()
# Confirmed created in GCS both with a visual inspection and a "True" result from the this function
#bucket_path
#bucket_object

##### Use the line command storage magic to write to GCS

In [None]:
%storage write --variable out --object $bucket_object

##### Write out the ADS dataframe above to the VM file system ( This worked )

In [None]:
out.to_csv("ads_out2.csv", encoding='utf-8', columns=out.columns.values.tolist()) 

In [None]:
# Write the file to the storage bucket
#file = bucket.item('ads_out2.csv')
file
##file.write_to(bucket)

##### This shells out to the VM and executes the gsutil ( This works )

In [None]:
%%bash
##gsutil cp -r /content/steven_einbender@homedepot.com gs://steve-temp2
gsutil cp /content/steven_einbender@homedepot.com/ads_out2.csv  gs://steve-temp

##### Read from GC Storage and create a Python DataFrame (This works and retains the schema from the .to_csv above)

In [None]:
gcs_ads_in = storage.Item('steve-temp','ads_out2.csv').read_from()
#The following will just display the file contents as a continuous string. Str is the object type
#gcs_ads_in
ads_df = pd.read_csv(StringIO(gcs_ads_in))
#type(ads_df)  #This is now a DataFrame
#ads_df

##### Read from GC Storage and create a Python DataFrame ( This works too but was schema-less from the %storage write above )

In [None]:
gcs_ads2_in = storage.Item('steve-temp2','out.csv').read_from()
ads2_df = pd.read_csv(StringIO(gcs_ads2_in))
#type(ads2_df)  #This is now a DataFrame
#ads2_df

In [None]:
#ads2_df.ix[:3,['browse_plp_cnt']]
#pdsql("SELECT * FROM ads2_df limit 5;",locals())
#%storage view --object $bucket_object

##### The following two both work and produce the same result

In [None]:
##list(bucket.items())  ## This can be a long list

In [None]:
##%%storage list --bucket $bucket_path

##### This lists all files in the parent specified

In [None]:
##%%storage list  --bucket gs://steve-temp   

##### Other development

In [None]:
#bucket_object = bucket_path + '/ClickSample_out1.csv'
#bucket_object2 = bucket_path + '/ClickSample_out1.csv'
#bucket = storage.Bucket(bucket_path)
#bucket.create()

#bucket.exists()
#project
bucket_path

In [None]:
##%%bash
##gsutil cp 'ads_out1.csv' 


#print(project)
#print(bucket_name)
#print('bucket path is:', bucket_path)
#print('bucket object/table is:', bucket_object)

In [None]:
#bucket_item = bucket.item('ClickSample_out1.csv')
#%storage write -h
#%storage write --variable CR_merge --object $bucket_object
#type(bucket_item)
#print(bucket_item)
#bucket_item.exists()
#list(bucket_item.items())

##### Execute CLI commands in the VM

In [None]:
%%bash
ls -al
pwd
head ClickSample_out1.csv