In [3]:
import matplotlib 
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time

from __future__ import division
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score

## Cleaning and overview of the data. 

In [4]:
# Load the Google Analytics event export, a tab-separated file, and preview it.
# NOTE(review): this is an absolute path on one machine; anyone else re-running
# the notebook has to edit it first.
MyFile = r"C:\Users\Bill\Desktop\Tahzoo.com Redesign\GoogleAnalytics\Experiments\labels_categories_actions.tsv"
d1 = pd.read_csv(MyFile, sep="\t")
d1.head()

Unnamed: 0,eventLabel,eventCategory,users
1,GA1.2.1000767845.1461767781,::ABOUT,1
2,GA1.2.1000767845.1461767781,column twelve::BRAD HEIDEMANN Chief Executive ...,1
3,GA1.2.1001075852.1456520683,::Clients,1
4,GA1.2.1001075852.1456520683,slick-next::Next,1
5,GA1.2.1001361534.1458136980,::ABOUT,1


I'm going to use the "Apply" button as a proxy for a job seeker. By doing this I'm saying: "A job seeker is someone who clicks on the 'Apply' button at least once." You could use any other token from the eventCategory field instead (SUBMIT, PHONE NOW, or any other DOM item that was sent to Google Analytics).

In [5]:
# Every recorded event whose category text mentions the "Apply" button.
d1.loc[d1['eventCategory'].map(lambda category: "Apply" in category)]

Unnamed: 0,eventLabel,eventCategory,users
936,GA1.2.1099344678.1458727024,row::Apply,1
1153,GA1.2.1128186228.1465200792,white-button button::Apply,1
1913,GA1.2.120872653.1463487618,white-button button::Apply,1
2601,GA1.2.1292233588.1457906393,white-button button::Apply,1
3514,GA1.2.1385108706.1464723179,row::Apply,1
5651,GA1.2.1619902006.1462818168,white-button button::Apply,1
7726,GA1.2.184898488.1461943698,row::Apply,1
8745,GA1.2.1964129312.1460489858,white-button button::Apply,1
9859,GA1.2.2085630033.1457500258,white-button button::Apply,1
14640,GA1.2.663806719.1459958107,white-button button::Apply,1


In [21]:
# Flag each event row by whether it is an "Apply" click, and squash the
# category text into one token with no spaces and no "::" separator.
d1["JobSeeker"] = d1["eventCategory"].map(
    lambda category: "Yes" if "Apply" in category else "No"
)
d1["cleanText"] = d1["eventCategory"].map(
    lambda category: category.replace(" ", "").replace("::", "")
)
d1.head()

Unnamed: 0,eventLabel,eventCategory,users,JobSeeker,cleanText
1,GA1.2.1000767845.1461767781,::ABOUT,1,No,ABOUT
2,GA1.2.1000767845.1461767781,column twelve::BRAD HEIDEMANN Chief Executive ...,1,No,columntwelveBRADHEIDEMANNChiefExecutiveOfficer
3,GA1.2.1001075852.1456520683,::Clients,1,No,Clients
4,GA1.2.1001075852.1456520683,slick-next::Next,1,No,slick-nextNext
5,GA1.2.1001361534.1458136980,::ABOUT,1,No,ABOUT


In [29]:
# Collapse the event log to one row per visitor (eventLabel), in sorted
# eventLabel order with a fresh 0..n-1 index:
#   Events    - that visitor's cleanText tokens, space-separated, in file order
#   JobSeeker - "Yes" if any of the visitor's events was flagged "Yes", else "No"
# A single groupby pass replaces the earlier loop, which re-filtered d1 twice
# per visitor, grew df one cell at a time, and rebound the builtins `iter` and
# `id` for the rest of the session.
# NOTE(review): Events still includes the Apply-click tokens that JobSeeker is
# derived from, so a model fitted on Events can read the label directly.
events_by_user = d1.groupby("eventLabel", sort=True)
df = pd.DataFrame(
    {
        "Events": events_by_user["cleanText"].apply(" ".join),
        "JobSeeker": events_by_user["JobSeeker"].apply(
            lambda flags: "Yes" if (flags == "Yes").any() else "No"
        ),
    },
    columns=["Events", "JobSeeker"],
).reset_index(drop=True)

In [58]:
# Sample of the per-visitor table: the first ten job seekers followed by the
# first ten other visitors.
pd.concat([
    df.loc[df["JobSeeker"] == "Yes"].head(10),
    df.loc[df["JobSeeker"] == "No"].head(10),
])

Unnamed: 0,Events,JobSeeker
234,"Join MILTONKEYNES,UK readmore job-titleng-bind...",Yes
291,SeniorUXDesignerAvailableinDCandSeattle Weseek...,Yes
461,About readmore columntwelveEXCELLENCEDELIVERYM...,Yes
649,Join Overseeandcontributetosolutionplandefinit...,Yes
870,"About Blog Join So,YouNeedaContentModel? Wesee...",Yes
1423,About Tahzooisoneofthelargestandfastest-growin...,Yes
1917,About Blog BusinessOperationsAnalystAvailablei...,Yes
2184,ABOUT About BLOG Clients JOIN Join SERVICES ac...,Yes
2473,About job-titleng-bindingJoinusasaGlobalCorpor...,Yes
3566,Join JoinusasSalesOperationsManagerinDC Tahzoo...,Yes


In [33]:
# Share of visitors labelled as job seekers.
float((df['JobSeeker'] == "Yes").sum()) / len(df)

0.0032064985036340315

## Converting the events to lists of tokens
In order to predict using the model you have to turn the terms in the text into a 'bag of words', or collection of unigrams. Then the presence or absence of each unigram can be used to predict outcomes.

In [73]:
# Bag-of-words features: one row per visitor, one column per token found in
# Events; y holds the matching "Yes"/"No" labels in the same row order.
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(list(df["Events"]))
y = list(df["JobSeeker"])
X

<4678x9565 sparse matrix of type '<type 'numpy.int64'>'
	with 70913 stored elements in Compressed Sparse Row format>

In [74]:
# Dense view of the sparse count matrix.
# NOTE(review): toarray() materialises every cell; at the 4678x9565 int64 shape
# reported for X above that is roughly 358 MB.
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [63]:
# Tally the distinct count values in the second visitor's feature row.
# Only that one row is densified; the earlier form converted the entire
# matrix to a dense array before indexing it.
Counter(X[1].toarray().ravel())

Counter({0: 9562, 1: 3})

# Modeling
I'm just using a Decision Tree Classifier and Logistic Regression as examples. Further testing on live datasets would show what the best features are. 

## Decision Tree Classifier
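Because an unconstrained decision tree keeps splitting until every training example is classified correctly, it will typically score 100% on the data it was trained on. That reflects memorization of the training set rather than predictive quality, so it may have lower performance on newly introduced users. 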

In [118]:
# DecisionTreeClassifier is not imported anywhere else in this notebook, so
# bring it into scope here; without this the cell raises NameError on a fresh
# kernel.
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the full bag-of-words matrix; random_state pins the
# tie-breaking so re-runs give the same tree.
DTclf = DecisionTreeClassifier(random_state=0)
DTclf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best')

In [122]:
# Store the tree's predictions for the same rows it was trained on.
# (A second, bare predict() call was dropped: it was not the last expression
# of the cell, so its result was neither shown nor kept.)
df["prediction"] = DTclf.predict(X)

In [123]:
def get_results_matrix(df):
    """Summarise how well df["prediction"] matches df["JobSeeker"].

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a "JobSeeker" column (actual label, "Yes"/"No") and a
        "prediction" column (model output using the same labels).

    Returns
    -------
    (results1, results2) : tuple of pandas.DataFrame
        results1 has one row per outcome (Right_and_Yes, Right_and_No,
        Wrong_and_Yes, Wrong_and_No) with its "count" and its "percent",
        the fraction of all rows in df (NaN when df is empty).
        results2 holds the same four counts as a 2x2 table with rows
        Guess:Right / Guess:Wrong and columns Applied:Yes / Applied:No.

    Side effect: prints, for the rows whose actual label is "Yes", the
    fraction guessed right and the fraction guessed wrong (NaN when there
    are no such rows).
    """
    actual = df["JobSeeker"]
    guessed_right = actual == df["prediction"]
    is_yes = actual == "Yes"
    is_no = actual == "No"

    right_yes = int((guessed_right & is_yes).sum())
    right_no = int((guessed_right & is_no).sum())
    wrong_yes = int((~guessed_right & is_yes).sum())
    wrong_no = int((~guessed_right & is_no).sum())

    total = len(df)
    outcomes = ["Right_and_Yes", "Right_and_No", "Wrong_and_Yes", "Wrong_and_No"]
    counts = [right_yes, right_no, wrong_yes, wrong_no]
    # float() keeps this a true division on Python 2 even when
    # `from __future__ import division` is not in effect; an empty frame
    # yields NaN instead of raising ZeroDivisionError.
    percents = [float(c) / total if total else float("nan") for c in counts]
    results1 = pd.DataFrame(
        {"count": counts, "percent": percents},
        index=outcomes,
        columns=["count", "percent"],
    )

    results2 = pd.DataFrame(
        {
            "Applied:Yes": [right_yes, wrong_yes],
            "Applied:No": [right_no, wrong_no],
        },
        index=["Guess:Right", "Guess:Wrong"],
        columns=["Applied:Yes", "Applied:No"],
    )

    # Hit/miss rate among actual applicants. The single-argument print(...)
    # form behaves the same on Python 2 and Python 3.
    applied = results2["Applied:Yes"].astype(float)
    print(applied / applied.sum())
    return results1, results2


In [124]:
# Score the decision-tree predictions currently stored in df["prediction"];
# show the per-outcome counts and fractions first.
results1,results2 = get_results_matrix(df)
results1

Guess:Right    1.0
Guess:Wrong    0.0
Name: Applied:Yes, dtype: float64


Unnamed: 0,count,percent
Right_and_Yes,15,0.0032065
Right_and_No,4663,0.996794
Wrong_and_Yes,0,0.0
Wrong_and_No,0,0.0


In [125]:
# The same counts as a 2x2 table: guess right/wrong by applied yes/no.
results2

Unnamed: 0,Applied:Yes,Applied:No
Guess:Right,15,4663
Guess:Wrong,0,0


## Logistic Regression

In [126]:
from sklearn.linear_model import LogisticRegression

In [127]:
# Fit a logistic regression on the same features; fit() hands back the
# estimator itself, so it can be bound directly and then shown.
LRclf = LogisticRegression(random_state=0).fit(X, y)
LRclf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [128]:
# Peek at the raw predicted labels for the rows the model was trained on.
LRclf.predict(X)

array(['No', 'No', 'No', ..., 'No', 'No', 'No'], 
      dtype='|S3')

In [129]:
# Replace the stored predictions with the logistic-regression output, then
# preview ten visitors from each class.
# TODO: to keep a per-visitor probability as well, store a single column of
# LRclf.predict_proba(X); the probability methods return one column per class.
df["prediction"] = LRclf.predict(X)
pd.concat([
    df.loc[df["JobSeeker"] == "Yes"].head(10),
    df.loc[df["JobSeeker"] == "No"].head(10),
])

Unnamed: 0,Events,JobSeeker,prediction
234,"Join MILTONKEYNES,UK readmore job-titleng-bind...",Yes,No
291,SeniorUXDesignerAvailableinDCandSeattle Weseek...,Yes,Yes
461,About readmore columntwelveEXCELLENCEDELIVERYM...,Yes,Yes
649,Join Overseeandcontributetosolutionplandefinit...,Yes,Yes
870,"About Blog Join So,YouNeedaContentModel? Wesee...",Yes,Yes
1423,About Tahzooisoneofthelargestandfastest-growin...,Yes,No
1917,About Blog BusinessOperationsAnalystAvailablei...,Yes,Yes
2184,ABOUT About BLOG Clients JOIN Join SERVICES ac...,Yes,Yes
2473,About job-titleng-bindingJoinusasaGlobalCorpor...,Yes,No
3566,Join JoinusasSalesOperationsManagerinDC Tahzoo...,Yes,Yes


In [130]:
# Score the logistic-regression predictions currently stored in
# df["prediction"]; show the per-outcome counts and fractions first.
results1,results2 = get_results_matrix(df)
results1

Guess:Right    0.6
Guess:Wrong    0.4
Name: Applied:Yes, dtype: float64


Unnamed: 0,count,percent
Right_and_Yes,9,0.0019239
Right_and_No,4663,0.996794
Wrong_and_Yes,6,0.0012826
Wrong_and_No,0,0.0


In [131]:
# The same counts as a 2x2 table: guess right/wrong by applied yes/no.
results2

Unnamed: 0,Applied:Yes,Applied:No
Guess:Right,9,4663
Guess:Wrong,6,0


In [144]:
# Sanity check: the model has exactly one coefficient per vocabulary column.
# Comparing shapes avoids converting the whole sparse matrix to a dense array
# just to measure the length of one row.
LRclf.coef_.shape[1] == X.shape[1]

True

In [148]:
# Pair every vocabulary token with its logistic-regression coefficient and show
# the five tokens with the largest coefficients.
# scikit-learn >= 1.0 exposes get_feature_names_out(); older releases only have
# get_feature_names(), so use whichever this install provides.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

token_values = pd.DataFrame(
    {"names": feature_names, "score": LRclf.coef_[0]},
    columns=["names", "score"],
)
# DataFrame.sort() has been removed from pandas; sort_values() replaces it.
token_values.sort_values("score", ascending=False).head()



Unnamed: 0,names,score
9168,white,1.415791
1531,buttonbuttonapply,1.415791
6558,selectoregberthendrikschiefstrategyofficer,0.956176
5562,paddedclientpartneravailablein,0.908382
6234,rowapply,0.810663
