In [2]:
# Goals
# I have a dataset with 28126 rows and 183 features. The data is for a "mostly" e-commerce company located in Boston that sells
# furniture and home goods.

# For this company, my job/goal as a data scientist is to predict who will buy (classification) and how much (regression). 
# It is scored with ROC/AUC and RMSE


In [3]:
# Python Library Imports 
import pandas as pd  # for Dataframes
import numpy as np  # for arrays
import matplotlib.pyplot as plt  # for plotting visuals

import xgboost as xgb  # boosted decision trees
import sklearn  # for machine learning
from sklearn import tree 
from sklearn import preprocessing

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  
from sklearn.metrics import log_loss, accuracy_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

from IPython.display import Image

import pickle # persistant objects aka classifier after training

  from numpy.core.umath_tests import inner1d


In [4]:
# Styling
pd.set_option('notebook_repr_html', True)
pd.set_option('max_columns', 50)
# pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 40)
%matplotlib inline 
# pd.set_option('display.max_columns', None)

In [21]:

Globalslog = {}
def printLog():
    print(Globalslog)

In [231]:
# Data Imports
csvfile = "df_training_scholarjet.csv"  # dataset to be used
dataframe_all = pd.read_csv(csvfile, index_col=0)  # read csvs file

dictfile = "Feature_Dictionary.xlsx"
feat_dict = pd.read_excel(dictfile)


# Data manipulation
dataframe_all = dataframe_all.drop(["cuid"], axis=1, inplace=False)  # drop unnecessary columns

df_shape = dataframe_all.shape  # save shape
print(df_shape)




# dataframe_all = dataframe_all.drop(["revenue_30"], axis=1, inplace=False)  # Do classificiation first ***

# dataframe_all = sklearn.utils.shuffle(dataframe_all) # shuffle

dataframe_all.index = range(df_shape[0])  # reindex dataframe
 
# print(dataframe_all.head())

(28126, 182)


In [7]:
# Clean Data
# 1. Change categorical data into numerical. Make sure to mainting data integrity for hierarchical categorical data. 
# probably use dummy vars for other categorical data.
# 2. Impute for missing numbers. I have a prediction that the more filled out the data entry is that it might mean that
# the customer is more engaged and likely to buy. Test after cleaning


In [118]:
# Data Cleaning
# Should I automate the categorical columns that have rankings?

# Find all columns with categorical aka object type data
categorical_data_list = []
for cat in range(df_shape[1]):
    if dataframe_all.dtypes[cat] == 'object':
        categorical_data_list.append(dataframe_all.columns[cat])

# Analyze the categorical data by finding Totals in each Unique categories
# how many are positives where customers bought
# total revenues to see if spending is different by group
def categoricalAnalysis(df, categList, convert, revenue):
    categ_list_len = len(categList)
    
    # lists to add to the returned dataframe
    categTotals = []
    categConvert = []
    categRevenues = []
    percentBuy = []
    
#     for all of the categorical columns
    for cat in range(categ_list_len):
        categTotalsd = {}
        categConvertd = {}
        categRevenuesd = {}
        percentBuyd = {}

        for index in range(len(df[categList[cat]].unique())):
            uniqueCat = df[categList[cat]].unique()[index]  # uniqueCat = iterates through actual unique category list

            categTotalsd[uniqueCat] = df[categList[cat]].value_counts()[uniqueCat]  # totals in cat
    
            #Convert for each cat
            tempDfcon = (df[convert]>0) & (df[categList[cat]] == uniqueCat)
            dfConvert = df[tempDfcon]
            categConvertd[uniqueCat] = dfConvert[convert].sum().round(2)            
        
            #Revenues for each cat
            tempDf = (df[revenue]>0) & (df[categList[cat]] == uniqueCat)
            dfUniqueAbove0 = df[tempDf]
            categRevenuesd[uniqueCat] = dfUniqueAbove0[revenue].sum().round(2)

            percentBuyd[uniqueCat] = categConvertd[uniqueCat] / categTotalsd[uniqueCat] 

            
#         append to lists     
        categTotals.append(categTotalsd)    
        categConvert.append(categConvertd)
        categRevenues.append(categRevenuesd)
        percentBuy.append(percentBuyd)
    d = {'CatColumns':categTotals, 'UniqueCat':categConvert, 'RevCat':categRevenues, 'PercentBuy':percentBuy}
    categorical_data_df = pd.DataFrame(d)
    return categorical_data_df

# call function to create new dataframe of all the raw info on categoricals
analysisDf = categoricalAnalysis(dataframe_all, categorical_data_list, "convert_30", "revenue_30")        
# print(analysisDf.values)
analysisDf.values                     

array([[{'Onboarding': 9986, 'Retention': 3392, 'Unmanaged': 14748},
        {'Onboarding': 1496, 'Retention': 515, 'Unmanaged': 917},
        {'Onboarding': 1168330.51, 'Retention': 446126.16, 'Unmanaged': 496090.17},
        {'Onboarding': 0.14980973362707792, 'Retention': 0.15182783018867924, 'Unmanaged': 0.06217792243016002}],
       [{'Active': 9918, 'Enrolled': 18199, 'In Progress': 8, 'Unconfirmed': 1},
        {'Active': 1699, 'Enrolled': 1228, 'In Progress': 1, 'Unconfirmed': 0},
        {'Active': 1171406.12, 'Enrolled': 938996.25, 'In Progress': 144.46, 'Unconfirmed': 0.0},
        {'Active': 0.171304698527929, 'Enrolled': 0.06747623495796472, 'In Progress': 0.125, 'Unconfirmed': 0.0}],
       [{'Business': 22748, 'Trade': 5378},
        {'Business': 2273, 'Trade': 655},
        {'Business': 1496237.29, 'Trade': 614309.54},
        {'Business': 0.0999208721645859, 'Trade': 0.12179248791372257}],
       [{'US': 25799, 'CA': 2327}, {'US': 2650, 'CA': 278},
        {'US': 19106

In [58]:
# Examine entire dataset to find percent of buyers to total
aboveAnalysis = dataframe_all['revenue_30'] > 0
aboveAnalysis = dataframe_all[aboveAnalysis]
print(len(aboveAnalysis), "out of a total of", len(dataframe_all), "bought" )
print("{:.2%}".format(len(aboveAnalysis) / len(dataframe_all)), "of all customers bought/converted")
print("The average a customer spend was:", "${:,.2f}".format(aboveAnalysis["revenue_30"].sum().round(2)))
print("The average a customer spend was:", "${:.2f}".format(aboveAnalysis["revenue_30"].sum().round(2)/len(aboveAnalysis)))
print("The average a population spends was:", "${:.2f}".format(aboveAnalysis["revenue_30"].sum().round(2)/len(dataframe_all)))
print("The total a population can spend:", "${:,.2f}".format(aboveAnalysis["revenue_30"].sum().round(2)/len(aboveAnalysis)*len(dataframe_all)))


# The mean score used for all those shitty small categories
pureMean = len(aboveAnalysis) / len(dataframe_all)


2928 out of a total of 28126 bought
10.41% of all customers bought/converted
The average a customer spend was: $2,110,546.84
The average a customer spend was: $720.82
The average a population spends was: $75.04
The total a population can spend: $20,273,647.69


In [236]:
# Data Cleaning

#input mean scores into categories
#for all examples where the data has less than 1000 in the categories, input the pureMean

def replaceCategCols(df):
    replaceDf = pd.DataFrame(df, copy=True)
    for i, cat in enumerate(categorical_data_list):
        for j, unique in enumerate (replaceDf[cat].unique()):
    #         print(unique, analysisDf.values[i][0][unique] < 1000)
            if (analysisDf.values[i][0][unique]< 1000):
                replaceDf[cat].replace(unique, pureMean,inplace=True)
            else:
                replaceDf[cat].replace(unique, analysisDf.values[i][3][unique],inplace=True)
    return replaceDf       

replaceDf = replaceCategCols(dataframe_all)

In [262]:
# analysisDf.values


In [263]:
#WIP

# Data Cleaning
# Feature Group - Order
# Testing my knowledge of what I already know questions
# if you recently bought something, how likely are you going to buy something again?
#     it would depend on if you were an individual consuemr or a b2b
#     1 requires 1 time use the other needs to restock
    
# how do you correlate domain knowledge into your model





# replaceDf[replaceDf.columns[12:37]].loc[:10]
# len(replaceDf.columns[12:37])

replaceDf[replaceDf.columns[12]]
print(pd.crosstab(replaceDf['numbamorder'], replaceDf['convert_30']))
print(pd.crosstab(replaceDf['numselforder'], replaceDf['convert_30']))
# number vs sum
# bam vs self
# total vs average deviation?




convert_30      0     1
numbamorder            
0.0          6987  1254
1.0           933   203
2.0           163    48
3.0            50    20
4.0            15    10
5.0             6     1
6.0             1     3
7.0             2     0
8.0             2     0
13.0            1     0
14.0            1     0
18.0            0     2
convert_30       0    1
numselforder           
0.0            715  116
1.0           4540  636
2.0           1443  267
3.0            621  163
4.0            319  103
5.0            189   63
6.0            129   44
7.0             50   31
8.0             49   27
9.0             34   20
10.0            12    9
11.0            13   12
12.0            11    9
13.0            12    7
14.0             5    9
15.0             4    4
16.0             5    1
17.0             2    1
18.0             1    3
19.0             1    3
20.0             0    2
21.0             0    2
22.0             0    3
23.0             2    1
25.0             2    0
27.0            

In [275]:
# WIP
# Modeling

baseModel = pd.DataFrame(replaceDf, copy = True)
baseModel.fillna(0, inplace=True)

y = pd.DataFrame(baseModel['convert_30'], copy=True)
X = pd.DataFrame(baseModel, copy=True)
X.drop(['convert_30', 'revenue_30'], axis=1, inplace=True)


In [276]:
# splitting data
seed = 100
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)




In [22]:
# On first run

printLog()









{}


In [278]:
#Archive


In [13]:
# Convert categories to continuous
# Use dummy's, dict, or mean?

# FUNCTIONS #

# function for dummy's
def preprocess_features(X):
    
    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data
    for col, col_data in X.iteritems():

        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)
                    
        # Collect the revised columns
        output = output.join(col_data)
        
    return output

# function for dict entire dataset
def handle_non_numerical_data(df):
    columns = df.columns.values
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1
            df[column] = list(map(convert_to_int, df[column]))
    return df

# # function for dict individual column in dataset
# def handle_non_numerical_data_individual(df_col, df):
#     text_digit_vals = {}
#     def convert_to_int(val):
#         return text_digit_vals[val]
#     if df_col.dtype != np.int64 and df_col.dtype != np.float64:
#         column_contents = df_col.values.tolist()
#         unique_elements = set(column_contents)
# #         print(unique_elements)
#         x = 0
#         for unique in unique_elements:
#             if unique not in text_digit_vals:
#                 text_digit_vals[unique] = x
#                 x+=1

#         df_col = list(map(convert_to_int, df_col))
#     print(text_digit_vals)
#     return df_col

def meanPreprocessor(df, colName, colComprRegr, colComprBin):
    print("<< colName >>")
    print(colName)
    print()
    print("<< categories >>")  
    print(df[colName].unique())  # be careful of how it orders caregories  
    print()
    print("<< cat crosstab >>")
    print(pd.crosstab(df[colName], df[colComprBin]))
    print()
    print("<< cat totals >>")
    print(df[colName].value_counts())
#     print(df[colName].value_counts(normalize=True))
    print()
    
    print("<< cat total Revenues >>")
    revTotals = {}
    for cat in range(len(df[colName].unique())): 
        revTotals[df[colName].unique()[cat]] = 0
    print(revTotals)
    
    
#     revTotals["Test"]  = 0
#     revTotals["Test2"]  = 0
#     revTotals["Test2"]  = 4
#     revTotals["Test3"]  = [4]
#     revTotals["Test3"].append(0)

#     print(df[colName].unique()[0])
#     print(len(df))
    for row in range(230):
        if(df[colComprRegr][row]>0):
            print(row, df[colComprRegr][row])
        
        
        
        
        
        
        revTotals[df[colName].unique()[cat]] = 0
#     print(revTotals.rename_axis('1','2,','3'))
#     for row in range(len(df)):
        
        
    
# print(dataframe_all['convert_30'].index.name)
meanPreprocessor(dataframe_all, 'roll_up', 'revenue_30', 'convert_30')
# dataframe_all
# print(dataframe_all.groupby(['revenue_30', 'roll_up']).size().index[4])
# print(dataframe_all.groupby(['revenue_30', 'roll_up']).size().index[4][0]>0)

# X['numorderone']
# dataframe_all['numorderone']

# X['roll_up'].unique()

<< colName >>
roll_up

<< categories >>
['Onboarding' 'Retention' 'Unmanaged']

<< cat crosstab >>
convert_30      0     1
roll_up                
Onboarding   8490  1496
Retention    2877   515
Unmanaged   13831   917

<< cat totals >>
Unmanaged     14748
Onboarding     9986
Retention      3392
Name: roll_up, dtype: int64

<< cat total Revenues >>
{'Onboarding': 0, 'Retention': 0, 'Unmanaged': 0}
7 323.97
24 34.94
34 4398.06
40 1905.63
42 2022.69
43 489.99
49 7632.7
55 287.99
58 895.89
61 357.95
62 699.99
68 2840.27
78 63.99
91 57.99
97 111.98
99 468.14
100 927.94
110 133.98
115 209.94
118 376.1
121 93.06
122 164.5713
124 13972.12
128 246.64
132 1129.3
138 183.99
141 255.16
150 162.93
153 1309.83
154 30.98
158 114.06
169 203.75
170 468.03
189 610.44
197 1072.08
201 102.98
202 40.62
203 228.99
207 144.52
208 173.44
211 380.96
223 372.81


In [14]:
       
#     for uniqueCat in range(len(df[colName].unique())):

#     print(df.groupby([colComprRegr, colName]).size())
#     placeholder = df.groupby([colComprRegr, colName]).size().index[0]
#     for row in range(100): # len(df)
#         print(df.groupby([colComprRegr, colName]).size().index)
#         print(placeholder[0])
#         if(placeholder[0]>0):
#             print(placeholder[row])
     
            
            
#             print(df.groupby([colComprRegr, colName]).size().index[0][0]>0)
#             print(revTotals)
#         print(df[colName].unique()[uniqueCat])
    
#     print("done")
#     print(df.groupby(['revenue_30', 'roll_up']).size().index[4])
    
    
    
    
#     print(dataframe_all.groupby(['revenue_30', 'roll_up']).size().index[4][0]>0)
    
#     print("cnt rev ea:", )
#     print()
#     print(X[colName].value_counts())
#     print(df[colName].value_counts())
#     print(df[colName].value_counts()[1])
#     print(len(column.unique()))
#     print(column.value_counts())

In [16]:
# dataframe_all
# X
# y
cat = X['numorderone']
# cat = X['enrollmentmethod']

# cat.describe()
# len(cat.unique())
# type(cat.unique())
# cat.dtype

# type(cat)
# cat.index.values
# print(cat.unique())
# cat.value_counts()
# cat.isna().sum()
dataframe_all.iloc[1].isna().sum()


20

In [17]:
# FUNCTION CALLS

# All the categorical columns from the dataset
feat_cols = ['roll_up','currentstatus','companytypegroup',
              'team','customersource','accrole',
              'num_employees','num_purchases_year',
              'cost_purchases_year','enrollmentmethod']

# Create Dataframes for each type way of addressing Categorical data
dummy_df = pd.DataFrame(dataframe_all, copy=True)
dict_df = pd.DataFrame(dataframe_all, copy=True)
mean_df = pd.DataFrame(dataframe_all, copy=True)

# DUMMY
# dummy_df = preprocess_features(dummy_df)
# print ("Processed feature columns ({} total features):\n{}".format(len(dummy_df.columns), list(dummy_df.columns)))

handle_non_numerical_data(dict_df)
dict_df.head()







Unnamed: 0,revenue_30,roll_up,currentstatus,companytypegroup,team,customersource,accrole,num_employees,num_purchases_year,cost_purchases_year,enrollmentmethod,numorderone,numorderthreeone,numorderseventhree,numorderthirtyseven,numordersixtythirty,numorderyearsixty,sumrevone,sumrevthreeone,sumrevseventhree,sumrevthirtyseven,sumrevsixtythirty,sumrevyearsixty,numbamorder,numselforder,...,totalcalldurationthreeone,totalcalldurationseventhree,totalcalldurationthirtyseven,totalcalldurationsixtythirty,totalcalldurationyearsixty,decmakerflagone,decmakerflagsevenone,decmakerflagfourteenseven,decmakerflagthirtyfourteen,percsecondsinbound,percemailopenedone,percemailopenedthreeone,percemailopenedseventhree,percemailopenedthirtyseven,percemailopenedsixtythirty,percemailopenedyearsixty,percemailclickedone,percemailclickedthreeone,percemailclickedseventhree,percemailclickedthirtyseven,percemailclickedsixtythirty,percemailclickedyearsixty,currentapplicability,numemaillist,dayssinceenrollment
0,0.0,1,2,0,1,0,1,4,3,1,5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,59.99,0.0,0.0,0.0,1.0,...,0.0,0.0,43.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.8,0.809524,0.0,0.0,0.0,0.0,0.0,0.190476,0.0,0.0,3.0,1.0,17
1,0.0,1,2,0,1,12,1,2,2,0,5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,126.48,0.0,1.0,...,0.0,55.0,0.0,0.0,78.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.02,0.0,0.0,0.0,0.04,0.0,0.02,3.0,1.0,70
2,0.0,1,2,0,1,12,1,1,4,5,5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,237.98,0.0,0.0,0.0,1.0,...,0.0,0.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.113636,0.086207,0.106195,0.0,0.0,0.0,0.0,0.0,0.00177,13.0,1.0,27
3,0.0,1,0,0,1,3,1,4,4,1,5,,,,,,,,,,,,,,,...,0.0,0.0,67.0,0.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.1,,,25
4,0.0,1,0,0,1,15,1,0,5,3,5,,,,,,,,,,,,,,,...,0.0,0.0,50.0,0.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.047619,0.058824,0.072072,0.0,0.0,0.0,0.009524,0.016807,0.003465,8.333333,3.0,33


In [18]:
# Separate into feature set and target variable
#FTR = Full Time Result (H=Home Win, D=Draw, A=Away Win)


# Standardising the data.
from sklearn.preprocessing import scale

#Center to the mean and component wise scale to unit variance.
cols = [['HTGD','ATGD','HTP','ATP','DiffLP']]
for col in cols:
    X_all[col] = scale(X_all[col])

NameError: name 'X_all' is not defined

In [None]:
decision_tree = DecisionTreeClassifier(random_state=seed)

#train classifier
decision_tree.fit(X_train, y_train)

#predict output
decision_tree_y_pred = decision_tree.predict(X_test)
decision_tree_y_pred_prob = decision_tree.predict_proba(X_test)

#evaluation
decision_tree_accuracy = accuracy_score(y_test, decision_tree_y_pred)
decision_tree_logloss = log_loss(y_test, decision_tree_y_pred_prob)

print(" == Decision Tree ==")
print("Accuracy: {0:.2f}".format(decision_tree_accuracy))
print("Log loss: {0:.2f}".format(decision_tree_logloss))
print("Number of nodes created: {}".format(decision_tree.tree_.node_count))


