In [1]:
import pandas as pd


pd.options.display.max_columns = None
pd.set_option('display.max_colwidth', -1)
pd.set_option('mode.chained_assignment', None)

WORK_CUST = './AdvWorksCusts.csv'
AVE_SPEND = './AW_AveMonthSpend.csv'
BIKE_BUYER = './AW_BikeBuyer.csv'
TEST = './AW_test.csv'

LABEL = 'BikeBuyer'

In [2]:
work_cust_df = pd.read_csv(WORK_CUST)
ave_spend_df = pd.read_csv(AVE_SPEND)
bike_buyer_df = pd.read_csv(BIKE_BUYER)

val = pd.read_csv(TEST)


### Feature Engineering

In [3]:
#Keep the last record for each unique customerID
work_cust_df = work_cust_df.drop_duplicates(subset='CustomerID', keep='last')
ave_spend_df = ave_spend_df.drop_duplicates(subset='CustomerID', keep='last')
bike_buyer_df = bike_buyer_df.drop_duplicates(subset='CustomerID',keep='last')

In [4]:
# Joing 3 dataframes into one DF
all_df = pd.merge(work_cust_df, ave_spend_df, left_on="CustomerID", right_on="CustomerID", how='left')
all_df = pd.merge(all_df, bike_buyer_df, left_on="CustomerID", right_on="CustomerID", how='left')

In [5]:
all_df.head(2)

Unnamed: 0,CustomerID,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,CountryRegionName,PostalCode,PhoneNumber,BirthDate,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,AveMonthSpend,BikeBuyer
0,11000,,Jon,V,Yang,,3761 N. 14th St,,Rockhampton,Queensland,Australia,4700,1 (11) 500 555-0162,1966-04-08,Bachelors,Professional,M,M,1,0,0,2,137947,89,0
1,11001,,Eugene,L,Huang,,2243 W St.,,Seaford,Victoria,Australia,3198,1 (11) 500 555-0110,1965-05-14,Bachelors,Professional,M,S,0,1,3,3,101141,117,1


In [6]:
val.head(2)

Unnamed: 0,CustomerID,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,CountryRegionName,PostalCode,PhoneNumber,BirthDate,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome
0,18988,,Courtney,A,Baker,,8727 Buena Vista Ave.,,Fremont,California,United States,94536,133-555-0128,1/5/1945,Bachelors,Management,F,S,0,2,0,5,86931
1,29135,,Adam,C,Allen,,3491 Cook Street,,Haney,British Columbia,Canada,V2W 1W2,252-555-0173,10/4/1964,Bachelors,Skilled Manual,M,M,1,2,2,4,100125


In [7]:
keep_cols = ['CustomerID', 'CountryRegionName', 'BirthDate', 
             'Education', 'Occupation',
             'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned',
             'NumberChildrenAtHome', 'TotalChildren', 
             'YearlyIncome', 'BikeBuyer']

keep_cols_val = ['CustomerID', 'CountryRegionName', 'BirthDate', 
             'Education', 'Occupation',
             'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned',
             'NumberChildrenAtHome', 'TotalChildren', 
             'YearlyIncome']

In [8]:
df = all_df[keep_cols]
val = val[keep_cols_val]

In [9]:
from datetime import datetime
from datetime import date


def calculate_age(born):
    try:
        born = datetime.strptime(born, "%Y-%m-%d").date()
        particular_date = datetime(1998, 1, 1)
        return particular_date.year - born.year - ((particular_date.month, particular_date.day) < (born.month, born.day))
    except:
        return 0

    
def calculate_age_val(born):
    try:
        born = datetime.strptime(born, "%d/%m/%Y").date()
        particular_date = datetime(1998, 1, 1)
        return particular_date.year - born.year - ((particular_date.month, particular_date.day) < (born.month, born.day))
    except:
        return 0   
    
    
def age_cat(age):
    if age < 25:
        return 'under 25'
    elif age >= 25 and age <= 45:
        
        return 'between_25_and_45'
    
    elif age > 45 and age <= 55:
        return 'between_44_and_55'
    
    elif age > 55:
        return 'over_55'


In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from pandas.api.types import is_numeric_dtype

from sklearn import preprocessing

def preprocess(df):
    #create age category features
    df['age_cat'] = df['age'].apply(age_cat)
    df = df.drop('BirthDate',axis = 1)
    
    #One hot encode categorical features
    df = pd.concat([df,pd.get_dummies(df['age_cat'], prefix='category')],axis=1)
    df.drop(['age_cat'],axis=1, inplace=True)

    df = pd.concat([df,pd.get_dummies(df['CountryRegionName'], prefix='country')],axis=1)
    df.drop(['CountryRegionName'],axis=1, inplace=True)    
    
    df = pd.concat([df,pd.get_dummies(df['Education'], prefix='education')],axis=1)
    df.drop(['Education'],axis=1, inplace=True)  
    
    df = pd.concat([df,pd.get_dummies(df['Occupation'], prefix='job')],axis=1)
    df.drop(['Occupation'],axis=1, inplace=True)
    
    df = pd.concat([df,pd.get_dummies(df['MaritalStatus'], prefix='job')],axis=1)
    df.drop(['MaritalStatus'],axis=1, inplace=True)  
    
    #Fix male/female
    df['Gender'] = df['Gender'].apply(lambda x: 'male' if x == 'M' else 'female')
    
    df = pd.concat([df,pd.get_dummies(df['Gender'], prefix='gender')],axis=1)
    df.drop(['Gender'],axis=1, inplace=True)    
    
    
    #Drop any non numeric rows in Income
    df = df[df.YearlyIncome != 'Professional']
    
    df['YearlyIncome'] = df['YearlyIncome'].values.astype(float)

    # Create a minimum and maximum processor object
    scaler = preprocessing.MinMaxScaler()

    df[['YearlyIncome']] = scaler.fit_transform(df[['YearlyIncome']])

    df.drop(['CustomerID'],axis=1, inplace=True)    
    
    return df
    



In [11]:
df['age'] = df['BirthDate'].apply(calculate_age)
val['age'] = val['BirthDate'].apply(calculate_age_val)

df = preprocess(df)
val = preprocess(val)

  result = method(y)
  result = method(y)


In [12]:
df.head(1)

Unnamed: 0,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,BikeBuyer,age,category_between_25_and_45,category_between_44_and_55,category_over_55,category_under 25,country_Australia,country_Canada,country_France,country_Germany,country_United Kingdom,country_United States,education_Bachelors,education_Graduate Degree,education_High School,education_Partial College,education_Partial High School,job_Clerical,job_Management,job_Manual,job_Professional,job_Skilled Manual,job_M,job_S,gender_female,gender_male
0,1,0,0,2,0.686872,0,31,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1


In [13]:
val.head(1)

Unnamed: 0,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,age,category_between_25_and_45,category_between_44_and_55,category_over_55,category_under 25,country_Australia,country_Canada,country_France,country_Germany,country_United Kingdom,country_United States,education_Bachelors,education_Graduate Degree,education_High School,education_Partial College,education_Partial High School,job_Clerical,job_Management,job_Manual,job_Professional,job_Skilled Manual,job_M,job_S,gender_female,gender_male
0,0,2,0,5,0.437975,52,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,1,0


### Modelling

In [546]:
feature_cols = ['HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome',
               'TotalChildren', 'YearlyIncome', 'age',
               'category_between_25_and_45', 'category_between_44_and_55',
               'category_over_55', 'category_under 25', 'country_Australia',
               'country_Canada', 'country_France', 'country_Germany',
               'country_United Kingdom', 'country_United States',
               'education_Bachelors ', 'education_Graduate Degree',
               'education_High School', 'education_Partial College',
               'education_Partial High School', 'job_Clerical', 'job_Management',
               'job_Manual', 'job_Professional', 'job_Skilled Manual', 'job_M',
               'job_S', 'gender_female', 'gender_male']

In [549]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

#Balance Dataset
# Separate majority and minority classes
df_majority = df[df.BikeBuyer==0]
df_minority = df[df.BikeBuyer==1]
 
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,
                                 n_samples=len(df_minority)) 
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
 
# Display new class counts
print(df_downsampled.BikeBuyer.value_counts())





X = df_downsampled[feature_cols] 
y = df_downsampled[LABEL]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

1    5455
0    5455
Name: BikeBuyer, dtype: int64


## SVM

In [550]:
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [551]:
y_pred = svclassifier.predict(X_test)


In [552]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[846 250]
 [327 759]]
              precision    recall  f1-score   support

           0       0.72      0.77      0.75      1096
           1       0.75      0.70      0.72      1086

    accuracy                           0.74      2182
   macro avg       0.74      0.74      0.74      2182
weighted avg       0.74      0.74      0.74      2182



## Neural Network

In [578]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100,500,200),max_iter=500, learning_rate_init=0.001)


In [579]:
mlp.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100, 500, 200), learning_rate='constant',
              learning_rate_init=0.001, max_iter=500, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [580]:
predictions = mlp.predict(X_test)

In [581]:
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[850 246]
 [384 702]]
              precision    recall  f1-score   support

           0       0.69      0.78      0.73      1096
           1       0.74      0.65      0.69      1086

    accuracy                           0.71      2182
   macro avg       0.71      0.71      0.71      2182
weighted avg       0.71      0.71      0.71      2182



## Inference

In [582]:
svm_preds = svclassifier.predict(val)


In [523]:
infer_pred = mlp.predict(val)