In [0]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Description Of Data
In 1998, the Adventure Works Cycles company collected a large volume of data about their existing customers, including demographic features and information about purchases they have made. The company is particularly interested in analyzing customer data to determine any apparent relationships between demographic features known about the customers and the likelihood of a customer purchasing a bike. Additionally, the analysis should endeavor to determine whether a customer's average monthly spend with the company can be predicted from known customer characteristics.

In [0]:
#importing pakages
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt

Creating a classification model that predicts whether or not a customer will purchase a bike. The model should predict bike purchasing for new customers for whom no information about average monthly spend or previous bike purchases is available.

In [0]:
dataset = pd.read_csv('gdrive/My Drive/Colab Notebooks/AW_BikeBuyer.csv')
dataset2 = pd.read_csv('gdrive/My Drive/Colab Notebooks/AW_AveMonthSpend.csv')
dataset3 = pd.read_csv('gdrive/My Drive/Colab Notebooks/AdvWorksCusts.csv')


In [0]:
dataset3.drop_duplicates(subset ="AddressLine1", 
                     keep = False, inplace = True) 

In [0]:
#joining bikebuyer dataset with average monthly test spend dataset
df = pd.concat((dataset['CustomerID'],dataset['BikeBuyer'],dataset2['AveMonthSpend']),axis = 1)
df.head()

Unnamed: 0,CustomerID,BikeBuyer,AveMonthSpend
0,11000,0,89
1,11001,1,117
2,11002,0,123
3,11003,0,50
4,11004,1,95


In [0]:
df1 = df.join(dataset3,how='right',lsuffix='_left', rsuffix='_right')
df1.head()

Unnamed: 0,CustomerID_left,BikeBuyer,AveMonthSpend,CustomerID_right,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,CountryRegionName,PostalCode,PhoneNumber,BirthDate,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome
4,11004,1,95,11004,,Elizabeth,,Johnson,,7553 Harness Circle,,Wollongong,New South Wales,Australia,2500,1 (11) 500 555-0131,1968-08-08,Bachelors,Professional,F,S,1,4,5,5,92771
10,11010,0,49,11010,,Jacquelyn,C,Suarez,,7800 Corrinne Court,,East Brisbane,Queensland,Australia,4169,1 (11) 500 555-0169,1964-02-06,Bachelors,Professional,F,S,0,1,0,0,81294
13,11013,0,84,11013,,Ian,M,Jenkins,,7902 Hudson Ave.,,Lebanon,Oregon,United States,97355,817-555-0185,1968-08-06,Bachelors,Management,M,M,1,3,0,2,115859
14,11014,0,49,11014,,Sydney,,Bennett,,9011 Tank Drive,,Redmond,Washington,United States,98052,431-555-0156,1968-05-09,Bachelors,Management,F,S,0,3,0,3,105157
17,11019,1,64,11019,,Luke,L,Lal,,7832 Landing Dr,,Langley,British Columbia,Canada,V3A 4R2,262-555-0112,1978-03-07,High School,Skilled Manual,M,S,0,2,0,0,49455


In [0]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7678 entries, 4 to 16507
Data columns (total 26 columns):
CustomerID_left         7678 non-null int64
BikeBuyer               7678 non-null int64
AveMonthSpend           7678 non-null int64
CustomerID_right        7678 non-null int64
Title                   42 non-null object
FirstName               7678 non-null object
MiddleName              4414 non-null object
LastName                7678 non-null object
Suffix                  1 non-null object
AddressLine1            7678 non-null object
AddressLine2            138 non-null object
City                    7678 non-null object
StateProvinceName       7678 non-null object
CountryRegionName       7678 non-null object
PostalCode              7678 non-null object
PhoneNumber             7678 non-null object
BirthDate               7678 non-null object
Education               7678 non-null object
Occupation              7678 non-null object
Gender                  7678 non-null object
Ma

In [0]:
#dropping independent variales that do not affect dependent variable
data_new = df1.drop(['BikeBuyer','AveMonthSpend','CustomerID_left','CustomerID_right','Title','FirstName','LastName','MiddleName','Suffix','AddressLine1','AddressLine2','City','StateProvinceName','CountryRegionName','PostalCode','PhoneNumber','BirthDate','Education','Occupation'],axis=1)
data_new.head()

Unnamed: 0,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome
4,F,S,1,4,5,5,92771
10,F,S,0,1,0,0,81294
13,M,M,1,3,0,2,115859
14,F,S,0,3,0,3,105157
17,M,S,0,2,0,0,49455


In [0]:
#storing new data in X and Y 
X = np.array(data_new)
X[:,1]

array(['S', 'S', 'M', ..., 'S', 'S', 'M'], dtype=object)

In [0]:
Y = np.array(df1['BikeBuyer'])
X[:,0]

array(['F', 'F', 'M', ..., 'F', 'M', 'M'], dtype=object)

In [0]:
#label encoding on X to covert Sex and Marital status values into binary
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:,0] = labelencoder_X.fit_transform(X[:,0])
X[:,0]


array([0, 0, 1, ..., 0, 1, 1], dtype=object)

In [0]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_Y = LabelEncoder()
X[:,1] = labelencoder_Y.fit_transform(X[:,1])
X[:,1]

array([1, 1, 0, ..., 1, 1, 0], dtype=object)

In [0]:
#Creating training datasets
X_train = X 
y_train = df1['BikeBuyer']

In [0]:
#importing random forest classifier 
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train,y_train)
y_pred_rf=rf.predict(X_train)

In [0]:
from sklearn import metrics
print('Accuracy',metrics.accuracy_score(y_train,y_pred_rf))

Accuracy 1.0


In [0]:
#importing test data set
test_data = pd.read_csv('gdrive/My Drive/Colab Notebooks/AW_test.csv')

In [0]:
CustomerID = test_data['CustomerID']

In [0]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 23 columns):
CustomerID              500 non-null int64
Title                   4 non-null object
FirstName               500 non-null object
MiddleName              284 non-null object
LastName                500 non-null object
Suffix                  1 non-null object
AddressLine1            500 non-null object
AddressLine2            13 non-null object
City                    500 non-null object
StateProvinceName       500 non-null object
CountryRegionName       500 non-null object
PostalCode              500 non-null object
PhoneNumber             500 non-null object
BirthDate               500 non-null object
Education               500 non-null object
Occupation              500 non-null object
Gender                  500 non-null object
MaritalStatus           500 non-null object
HomeOwnerFlag           500 non-null int64
NumberCarsOwned         500 non-null int64
NumberChildrenAtHome   

In [0]:
#dropping independent variales that do not affect dependent variables
data_new_test = test_data.drop(['CustomerID','Title','FirstName','LastName','MiddleName','Suffix','AddressLine1','AddressLine2','City','StateProvinceName','CountryRegionName','PostalCode','PhoneNumber','BirthDate','Education','Occupation'],axis=1)
data_new_test.head()

Unnamed: 0,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome
0,F,S,0,2,0,5,86931
1,M,M,1,2,2,4,100125
2,F,M,1,2,0,4,103985
3,M,M,1,0,0,4,127161
4,F,M,1,1,2,2,21876


In [0]:
#storing dataset in Xnew
Xnew = np.array(data_new_test)
Xnew[0:5]

array([['F', 'S', 0, 2, 0, 5, 86931],
       ['M', 'M', 1, 2, 2, 4, 100125],
       ['F', 'M', 1, 2, 0, 4, 103985],
       ['M', 'M', 1, 0, 0, 4, 127161],
       ['F', 'M', 1, 1, 2, 2, 21876]], dtype=object)

In [0]:
#label encoding on Sex and Marital Status
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
Xnew[:,0] = labelencoder_X.fit_transform(Xnew[:,0])
Xnew[:,0]

array([0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,

In [0]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
Xnew[:,1] = labelencoder_X.fit_transform(Xnew[:,1])
Xnew[:,1]

array([1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,

In [0]:
# predicting values for Xnew
y_pred = rf.predict(Xnew)

In [0]:
y_pred

array([0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,

In [0]:
final_data = pd.DataFrame({'CustomerID':CustomerID, 'BikeBuyer':y_pred})
final_data.head()

Unnamed: 0,CustomerID,BikeBuyer
0,18988,0
1,29135,0
2,12156,0
3,13749,0
4,27780,0


In [0]:
final_data.to_csv('gdrive/My Drive/Colab Notebooks/challenge2.2.csv')