In [9]:
import numpy as np
import pandas as pd


(a) Examine the values of each attribute and Select a set of attributes only that would affect to predict 
future bike buyers to create your input for data mining algorithms. Remove all the unnecessary 
attributes. (Select features just by analysis). 
(b) Create a new Data Frame with the selected attributes only. 

(c) Determine a Data value type (Discrete, or Continuous, then Nominal, Ordinal, Interval, Ratio) of 
each attribute in your selection to identify preprocessing tasks to create input for your data mining. 

In [10]:
customers = pd.read_csv("AWCustomers.csv")
sales = pd.read_csv("AWSales.csv")

dfm = pd.merge(customers, sales, on="CustomerID")
dfm['BirthDate'] = pd.to_datetime(dfm['BirthDate'], errors='coerce')
dfm['Age'] = (pd.to_datetime("today") - dfm['BirthDate']).dt.days // 365

cols = ["Age","Education","Occupation","Gender","MaritalStatus","HomeOwnerFlag",
        "NumberCarsOwned","NumberChildrenAtHome","TotalChildren","YearlyIncome","BikeBuyer"]
df = dfm[cols].copy()
df.head()


Unnamed: 0,Age,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,BikeBuyer
0,37,Bachelors,Clerical,M,M,1,3,0,1,81916,1
1,53,Partial College,Clerical,M,M,1,2,1,2,81076,1
2,39,Bachelors,Clerical,F,S,0,3,0,0,86387,1
3,47,Partial College,Skilled Manual,M,M,1,2,1,2,61481,1
4,50,Partial College,Skilled Manual,M,S,1,1,0,0,51804,1


In [11]:
print(df.isnull().sum())

df = df.fillna({
    'YearlyIncome': df['YearlyIncome'].median(),
    'Education': df['Education'].mode()[0],
    'Occupation': df['Occupation'].mode()[0],
    'Age': df['Age'].median()
})

print(df.isnull().sum())

Age                     0
Education               0
Occupation              0
Gender                  0
MaritalStatus           0
HomeOwnerFlag           0
NumberCarsOwned         0
NumberChildrenAtHome    0
TotalChildren           0
YearlyIncome            0
BikeBuyer               0
dtype: int64
Age                     0
Education               0
Occupation              0
Gender                  0
MaritalStatus           0
HomeOwnerFlag           0
NumberCarsOwned         0
NumberChildrenAtHome    0
TotalChildren           0
YearlyIncome            0
BikeBuyer               0
dtype: int64


In [12]:
from sklearn.preprocessing import MinMaxScaler

num_cols = ['Age','YearlyIncome','HomeOwnerFlag','NumberCarsOwned','NumberChildrenAtHome','TotalChildren']
mm = MinMaxScaler()
df_minmax = pd.DataFrame(mm.fit_transform(df[num_cols]), columns=[c+"_minmax" for c in num_cols], index=df.index)
df = pd.concat([df, df_minmax], axis=1)

In [13]:
df['IncomeBin'] = pd.qcut(df['YearlyIncome'], q=4, labels=["Low","Medium","High","Very High"])


In [14]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
df_std = pd.DataFrame(ss.fit_transform(df[num_cols]), columns=[c+"_std" for c in num_cols], index=df.index)
df = pd.concat([df, df_std], axis=1)

In [15]:
cat_cols = ['Education','Occupation','Gender','MaritalStatus','IncomeBin']
df_ohe = pd.get_dummies(df, columns=cat_cols, drop_first=True)
df_ohe.head()

Unnamed: 0,Age,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,BikeBuyer,Age_minmax,YearlyIncome_minmax,HomeOwnerFlag_minmax,...,Education_Partial High School,Occupation_Management,Occupation_Manual,Occupation_Professional,Occupation_Skilled Manual,Gender_M,MaritalStatus_S,IncomeBin_Medium,IncomeBin_High,IncomeBin_Very High
0,37,1,3,0,1,81916,1,0.183099,0.496842,1.0,...,False,False,False,False,False,True,False,False,True,False
1,53,1,2,1,2,81076,1,0.408451,0.489453,1.0,...,False,False,False,False,False,True,False,False,True,False
2,39,0,3,0,0,86387,1,0.211268,0.536172,0.0,...,False,False,False,False,False,False,True,False,True,False
3,47,1,2,1,2,61481,1,0.323944,0.317083,1.0,...,False,False,False,False,True,True,False,True,False,False
4,50,1,1,0,0,51804,1,0.366197,0.231958,1.0,...,False,False,False,False,True,True,True,False,False,False


In [16]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard
import numpy as np

num_std_cols = [c for c in df_ohe.columns if c.endswith('_std')]
bin_cols = [c for c in df_ohe.columns if c.startswith(('Education_','Occupation_','Gender_','MaritalStatus_','IncomeBin_'))]

x_num = df_ohe.loc[df_ohe.index[0], num_std_cols].values.reshape(1,-1)
y_num = df_ohe.loc[df_ohe.index[1], num_std_cols].values.reshape(1,-1)
cos_sim = float(cosine_similarity(x_num, y_num)[0,0])

x_bin = df_ohe.loc[df_ohe.index[0], bin_cols].astype(bool).values
y_bin = df_ohe.loc[df_ohe.index[1], bin_cols].astype(bool).values
jacc_sim = 1 - jaccard(x_bin, y_bin)
sm_sim = (x_bin == y_bin).mean()

print("Cosine Similarity (numeric):", cos_sim)
print("Jaccard Similarity (binary):", jacc_sim)
print("Simple Matching Similarity (binary):", sm_sim)


Cosine Similarity (numeric): 0.2526926842190864
Jaccard Similarity (binary): 0.6666666666666667
Simple Matching Similarity (binary): 0.9230769230769231


In [17]:
if 'CommuteDistance' in customers.columns:
    mapping = {"0-1 Miles":1, "1-2 Miles":2, "2-5 Miles":3, "5-10 Miles":4, "10+ Miles":5}
    customers['CommuteDistance'] = customers['CommuteDistance'].map(mapping)
    tmp = customers[['CustomerID','CommuteDistance','YearlyIncome']].dropna()
    print(tmp[['CommuteDistance','YearlyIncome']].corr(method='pearson'))
else:
    print("CommuteDistance not available in dataset.")

CommuteDistance not available in dataset.
