In [10]:
import pandas as pd
import kagglehub

data = kagglehub.dataset_download("jahias/microsoft-adventure-works-cycles-customer-data",path="AWCustomers.csv")
df = pd.read_csv(data)

In [11]:
print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18361 entries, 0 to 18360
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   CustomerID            18361 non-null  int64 
 1   Title                 101 non-null    object
 2   FirstName             18361 non-null  object
 3   MiddleName            10572 non-null  object
 4   LastName              18361 non-null  object
 5   Suffix                3 non-null      object
 6   AddressLine1          18361 non-null  object
 7   AddressLine2          311 non-null    object
 8   City                  18361 non-null  object
 9   StateProvinceName     18361 non-null  object
 10  CountryRegionName     18361 non-null  object
 11  PostalCode            18361 non-null  object
 12  PhoneNumber           18361 non-null  object
 13  BirthDate             18361 non-null  object
 14  Education             18361 non-null  object
 15  Occupation            18361 non-null

Part I:

In [21]:
# a)
data = df[['Education','Occupation','Gender','MaritalStatus','HomeOwnerFlag','NumberCarsOwned','TotalChildren','YearlyIncome']]

# b)
df_new = data.copy()

# c)
# Education -> Discrete (Ordinal)
# Occupation -> Discrete (Nominal)
# Gender -> Discrete (Nominal)
# MaritalStatus -> Discrete (Nominal)
# HomeOwnerFlag -> Discrete (Binary Nominal)
# NumberCarsOwned -> Discrete (Ordinal)
# TotalChildren -> Discrete (Ordinal)
# YearlyIncome -> Continuous (Ratio)

Part II:

In [24]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.impute import SimpleImputer

# a)
imputer = SimpleImputer(strategy='mean')
numerical_cols = ['HomeOwnerFlag','NumberCarsOwned','TotalChildren','YearlyIncome']
data[numerical_cols] = imputer.fit_transform(data[numerical_cols])

# b)
scaler = MinMaxScaler()
data['YearlyIncome'] = scaler.fit_transform(data[['YearlyIncome']])

# c)
kbins_income = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
data['YearlyIncome'] = kbins_income.fit_transform(data[['YearlyIncome']])

# d)
std_scaler = StandardScaler()
data['YearlyIncome'] = std_scaler.fit_transform(data[['YearlyIncome']])

# e)
categorical_features = ['Gender', 'MaritalStatus', 'Education', 'Occupation']
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_data = encoder.fit_transform(data[categorical_features])
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_features))

df_final = pd.concat([data.drop(columns=categorical_features), encoded_df], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[numerical_cols] = imputer.fit_transform(data[numerical_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['YearlyIncome'] = scaler.fit_transform(data[['YearlyIncome']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['YearlyIncome'] = kbins_income.fit_transform(data[['YearlyIncome'

Part III:

In [23]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard
from scipy.stats import pearsonr

# a)
obj1 = df_final.iloc[0].values.reshape(1, -1)
obj2 = df_final.iloc[1].values.reshape(1, -1)

simple_matching = 1 - np.mean(obj1 != obj2)
jaccard_sim = 1 - jaccard(obj1.flatten(), obj2.flatten())
cos_sim = cosine_similarity(obj1, obj2)[0][0]

# b)
commute_mapping = {
    '0-1 Miles': 1, '1-2 Miles': 2, '2-5 Miles': 3,
    '5-10 Miles': 4, '10+ Miles': 5
}
data['CommuteDistance_num'] = data['CommuteDistance'].map(commute_mapping)
corr, p_value = pearsonr(data['CommuteDistance_num'], data['YearlyIncome'])
print("Simple Matching Similarity:", simple_matching)
print("Jaccard Similarity:", jaccard_sim)
print("Cosine Similarity:", cos_sim)
print("Correlation (Commute Distance vs Yearly Income):", corr, " | p-value:", p_value)


KeyError: 'CommuteDistance'