In [1]:
import pandas as pd

# Load the customer dataset; the CSV must already be present in the working
# directory (download it first).
df = pd.read_csv("AWCustomers.csv")

# Attributes kept for the preprocessing steps that follow.
features = [
    'MaritalStatus', 'Gender', 'YearlyIncome', 'TotalChildren',
    'NumberChildrenAtHome', 'Education', 'Occupation',
    'HomeOwnerFlag', 'NumberCarsOwned',
]

# Take an explicit copy: later cells assign columns on df_selected, and writing
# into a bare df[features] selection can raise SettingWithCopyWarning. Copying
# makes df_selected an independent frame that is safe to modify.
df_selected = df[features].copy()
print(df_selected.head())


  MaritalStatus Gender  YearlyIncome  TotalChildren  NumberChildrenAtHome  \
0             M      M         81916              1                     0   
1             M      M         81076              2                     1   
2             S      F         86387              0                     0   
3             M      M         61481              2                     1   
4             S      M         51804              0                     0   

         Education      Occupation  HomeOwnerFlag  NumberCarsOwned  
0        Bachelors        Clerical              1                3  
1  Partial College        Clerical              1                2  
2        Bachelors        Clerical              0                3  
3  Partial College  Skilled Manual              1                2  
4  Partial College  Skilled Manual              1                1  


In [None]:
# Report how many missing values each selected column holds ...
missing_per_column = df_selected.isnull().sum()
print(missing_per_column)

# ... then keep only the rows that are complete in every selected column.
df_selected = df_selected.dropna()

MaritalStatus           0
Gender                  0
YearlyIncome            0
TotalChildren           0
NumberChildrenAtHome    0
Education               0
Occupation              0
HomeOwnerFlag           0
NumberCarsOwned         0
dtype: int64


In [3]:
from sklearn.preprocessing import MinMaxScaler

# Columns to normalize with MinMaxScaler; the fitted values replace the
# originals in df_selected.
minmax_columns = ['YearlyIncome', 'TotalChildren', 'NumberChildrenAtHome', 'NumberCarsOwned']

scaler = MinMaxScaler()
df_selected[minmax_columns] = scaler.fit_transform(df_selected[minmax_columns])


In [4]:
# Discretize income into four equal-frequency (quartile) bins. The bins are
# computed from the YearlyIncome column of df (the frame as loaded), not from
# df_selected, whose YearlyIncome was rescaled in an earlier cell; the result
# is aligned to df_selected by index on assignment.
income_labels = ["Low", "Medium", "High", "Very High"]
income_bins = pd.qcut(df['YearlyIncome'], q=4, labels=income_labels)
df_selected['Income_Bin'] = income_bins


In [5]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on YearlyIncome and write the result back.
# Note: this overwrites the min-max-scaled YearlyIncome values produced earlier.
scaler_std = StandardScaler()
income_frame = df_selected[['YearlyIncome']]
df_selected[['YearlyIncome']] = scaler_std.fit_transform(income_frame)

In [6]:
# One-hot encode the nominal attributes; drop_first drops one indicator column
# per attribute. Columns not listed here (including Income_Bin) are left as-is.
categorical_columns = ['MaritalStatus', 'Gender', 'Education', 'Occupation']
df_transformed = pd.get_dummies(
    df_selected,
    columns=categorical_columns,
    drop_first=True,
)
print(df_transformed.head())


   YearlyIncome  TotalChildren  NumberChildrenAtHome  HomeOwnerFlag  \
0      0.298555       0.333333              0.000000              1   
1      0.271180       0.666667              0.333333              1   
2      0.444261       0.000000              0.000000              0   
3     -0.367401       0.666667              0.333333              1   
4     -0.682765       0.000000              0.000000              1   

   NumberCarsOwned Income_Bin  MaritalStatus_S  Gender_M  \
0              0.6       High            False      True   
1              0.4       High            False      True   
2              0.6       High             True     False   
3              0.4     Medium            False      True   
4              0.2        Low             True      True   

   Education_Graduate Degree  Education_High School  \
0                      False                  False   
1                      False                  False   
2                      False                  F

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# One-hot encode every categorical column of df_selected (no explicit column
# list is given, so get_dummies picks the categorical columns itself);
# drop_first drops one indicator per attribute.
df_encoded = pd.get_dummies(df_selected, drop_first=True)

# Pick the first two customers as 1 x n_features row vectors.
# The encoded frame mixes boolean indicator columns with float columns, so a
# plain row extraction yields an object-dtype array. Cast explicitly to float
# so the similarity is computed on a proper numeric vector instead of relying
# on implicit coercion inside scikit-learn.
obj1 = df_encoded.iloc[0].to_numpy(dtype=float).reshape(1, -1)
obj2 = df_encoded.iloc[1].to_numpy(dtype=float).reshape(1, -1)

# cosine_similarity returns a 1 x 1 matrix for two single-row inputs.
cos_sim = cosine_similarity(obj1, obj2)[0][0]
print("Cosine Similarity:", cos_sim)



Cosine Similarity: 0.8580785635350737


In [8]:
# Turn HomeOwnerFlag into integer category codes, then compute its correlation
# with YearlyIncome on the frame as loaded (Series.corr uses Pearson by default).
home_owner_codes = df['HomeOwnerFlag'].astype('category').cat.codes
corr = df['YearlyIncome'].corr(home_owner_codes)
print("Correlation:", corr)


Correlation: 0.35624339008110406
