In [5]:
# Adventure Works Bike Buyer Prediction - Preprocessing & Analysis


import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import cosine_similarity

# Part I: Load & Feature Selection
df = pd.read_csv("AWCustomers.csv")

# Derive Age
df["Age"] = pd.to_datetime("today").year - pd.to_datetime(df["BirthDate"]).dt.year

# Keep only relevant predictive attributes
selected_features = [
    "Gender", "MaritalStatus", "Education", "Occupation",
    "YearlyIncome", "HomeOwnerFlag", "NumberCarsOwned",
    "NumberChildrenAtHome", "TotalChildren",
    "City", "StateProvinceName", "CountryRegionName", "Age"
]

df_selected = df[selected_features].copy()

# Part II: Preprocessing

# Define feature groups
numeric_features = ["Age", "YearlyIncome", "NumberCarsOwned",
                    "NumberChildrenAtHome", "TotalChildren"]
categorical_features = ["Gender", "MaritalStatus", "Education", "Occupation",
                        "City", "StateProvinceName", "CountryRegionName"]
binary_features = ["HomeOwnerFlag"]

# (a) Handling Null Values with SimpleImputer
num_imputer = SimpleImputer(strategy="median")
df_selected[numeric_features] = num_imputer.fit_transform(df_selected[numeric_features])

cat_imputer = SimpleImputer(strategy="most_frequent")
df_selected[categorical_features] = cat_imputer.fit_transform(df_selected[categorical_features])

bin_imputer = SimpleImputer(strategy="most_frequent")
df_selected[binary_features] = bin_imputer.fit_transform(df_selected[binary_features])

# (b) Normalization (Min-Max scaling)
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(
    scaler.fit_transform(df_selected[numeric_features]),
    columns=[col + "_minmax" for col in numeric_features]
)

# (c) Discretization (bin Age & YearlyIncome into 5 bins)
df_bins = pd.DataFrame()
df_bins["Age_bin"] = pd.qcut(df_selected["Age"], q=5, labels=False)
df_bins["YearlyIncome_bin"] = pd.qcut(df_selected["YearlyIncome"], q=5, labels=False)

# (d) Standardization (Z-score scaling)
std_scaler = StandardScaler()
df_standardized = pd.DataFrame(
    std_scaler.fit_transform(df_selected[numeric_features]),
    columns=[col + "_z" for col in numeric_features]
)

# Standardize discretized bins as well
df_bins_std = pd.DataFrame(
    std_scaler.fit_transform(df_bins),
    columns=[col + "_z" for col in df_bins.columns]
)

# (e) One-Hot Encoding for categorical attributes
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_array = encoder.fit_transform(df_selected[categorical_features])
encoded_df = pd.DataFrame(
    encoded_array,
    columns=encoder.get_feature_names_out(categorical_features)
)

# Keep binary attributes as-is
df_binary = df_selected[binary_features].copy()

# Final Transformed Dataset
df_transformed = pd.concat(
    [df_normalized, df_standardized, df_bins, df_bins_std, encoded_df, df_binary],
    axis=1
)

# Save outputs
df_selected.to_csv("AW_selected_cleaned.csv", index=False)
df_transformed.to_csv("AW_transformed_matrix.csv", index=False)

# Part III: Similarity & Correlation

# Compare row 0 vs row 1
obj1 = df_transformed.iloc[0]
obj2 = df_transformed.iloc[1]

# Simple Matching Coefficient (binary features only)
binary_cols = df_binary.columns.tolist() + list(encoded_df.columns)
matches = (df_transformed.loc[0, binary_cols] == df_transformed.loc[1, binary_cols]).sum()
smc = matches / len(binary_cols)

# Jaccard Similarity (binary features only)
obj1_bin = df_transformed.loc[0, binary_cols].astype(bool).astype(int).values
obj2_bin = df_transformed.loc[1, binary_cols].astype(bool).astype(int).values
intersection = np.logical_and(obj1_bin, obj2_bin).sum()
union = np.logical_or(obj1_bin, obj2_bin).sum()
jaccard_sim = intersection / union if union != 0 else 0

# Cosine Similarity (all features)
cos_sim = cosine_similarity([obj1], [obj2])[0][0]

# Correlation: Commute Distance vs Yearly Income (if available)
if "CommuteDistance" in df.columns:
    commute_map = {
        "0-1 Miles": 0.5, "1-2 Miles": 1.5, "2-5 Miles": 3.5,
        "5-10 Miles": 7.5, "10+ Miles": 15
    }
    df["CommuteDistanceNum"] = df["CommuteDistance"].map(commute_map)
    correlation = df["CommuteDistanceNum"].corr(df["YearlyIncome"])
else:
    correlation = "CommuteDistance column not available in dataset"

# Final Results
results = {
    "Simple Matching Coefficient": smc,
    "Jaccard Similarity": jaccard_sim,
    "Cosine Similarity": cos_sim,
    "Commute Distance vs Yearly Income Correlation": correlation
}

print("==== Final Results ====")
for k, v in results.items():
    print(f"{k}: {v}")


==== Final Results ====
Simple Matching Coefficient: 0.9767441860465116
Jaccard Similarity: 0.3333333333333333
Cosine Similarity: 0.6260184121192485
Commute Distance vs Yearly Income Correlation: CommuteDistance column not available in dataset
