<a href="https://colab.research.google.com/github/BiharaCD/MLOM/blob/main/FDM_Mini_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np


# Reproducibility
np.random.seed(42)


# Create dataset
data = {
"CustomerID": range(1, 101),
"Age": np.random.randint(18, 60, size=100),
"Income": np.random.randint(20000, 120000, size=100).astype(float),
"Gender": np.random.choice(["M", "F", "Non-binary"], size=100),
"SpendingScore": np.random.randint(1, 100, size=100),
"Purchased": np.random.choice([0, 1], size=100) # 0 = No, 1 = Yes
}


df = pd.DataFrame(data)


# Introduce missing values
df.loc[np.random.choice(df.index, 8, replace=False), "Income"] = np.nan
df.loc[np.random.choice(df.index, 6, replace=False), "Age"] = np.nan


# Introduce noisy/outlier values
df.loc[np.random.choice(df.index, 3, replace=False), "Income"] = -5000 # invalid income


print(df.head(10))

   CustomerID   Age    Income      Gender  SpendingScore  Purchased
0           1  56.0   28392.0           M             38          1
1           2  46.0   50535.0           M             51          1
2           3  32.0   98603.0           M             54          1
3           4  25.0   72256.0  Non-binary              8          0
4           5  38.0  109135.0           F             27          1
5           6  56.0   55222.0           F             27          1
6           7  36.0   97373.0           M             98          1
7           8  40.0   99575.0           F             21          0
8           9  28.0  116354.0           F             30          1
9          10  28.0  104651.0  Non-binary             97          0


In [2]:
# Fill missing Age with mean
df["Age"].fillna(df["Age"].mean(), inplace=True)


# Fill missing Income with median
df["Income"].fillna(df["Income"].median(), inplace=True)


# Replace invalid (negative) Income values with median
df.loc[df["Income"] < 0, "Income"] = df["Income"].median()


print(df.head(10))

   CustomerID   Age    Income      Gender  SpendingScore  Purchased
0           1  56.0   28392.0           M             38          1
1           2  46.0   50535.0           M             51          1
2           3  32.0   98603.0           M             54          1
3           4  25.0   72256.0  Non-binary              8          0
4           5  38.0  109135.0           F             27          1
5           6  56.0   55222.0           F             27          1
6           7  36.0   97373.0           M             98          1
7           8  40.0   99575.0           F             21          0
8           9  28.0  116354.0           F             30          1
9          10  28.0  104651.0  Non-binary             97          0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Income"].fillna(df["Income"].median(), inplace=True)


In [3]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler


# Min-Max Normalization
df["Income_norm"] = MinMaxScaler().fit_transform(df[["Income"]])


# Z-score Standardization
df["Age_zscore"] = StandardScaler().fit_transform(df[["Age"]])


# Discretization of Age
df["Age_group"] = pd.cut(df["Age"], bins=[17, 30, 45, 60], labels=["Youth", "Adult", "Senior"])


# Encoding categorical variable
df = pd.get_dummies(df, columns=["Gender"], drop_first=True)

In [5]:
# Drop irrelevant column
df_reduced = df.drop(columns=["CustomerID"])


# PCA Example
from sklearn.decomposition import PCA


features = ["Age", "Income", "SpendingScore"]
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df[features])
df[["PCA1", "PCA2"]] = df_pca

print (df.head(10))

   CustomerID   Age    Income  SpendingScore  Purchased  Income_norm  \
0           1  56.0   28392.0             38          1     0.083022   
1           2  46.0   50535.0             51          1     0.307596   
2           3  32.0   98603.0             54          1     0.795101   
3           4  25.0   72256.0              8          0     0.527890   
4           5  38.0  109135.0             27          1     0.901917   
5           6  56.0   55222.0             27          1     0.355132   
6           7  36.0   97373.0             98          1     0.782627   
7           8  40.0   99575.0             21          0     0.804959   
8           9  28.0  116354.0             30          1     0.975132   
9          10  28.0  104651.0             97          0     0.856440   

   Age_zscore Age_group  Gender_M  Gender_Non-binary          PCA1       PCA2  
0    1.505572    Senior      True              False -47659.541919  -9.346769  
1    0.656599    Senior      True              

In [6]:
region_data = pd.DataFrame({
"CustomerID": range(1, 101),
"Region": np.random.choice(["North", "South", "East", "West"], size=100)
})


# Merge datasets
df = df.merge(region_data, on="CustomerID")


print(df.head())

   CustomerID   Age    Income  SpendingScore  Purchased  Income_norm  \
0           1  56.0   28392.0             38          1     0.083022   
1           2  46.0   50535.0             51          1     0.307596   
2           3  32.0   98603.0             54          1     0.795101   
3           4  25.0   72256.0              8          0     0.527890   
4           5  38.0  109135.0             27          1     0.901917   

   Age_zscore Age_group  Gender_M  Gender_Non-binary          PCA1       PCA2  \
0    1.505572    Senior      True              False -47659.541919  -9.346769   
1    0.656599    Senior      True              False -25516.540396   2.379086   
2   -0.531963     Adult      True              False  22551.460615   2.405340   
3   -1.126243     Youth     False               True  -3795.541850 -41.532125   
4   -0.022579     Adult     False              False  33083.458274 -25.491518   

  Region  
0  South  
1   East  
2   East  
3  South  
4   West  
