<a href="https://colab.research.google.com/github/AshenIsuruWijesuriya/AshenIsuruWijesuriya/blob/main/Untitled9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# Reproducibility
np.random.seed(42)

# Create dataset
data = {
    "CustomerID": range(1, 101),
    "Age": np.random.randint(18, 60, size=100),
    "Income": np.random.randint(20000, 120000, size=100).astype(float),
    "Gender": np.random.choice(["Male", "Female", "Other"], size=100),
    "Purchased": np.random.choice([0, 1], size=100)  # 0 = No, 1 = Yes
}

df = pd.DataFrame(data)

# Introduce some missing values
df.loc[np.random.choice(df.index, 10, replace=False), "Income"] = np.nan
df.loc[np.random.choice(df.index, 5, replace=False), "Age"] = np.nan

# Introduce noisy/outlier values
df.loc[np.random.choice(df.index, 3, replace=False), "Income"] = -9999  # Invalid income

print(df.head(10))


   CustomerID   Age    Income  Gender  Purchased
0           1  56.0   28392.0    Male          1
1           2  46.0   -9999.0    Male          1
2           3  32.0   98603.0    Male          0
3           4  25.0   72256.0   Other          0
4           5  38.0  109135.0  Female          1
5           6   NaN   55222.0  Female          1
6           7  36.0   97373.0    Male          0
7           8  40.0   99575.0  Female          0
8           9  28.0  116354.0  Female          1
9          10  28.0  104651.0   Other          0


In [7]:
# Fill missing Age with mean
df["Age"].fillna(df["Age"].mean(), inplace=True)

# Fill missing Income with median
df["Income"].fillna(df["Income"].median(), inplace=True)
print(df.head(10))


   CustomerID        Age    Income  Gender  Purchased
0           1  56.000000   28392.0    Male          1
1           2  46.000000   -9999.0    Male          1
2           3  32.000000   98603.0    Male          0
3           4  25.000000   72256.0   Other          0
4           5  38.000000  109135.0  Female          1
5           6  37.715789   55222.0  Female          1
6           7  36.000000   97373.0    Male          0
7           8  40.000000   99575.0  Female          0
8           9  28.000000  116354.0  Female          1
9          10  28.000000  104651.0   Other          0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Income"].fillna(df["Income"].median(), inplace=True)


In [5]:
print(df.head(10))

   CustomerID        Age    Income  Gender  Purchased
0           1  56.000000   28392.0    Male          1
1           2  46.000000   -9999.0    Male          1
2           3  32.000000   98603.0    Male          0
3           4  25.000000   72256.0   Other          0
4           5  38.000000  109135.0  Female          1
5           6  37.715789   55222.0  Female          1
6           7  36.000000   97373.0    Male          0
7           8  40.000000   99575.0  Female          0
8           9  28.000000  116354.0  Female          1
9          10  28.000000  104651.0   Other          0


In [9]:
# Replace negative Income values with median
df.loc[df["Income"] < 0, "Income"] = df["Income"].median()


In [10]:
print(df.head(10))

   CustomerID        Age    Income  Gender  Purchased
0           1  56.000000   28392.0    Male          1
1           2  46.000000   74268.0    Male          1
2           3  32.000000   98603.0    Male          0
3           4  25.000000   72256.0   Other          0
4           5  38.000000  109135.0  Female          1
5           6  37.715789   55222.0  Female          1
6           7  36.000000   97373.0    Male          0
7           8  40.000000   99575.0  Female          0
8           9  28.000000  116354.0  Female          1
9          10  28.000000  104651.0   Other          0


In [11]:

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df["Income_norm"] = scaler.fit_transform(df[["Income"]])

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df["Age_zscore"] = scaler.fit_transform(df[["Age"]])


In [13]:
df["Age_group"] = pd.cut(df["Age"], bins=[17, 30, 45, 60], labels=["Youth", "Adult", "Senior"])


In [14]:
df_reduced = df.drop(columns=["CustomerID"])


In [15]:
from sklearn.decomposition import PCA

features = ["Age", "Income"]
pca = PCA(n_components=1)
df["PCA_Feature"] = pca.fit_transform(df[features])


In [16]:
print(df.head(10))

   CustomerID        Age    Income  Gender  Purchased  Income_norm  \
0           1  56.000000   28392.0    Male          1     0.083022   
1           2  46.000000   74268.0    Male          1     0.548296   
2           3  32.000000   98603.0    Male          0     0.795101   
3           4  25.000000   72256.0   Other          0     0.527890   
4           5  38.000000  109135.0  Female          1     0.901917   
5           6  37.715789   55222.0  Female          1     0.355132   
6           7  36.000000   97373.0    Male          0     0.782627   
7           8  40.000000   99575.0  Female          0     0.804959   
8           9  28.000000  116354.0  Female          1     0.975132   
9          10  28.000000  104651.0   Other          0     0.856440   

   Age_zscore Age_group   PCA_Feature  
0    1.563534    Senior -45639.700609  
1    0.708406    Senior    236.299712  
2   -0.488773     Adult  24571.300184  
3   -1.087363     Youth  -1775.699557  
4    0.024304     Adult  3510

In [17]:
region_data = pd.DataFrame({
    "CustomerID": range(1, 101),
    "Region": np.random.choice(["North", "South", "East", "West"], size=100)
})

df = df.merge(region_data, on="CustomerID")


In [19]:
print(df.head(10))

   CustomerID        Age    Income  Gender  Purchased  Income_norm  \
0           1  56.000000   28392.0    Male          1     0.083022   
1           2  46.000000   74268.0    Male          1     0.548296   
2           3  32.000000   98603.0    Male          0     0.795101   
3           4  25.000000   72256.0   Other          0     0.527890   
4           5  38.000000  109135.0  Female          1     0.901917   
5           6  37.715789   55222.0  Female          1     0.355132   
6           7  36.000000   97373.0    Male          0     0.782627   
7           8  40.000000   99575.0  Female          0     0.804959   
8           9  28.000000  116354.0  Female          1     0.975132   
9          10  28.000000  104651.0   Other          0     0.856440   

   Age_zscore Age_group   PCA_Feature Region  
0    1.563534    Senior -45639.700609   West  
1    0.708406    Senior    236.299712   West  
2   -0.488773     Adult  24571.300184  South  
3   -1.087363     Youth  -1775.699557   W