In [149]:
import pandas as pd

df = pd.DataFrame({
    'Color': 6*['Red'] + 6*['Blue'] + 6*['Purple'], #18
    'Size': 4*['S'] + 8*['M'] + 3*['L'] + 3*[None], #17
    'Gender': 9*['M'] + 9*['F'], #18
    'Price': [*[40*i for i in range(1, 16+1)]] + [1500] + [2000], #18
})
df = df.sample(frac=1).reset_index(drop=True)
df.head(18)


Unnamed: 0,Color,Size,Gender,Price
0,Purple,L,F,560
1,Red,M,M,240
2,Purple,L,F,600
3,Purple,L,F,520
4,Blue,M,M,360
5,Blue,M,F,480
6,Blue,M,M,280
7,Red,S,M,80
8,Blue,M,F,440
9,Red,S,M,40


One-Hot encoding

In [150]:
df = pd.get_dummies(df, columns=['Color'], prefix='Color')
df.head()

Unnamed: 0,Size,Gender,Price,Color_Blue,Color_Purple,Color_Red
0,L,F,560,False,True,False
1,M,M,240,False,False,True
2,L,F,600,False,True,False
3,L,F,520,False,True,False
4,M,M,360,True,False,False


Mapping for binary and ordinal cols

In [151]:
df['Size'] = df['Size'].map({'S': 0, 'M': 1, 'L': 2})
df['Gender'] = df['Gender'].map({'M': 0, 'F': 1})
df.head()

Unnamed: 0,Size,Gender,Price,Color_Blue,Color_Purple,Color_Red
0,2.0,1,560,False,True,False
1,1.0,0,240,False,False,True
2,2.0,1,600,False,True,False
3,2.0,1,520,False,True,False
4,1.0,0,360,True,False,False


Detect and remove outliers (replaced by a NULL)

In [152]:
Q1 = df['Price'].quantile(0.25)
Q3 = df['Price'].quantile(0.75)
IQR = Q3 - Q1

LB= Q1 - 1.5 * IQR
UB = Q3 + 1.5 * IQR
print(f'Lower bound: {LB}')
print(f'Upper bound: {UB}')

df['Price'] = df['Price'].apply(lambda x: None if not LB <= x <= UB else x)
df.head(18)


Lower bound: -300.0
Upper bound: 1060.0


Unnamed: 0,Size,Gender,Price,Color_Blue,Color_Purple,Color_Red
0,2.0,1,560.0,False,True,False
1,1.0,0,240.0,False,False,True
2,2.0,1,600.0,False,True,False
3,2.0,1,520.0,False,True,False
4,1.0,0,360.0,True,False,False
5,1.0,1,480.0,True,False,False
6,1.0,0,280.0,True,False,False
7,0.0,0,80.0,False,False,True
8,1.0,1,440.0,True,False,False
9,0.0,0,40.0,False,False,True


Missing values' filling, via the: {qualitative data -> mode} {quantitative data -> linear interpolation}

In [153]:
df['Size'].fillna(df['Size'].mode()[0], inplace=True)
df['Price'].interpolate(method='linear', inplace=True)
df.head(18)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Size'].fillna(df['Size'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Price'].interpolate(method='linear', inplace=True)


Unnamed: 0,Size,Gender,Price,Color_Blue,Color_Purple,Color_Red
0,2.0,1,560.0,False,True,False
1,1.0,0,240.0,False,False,True
2,2.0,1,600.0,False,True,False
3,2.0,1,520.0,False,True,False
4,1.0,0,360.0,True,False,False
5,1.0,1,480.0,True,False,False
6,1.0,0,280.0,True,False,False
7,0.0,0,80.0,False,False,True
8,1.0,1,440.0,True,False,False
9,0.0,0,40.0,False,False,True


Data range normalization [0, 1]

In [154]:
from sklearn.preprocessing import MinMaxScaler

df[['Price']] = MinMaxScaler().fit_transform(df[['Price']])
df.rename(columns={'Price': 'Pricey-ness'}, inplace=True)
df.head()

Unnamed: 0,Size,Gender,Pricey-ness,Color_Blue,Color_Purple,Color_Red
0,2.0,1,0.866667,False,True,False
1,1.0,0,0.333333,False,False,True
2,2.0,1,0.933333,False,True,False
3,2.0,1,0.8,False,True,False
4,1.0,0,0.533333,True,False,False


Adding a Class to the data using GMM

In [155]:
from sklearn.mixture import GaussianMixture
import numpy as np

x = df.copy()
GMM = GaussianMixture(n_components=2, random_state=42)
GMM.fit(x)
labels = GMM.predict(x)
df['Class'] = labels
df.head()

Unnamed: 0,Size,Gender,Pricey-ness,Color_Blue,Color_Purple,Color_Red,Class
0,2.0,1,0.866667,False,True,False,1
1,1.0,0,0.333333,False,False,True,0
2,2.0,1,0.933333,False,True,False,1
3,2.0,1,0.8,False,True,False,1
4,1.0,0,0.533333,True,False,False,0


Oversampling via SMOTE

In [156]:
x = df.drop(columns=['Class'])
y = df['Class']
df['Class'].value_counts()

Class
0    12
1     6
Name: count, dtype: int64

In [140]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
x, y = smote.fit_resample(x, y)
df['Class'].value_counts()

ImportError: cannot import name '_deprecate_Xt_in_inverse_transform' from 'sklearn.utils.deprecation' (c:\Users\Asser\Documents\Coding\ML-NTI\Linear-Regression\.venv\Lib\site-packages\sklearn\utils\deprecation.py)