<a href="https://colab.research.google.com/github/EvilMorty13/Python-Learning-BS/blob/main/Chapter_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Cleaning Part 1

In [None]:
import pandas as pd
import numpy as np

# Small people table; the last row has no name, so there is a gap to detect.
df = pd.DataFrame(
    data=dict(
        Name=["Maria", "Luka", "Chris", np.nan],
        Age=[25, 27, 44, 35],
        Occupation=["Enginner", "Competitive Eater", "Artist", "Scientist"],
    )
)

# Boolean mask of the same shape: True wherever a cell is missing.
df.isna()

Unnamed: 0,Name,Age,Occupation
0,False,False,False
1,False,False,False
2,False,False,False
3,True,False,False


In [None]:
# Remove, in place, every row that contains at least one missing value.
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Show the frame after the rows with missing values were dropped.
df

Unnamed: 0,Name,Age,Occupation
0,Maria,25,Enginner
1,Luka,27,Competitive Eater
2,Chris,44,Artist


## Data Cleaning Part 2

In [None]:
from scipy import stats
# Six transaction amounts; the last one is far larger than the rest.
df2 = pd.DataFrame(
    data=[100, 150, 200, 250, 300, 9999],
    columns=["Transaction_Amount"],
)

# Standardise each amount: distance from the mean in population (ddof=0)
# standard deviations.
df2["Z_Score"] = stats.zscore(df2["Transaction_Amount"], ddof=0)

In [None]:
# Show the amounts next to their z-scores.
df2

Unnamed: 0,Transaction_Amount,Z_Score
0,100,-0.474523
1,150,-0.460833
2,200,-0.447144
3,250,-0.433454
4,300,-0.419765
5,9999,2.235719


In [None]:
# Winsorize both tails: the most extreme value at each end is replaced by its
# nearest remaining neighbour. `limits` are fractions of the row count and are
# truncated to a whole number of rows, so on this six-row frame the previous
# 5% limits covered zero rows and left the 9999 outlier untouched. 20% covers
# exactly one row per tail (100 -> 150 and 9999 -> 300).
df2["Winsorized_Amount"] = stats.mstats.winsorize(
    df2["Transaction_Amount"],
    limits=[0.2, 0.2],
)

In [None]:
# Show the frame again, now including the Winsorized_Amount column.
df2

Unnamed: 0,Transaction_Amount,Z_Score,Winsorized_Amount
0,100,-0.474523,100
1,150,-0.460833,150
2,200,-0.447144,200
3,250,-0.433454,250
4,300,-0.419765,300
5,9999,2.235719,9999


In [None]:
# Largest transaction amount that is still kept.
threshold = 500
# Truncation: keep only the rows whose amount does not exceed the threshold.
df_truncate = df2.loc[df2["Transaction_Amount"].le(threshold)]
df_truncate

Unnamed: 0,Transaction_Amount,Z_Score,Winsorized_Amount
0,100,-0.474523,100
1,150,-0.460833,150
2,200,-0.447144,200
3,250,-0.433454,250
4,300,-0.419765,300


## Label Encoder and Data Transformation Part 1

In [None]:
from sklearn.preprocessing import LabelEncoder
# Five ratings drawn from three categories; input for the encoders below.
df3 = pd.DataFrame(
    data=["Low", "Medium", "High", "Medium", "Low"],
    columns=["Rating"],
)

In [None]:
# Encoder instance; it is fitted on the Rating column in a later cell.
le = LabelEncoder()

In [None]:
# Fit the encoder on the ratings and store the integer code of each one.
# As the output below shows, the codes are alphabetical (High=0, Low=1,
# Medium=2), so they do not reflect the Low < Medium < High ordering.
df3["Encoded_Ratings"] = le.fit_transform(df3.Rating)

In [None]:
# Show each rating next to its integer code.
df3

Unnamed: 0,Rating,Encoded_Ratings
0,Low,1
1,Medium,2
2,High,0
3,Medium,2
4,Low,1


In [None]:
# One-hot encode the ratings: one indicator column per category.
# drop_first=False keeps all three columns.
pd.get_dummies(data=df3["Rating"], drop_first=False)

Unnamed: 0,High,Low,Medium
0,False,True,False
1,False,False,True
2,True,False,False
3,False,False,True
4,False,True,False


## Label Encoder and Data Transformation Part 2

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Evenly spaced sample values (10 through 50) to rescale.
df4 = pd.DataFrame(data=list(range(10, 60, 10)), columns=["Value"])

# Scaler created with its default settings; it is fitted in the next cell.
scaler = MinMaxScaler()

In [None]:
# Reshape the column to an (n, 1) array before handing it to the scaler,
# then store the rescaled values alongside the originals.
df4["Scaled_Values"] = scaler.fit_transform(df4["Value"].to_numpy().reshape(-1, 1))

In [None]:
# Show the original values next to their scaled counterparts.
df4

Unnamed: 0,Value,Scaled_Values
0,10,0.0
1,20,0.25
2,30,0.5
3,40,0.75
4,50,1.0


## Label Encoder and Data Transformation Part 3

In [None]:
# Five sample ages to be grouped into bands.
df5 = pd.DataFrame(data=[22, 35, 55, 42, 68], columns=["age"])

In [None]:
# Show the raw ages before binning.
df5

Unnamed: 0,age
0,22
1,35
2,55
3,42
4,68


In [None]:
# Four bin edges define three age bands; each band gets the label at the
# same position.
bins, labels = [0, 30, 50, 100], ["Young", "Middle-aged", "Senior"]

In [None]:
# Assign each age to its band. Intervals are closed on the right:
# (0, 30], (30, 50], (50, 100].
df5["bin_age"] = pd.cut(x=df5["age"], bins=bins, labels=labels, right=True)

In [None]:
# Show each age next to the band it was assigned to.
df5

Unnamed: 0,age,bin_age
0,22,Young
1,35,Middle-aged
2,55,Senior
3,42,Middle-aged
4,68,Senior


In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Two numeric features for three houses.
df6 = pd.DataFrame(
    data=[[500, 2], [700, 3], [900, 4]],
    columns=["size", "rooms"],
)

# Degree-2 expansion without the constant (bias) column.
poly_f = PolynomialFeatures(degree=2, include_bias=False)

# Fit and transform first, then ask the fitted transformer for the generated
# column names so the result is a labelled frame.
df6_poly = pd.DataFrame(
    data=poly_f.fit_transform(df6),
    columns=poly_f.get_feature_names_out(input_features=df6.columns),
)

In [None]:
# Show the expanded feature frame.
df6_poly

Unnamed: 0,size,rooms,size^2,size rooms,rooms^2
0,500.0,2.0,250000.0,1000.0,4.0
1,700.0,3.0,490000.0,2100.0,9.0
2,900.0,4.0,810000.0,3600.0,16.0


## Handling Imbalanced Data

In [None]:
from sklearn.utils import resample

# Imbalanced toy dataset: five rows of class 0 and two rows of class 1.
df7 = pd.DataFrame(
    {
        "class": [0, 0, 1, 0, 0, 1, 0],
        "feature": list(range(1, 8)),
    }
)

In [None]:
# Rows belonging to class 0.
df7_majority = df7.loc[df7["class"].eq(0)]

In [None]:
# Show the class-0 rows.
df7_majority

Unnamed: 0,class,feature
0,0,1
1,0,2
3,0,4
4,0,5
6,0,7


In [None]:
# Rows belonging to class 1.
df7_minority = df7.loc[df7["class"].eq(1)]

In [None]:
# Show the class-1 rows.
df7_minority

Unnamed: 0,class,feature
2,1,3
5,1,6


In [None]:
# Undersample: draw, without replacement, as many class-0 rows as there are
# class-1 rows. The fixed random_state makes the draw repeatable.
df7_undersampled = resample(
    df7_majority,
    n_samples=df7_minority.shape[0],
    replace=False,
    random_state=42,
)

In [None]:
# Show the class-0 rows that were drawn.
df7_undersampled

Unnamed: 0,class,feature
1,0,2
6,0,7


In [None]:
# Stack the class-1 rows on top of the sampled class-0 rows; the original
# index labels are kept.
balanced_data = pd.concat(objs=[df7_minority, df7_undersampled], axis=0)

In [None]:
# Show the combined frame with equal counts of both classes.
balanced_data

Unnamed: 0,class,feature
2,1,3
5,1,6
1,0,2
6,0,7


In [None]:
# Oversample: draw, with replacement, as many class-1 rows as there are
# class-0 rows, so individual rows repeat. The fixed random_state makes the
# draw repeatable.
df7_oversampled = resample(
    df7_minority,
    n_samples=df7_majority.shape[0],
    replace=True,
    random_state=42,
)

In [None]:
# Show the class-1 rows that were drawn (duplicates are expected).
df7_oversampled

Unnamed: 0,class,feature
2,1,3
5,1,6
2,1,3
2,1,3
2,1,3
