<a href="https://colab.research.google.com/github/FMularski/MachineLearningBootcamp/blob/master/supervised/01_basics/01_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import sklearn

# Display the scikit-learn version this notebook is running against.
sklearn.__version__

'0.22.2.post1'

In [36]:
# Toy clothing dataset, written row by row so it reads like the rendered table.
columns = ['size', 'color', 'gender', 'price', 'weight', 'bought']
rows = [
    ('XL', 'red',   'female', 199.0, 500, 'yes'),
    ('L',  'green', 'male',    89.0, 450, 'no'),
    ('M',  'blue',  'male',    99.0, 300, 'yes'),
    ('L',  'green', 'female', 129.0, 380, 'no'),
    ('M',  'red',   'female',  79.0, 410, 'yes'),
]

# Pivot the rows into the column-oriented mapping {column name: list of values}.
data = {name: list(values) for name, values in zip(columns, zip(*rows))}

# df_raw is the untouched source frame; later cells work on copies of it.
df_raw = pd.DataFrame(data)
df_raw

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500,yes
1,L,green,male,89.0,450,no
2,M,blue,male,99.0,300,yes
3,L,green,female,129.0,380,no
4,M,red,female,79.0,410,yes


In [3]:
# Always work on an independent (deep) copy so df_raw keeps the original data.
df = df_raw.copy(deep=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   size    5 non-null      object 
 1   color   5 non-null      object 
 2   gender  5 non-null      object 
 3   price   5 non-null      float64
 4   weight  5 non-null      int64  
 5   bought  5 non-null      object 
dtypes: float64(1), int64(1), object(4)
memory usage: 368.0+ bytes


In [37]:
# Target dtype per column: the text (object) columns become categorical,
# the integer weight column becomes float.
target_dtypes = {
    'size': 'category',
    'color': 'category',
    'gender': 'category',
    'bought': 'category',
    'weight': 'float',
}

# Convert column by column, assigning each result back into the working frame.
for column_name, dtype in target_dtypes.items():
    df[column_name] = df[column_name].astype(dtype)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   size    5 non-null      category
 1   color   5 non-null      category
 2   gender  5 non-null      category
 3   price   5 non-null      float64 
 4   weight  5 non-null      float64 
 5   bought  5 non-null      category
dtypes: category(4), float64(2)
memory usage: 628.0 bytes


In [5]:
df.describe() # by default only the numeric columns are summarised

Unnamed: 0,price,weight
count,5.0,5.0
mean,119.0,408.0
std,48.476799,75.299402
min,79.0,300.0
25%,89.0,380.0
50%,99.0,410.0
75%,129.0,450.0
max,199.0,500.0


In [6]:
# Same numeric summary, transposed: one row per column, one column per statistic.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,5.0,119.0,48.476799,79.0,89.0,99.0,129.0,199.0
weight,5.0,408.0,75.299402,300.0,380.0,410.0,450.0,500.0


In [7]:
# Summary restricted to the categorical columns (count, unique, top, freq), transposed.
df.describe(include=['category']).T

Unnamed: 0,count,unique,top,freq
size,5,3,M,2
color,5,3,red,2
gender,5,2,female,3
bought,5,2,yes,3


In [8]:
# Show the working frame after the dtype conversions (weight is now float).
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,male,99.0,300.0,yes
3,L,green,female,129.0,380.0,no
4,M,red,female,79.0,410.0,yes


In [9]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder is the class used to encode class labels as integers.
# fit() returns the encoder itself, so learning the labels and encoding them can be chained;
# le.fit_transform(df['bought']) is the shorter equivalent.
# Mapping here: [yes, no, yes, no, yes] -> [1, 0, 1, 0, 1].  Reverse: le.inverse_transform(...)
le = LabelEncoder()
le.fit(df['bought']).transform(df['bought'])

array([1, 0, 1, 0, 1])

In [11]:
# Labels learned by the encoder; a label's position in this array is its integer code.
le.classes_

array(['no', 'yes'], dtype=object)

In [38]:
# Store the integer-encoded target in the working frame.
# The labels are read from df_raw instead of df['bought'] because this cell overwrites
# df['bought']: encoding its own output on a re-run would refit the encoder on 0/1,
# and le.classes_ / le.inverse_transform would no longer know the 'no' / 'yes' labels.
# df is a row-for-row copy of df_raw, so the encoded values line up positionally.
df['bought'] = le.fit_transform(df_raw['bought'])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,1.650274,1.221789,1
1,L,green,male,-0.618853,0.557773,0
2,M,blue,male,-0.412568,-1.434274,1
3,L,green,female,0.206284,-0.371849,0
4,M,red,female,-0.825137,0.026561,1


In [18]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the 'size' column (double brackets: the encoder expects a 2-D input).
# The encoder keeps its default sparse output and the result is densified with .toarray().
# The former `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# later removed, so passing it fails on current versions; this form runs on old and new ones
# and yields the same dense array.
encoder = OneHotEncoder()
encoder.fit(df[['size']])
encoder.transform(df[['size']]).toarray()

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [16]:
# Categories learned for each input column; the one-hot columns follow this order.
encoder.categories_

[array(['L', 'M', 'XL'], dtype=object)]

In [19]:
# One-hot encode 'size' again, this time dropping the first category's column
# (a row of all zeros then stands for the dropped category).
# As above, the default sparse output is densified with .toarray() instead of passing
# `sparse=False`, which was renamed to `sparse_output` in scikit-learn 1.2 and later removed.
encoder = OneHotEncoder(drop='first')
encoder.fit(df[['size']])
encoder.transform(df[['size']]).toarray()

array([[0., 1.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [1., 0.]])

In [20]:
# The encoder calls above only returned arrays; the working frame still holds the original 'size' column.
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,1
1,L,green,male,89.0,450.0,0
2,M,blue,male,99.0,300.0,1
3,L,green,female,129.0,380.0,0
4,M,red,female,79.0,410.0,1


In [39]:
pd.get_dummies(df)    # pd.get_dummies() can do the same job as OneHotEncoder

Unnamed: 0,price,weight,bought,size_L,size_M,size_XL,color_blue,color_green,color_red,gender_female,gender_male
0,1.650274,1.221789,1,0,0,1,0,0,1,1,0
1,-0.618853,0.557773,0,1,0,0,0,1,0,0,1
2,-0.412568,-1.434274,1,0,1,0,1,0,0,0,1
3,0.206284,-0.371849,0,1,0,0,0,1,0,1,0
4,-0.825137,0.026561,1,0,1,0,0,0,1,1,0


In [29]:
pd.get_dummies(data=df, drop_first=True)  # drops the redundant first column of every category; the remaining columns still carry the complete information

Unnamed: 0,price,weight,bought,size_M,size_XL,color_green,color_red,gender_male
0,1.845062,1.366002,1,0,1,0,1,0
1,-0.691898,0.62361,0,0,0,1,0,1
2,-0.461266,-1.603567,1,1,0,0,0,1
3,0.230633,-0.41574,0,0,0,1,0,0
4,-0.922531,0.029696,1,1,0,0,1,0


In [24]:
pd.get_dummies(data=df, drop_first=True, columns=['size'])  # it is also possible to encode only selected columns

Unnamed: 0,color,gender,price,weight,bought,size_M,size_XL
0,red,female,199.0,500.0,1,0,1
1,green,male,89.0,450.0,0,0,0
2,blue,male,99.0,300.0,1,1,0
3,green,female,129.0,380.0,0,0,0
4,red,female,79.0,410.0,1,1,0


In [34]:
# Standardize the numeric columns by hand (z-score):
# subtract the column mean, then divide by the column's standard deviation.
# Series.std() is the sample standard deviation (ddof=1).
for col in ('price', 'weight'):
    centered = df[col] - df[col].mean()
    df[col] = centered / df[col].std()

In [35]:
# Show the working frame after the manual standardization of price and weight.
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,1.650274,1.221789,1
1,L,green,male,-0.618853,0.557773,0
2,M,blue,male,-0.412568,-1.434274,1
3,L,green,female,0.206284,-0.371849,0
4,M,red,female,-0.825137,0.026561,1


In [40]:
# the recommended way to standardize is the StandardScaler class
from sklearn.preprocessing import StandardScaler

# Standardize each numeric column with its own StandardScaler.
# The scalers are fitted on the raw values from df_raw, not on df: the previous cell has
# already standardized df['price'] / df['weight'] in place, so fitting on df would make
# mean_ / scale_ describe z-scores instead of real prices and weights, and the fitted
# scalers could not be reused on new raw samples. The standardized values written to df
# are the same either way, and the cell can now be re-run safely.
scaler_price = StandardScaler()
df['price'] = scaler_price.fit_transform(df_raw[['price']])
scaler_weight = StandardScaler()
# weight is stored as integers in df_raw; cast to float before scaling.
df['weight'] = scaler_weight.fit_transform(df_raw[['weight']].astype('float'))
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,1.845062,1.366002,1
1,L,green,male,-0.691898,0.62361,0
2,M,blue,male,-0.461266,-1.603567,1
3,L,green,female,0.230633,-0.41574,0
4,M,red,female,-0.922531,0.029696,1


In [41]:
# A single StandardScaler can standardize several columns at once (each column independently).
# It is fitted on the raw values from df_raw: df['price'] / df['weight'] were already
# standardized in place by earlier cells, so fitting on df would make mean_ / scale_
# describe z-scores rather than the real data. The values written to df are unchanged.
scaler = StandardScaler()
df[['price', 'weight']] = scaler.fit_transform(df_raw[['price', 'weight']].astype('float'))
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,1.845062,1.366002,1
1,L,green,male,-0.691898,0.62361,0
2,M,blue,male,-0.461266,-1.603567,1
3,L,green,female,0.230633,-0.41574,0
4,M,red,female,-0.922531,0.029696,1


In [43]:
# Start over: repeat the whole preprocessing from a fresh copy of the raw data.
df = df_raw.copy(deep=True)
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500,yes
1,L,green,male,89.0,450,no
2,M,blue,male,99.0,300,yes
3,L,green,female,129.0,380,no
4,M,red,female,79.0,410,yes


In [44]:
# Complete preprocessing recap on the fresh copy of the raw data.
target_col = 'bought'
numeric_cols = ['price', 'weight']

le = LabelEncoder()
scaler = StandardScaler()

# 1) encode the target variable as integers
df[target_col] = le.fit_transform(df[target_col])

# 2) standardize the numeric variables
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# 3) one-hot encode the remaining categorical variables, dropping the first level of each
df = pd.get_dummies(df, drop_first=True)

df

Unnamed: 0,price,weight,bought,size_M,size_XL,color_green,color_red,gender_male
0,1.845062,1.366002,1,0,1,0,1,0
1,-0.691898,0.62361,0,0,0,1,0,1
2,-0.461266,-1.603567,1,1,0,0,0,1
3,0.230633,-0.41574,0,0,0,1,0,0
4,-0.922531,0.029696,1,1,0,0,1,0
