<a href="https://colab.research.google.com/github/BartekNice31/machine_learning_bootcamp/blob/main/supervised/01_basics/01_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### scikit-learn
Strona biblioteki: [https://scikit-learn.org](https://scikit-learn.org)  

Dokumentacja/User Guide: [https://scikit-learn.org/stable/user_guide.html](https://scikit-learn.org/stable/user_guide.html)

Podstawowa biblioteka do uczenia maszynowego w języku Python.

Aby zainstalować bibliotekę scikit-learn, użyj polecenia poniżej:
```
!pip install scikit-learn
```
Aby zaktualizować do najnowszej wersji bibliotekę scikit-learn, użyj polecenia poniżej:
```
!pip install --upgrade scikit-learn
```
Kurs stworzony w oparciu o wersję `0.22.1`

### Preprocessing danych:
1. [Import bibliotek](#0)
2. [Wygenerowanie danych](#1)
3. [Utworzenie kopii danych](#2)
4. [Zmiana typu danych i wstępna eksploracja](#3)
5. [LabelEncoder](#4)
6. [OneHotEncoder](#5)
7. [Pandas *get_dummies()*](#6)
8. [Standaryzacja - StandardScaler](#7)
9. [Przygotowanie danych do modelu](#8)



In [12]:
!pip install --upgrade scikit-learn



In [13]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder

sklearn.__version__

'1.8.0'

In [14]:
data = {
    'size': ['XL', 'L', 'M', 'L', 'M'],
    'color': ['red', 'green', 'blue', 'green', 'red'],
    'gender': ['female', 'male', 'male', 'female', 'female'],
    'price': [199.0, 89.0, 99.0, 129.0, 79.0],
    'weight': [500, 450, 300, 380, 410],
    'bought': ['yes', 'no', 'yes', 'no', 'yes']
}

df_raw = pd.DataFrame(data=data)
df_raw

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500,yes
1,L,green,male,89.0,450,no
2,M,blue,male,99.0,300,yes
3,L,green,female,129.0,380,no
4,M,red,female,79.0,410,yes


In [15]:
df=df_raw.copy()

In [16]:
df.select_dtypes('number')

Unnamed: 0,price,weight
0,199.0,500
1,89.0,450
2,99.0,300
3,129.0,380
4,79.0,410


In [17]:
df=df_raw.copy()

In [18]:
objects=[]
numerical=[]
for c in df.columns:
  if df_raw[c].dtype=='object':
    objects.append(c)
  elif df[c].dtype=='int' or df[c].dtype=='float':
    numerical.append(c)

In [19]:
for c in objects:
  df[c]=df[c].astype('category')

In [20]:
objects,numerical

(['size', 'color', 'gender', 'bought'], ['price', 'weight'])

In [21]:
df.describe()

Unnamed: 0,price,weight
count,5.0,5.0
mean,119.0,408.0
std,48.476799,75.299402
min,79.0,300.0
25%,89.0,380.0
50%,99.0,410.0
75%,129.0,450.0
max,199.0,500.0


In [22]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,5.0,119.0,48.476799,79.0,89.0,99.0,129.0,199.0
weight,5.0,408.0,75.299402,300.0,380.0,410.0,450.0,500.0


In [23]:
df.describe(include='category')

Unnamed: 0,size,color,gender,bought
count,5,5,5,5
unique,3,3,2,2
top,L,green,female,yes
freq,2,2,3,3


In [24]:
df.describe(include='category').T

Unnamed: 0,count,unique,top,freq
size,5,3,L,2
color,5,3,green,2
gender,5,2,female,3
bought,5,2,yes,3


In [25]:
label_encoder=LabelEncoder()
df['size']=label_encoder.fit_transform(df['size'])
df['color']=label_encoder.fit_transform(df['color'])
df['gender']=label_encoder.fit_transform(df['gender'])
df['bought']=label_encoder.fit_transform(df['bought'])

In [26]:
df

Unnamed: 0,size,color,gender,price,weight,bought
0,2,2,0,199.0,500,1
1,0,1,1,89.0,450,0
2,1,0,1,99.0,300,1
3,0,1,0,129.0,380,0
4,1,2,0,79.0,410,1


In [27]:
label_encoder.classes_

array(['no', 'yes'], dtype=object)

In [28]:
label_encoder.inverse_transform(df['size'])

ValueError: y contains previously unseen labels: [2]

In [29]:
df=df_raw.copy()

In [30]:
from sklearn.preprocessing import OneHotEncoder

In [39]:
df['size'].unique()

array(['XL', 'L', 'M'], dtype=object)

In [40]:
df['size'].nunique()

3

In [37]:
encoder=OneHotEncoder(sparse_output=False)
encoder.fit_transform(df[['size']])

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [35]:
encoder.categories_

[array(['L', 'M', 'XL'], dtype=object)]

In [None]:
encoder.res

In [43]:
df_encoder_size=pd.DataFrame(data=encoder.fit_transform(df[['size']]),
                             columns=['is_XL', 'is_L', 'is_M'])
df_encoder_size

Unnamed: 0,is_XL,is_L,is_M
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0
4,0.0,1.0,0.0


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   size    5 non-null      object 
 1   color   5 non-null      object 
 2   gender  5 non-null      object 
 3   price   5 non-null      float64
 4   weight  5 non-null      int64  
 5   bought  5 non-null      object 
dtypes: float64(1), int64(1), object(4)
memory usage: 372.0+ bytes


In [48]:
for ob_c in df.columns:
  if df[ob_c].dtype==object:
    print(df[ob_c].unique())

['XL' 'L' 'M']
['red' 'green' 'blue']
['female' 'male']
['yes' 'no']


In [65]:
encoder=OneHotEncoder(drop='first',sparse_output=False)

In [66]:
encoder.fit_transform(df[['size','color','gender','bought']]).shape

(5, 6)

In [71]:
df_encoder_values=pd.DataFrame(data=encoder.fit_transform(df[['size','color','gender','bought']]),
                               columns=[ 'is_l','is_m','is_green','is_blue','is_male' ,'is__bought']
                               )

In [70]:
df_encoder_values

Unnamed: 0,is_l,is_m,is_green,is_blue,is_male,is_not_bought
0,0.0,1.0,0.0,1.0,0.0,1.0
1,0.0,0.0,1.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,1.0


In [73]:
df_with_encoded_values=pd.concat([df,df_encoder_values],axis=1)

In [74]:
df_with_encoded_values

Unnamed: 0,size,color,gender,price,weight,bought,is_l,is_m,is_green,is_blue,is_male,is__bought
0,XL,red,female,199.0,500,yes,0.0,1.0,0.0,1.0,0.0,1.0
1,L,green,male,89.0,450,no,0.0,0.0,1.0,0.0,1.0,0.0
2,M,blue,male,99.0,300,yes,1.0,0.0,0.0,0.0,1.0,1.0
3,L,green,female,129.0,380,no,0.0,0.0,1.0,0.0,0.0,0.0
4,M,red,female,79.0,410,yes,1.0,0.0,0.0,1.0,0.0,1.0


In [80]:
df_with_dummies=pd.concat([df,pd.get_dummies(df,drop_first=True,dtype=int)],axis=1)

In [81]:
df_with_dummies

Unnamed: 0,size,color,gender,price,weight,bought,price.1,weight.1,size_M,size_XL,color_green,color_red,gender_male,bought_yes
0,XL,red,female,199.0,500,yes,199.0,500,0,1,0,1,0,1
1,L,green,male,89.0,450,no,89.0,450,0,0,1,0,1,0
2,M,blue,male,99.0,300,yes,99.0,300,1,0,0,0,1,1
3,L,green,female,129.0,380,no,129.0,380,0,0,1,0,0,0
4,M,red,female,79.0,410,yes,79.0,410,1,0,0,1,0,1


In [85]:
pd.get_dummies(df,dtype='int')

Unnamed: 0,price,weight,size_L,size_M,size_XL,color_blue,color_green,color_red,gender_female,gender_male,bought_no,bought_yes
0,199.0,500,0,0,1,0,0,1,1,0,0,1
1,89.0,450,1,0,0,0,1,0,0,1,1,0
2,99.0,300,0,1,0,1,0,0,0,1,0,1
3,129.0,380,1,0,0,0,1,0,1,0,1,0
4,79.0,410,0,1,0,0,0,1,1,0,0,1


In [87]:
pd.get_dummies(df,dtype='int',prefix='new',prefix_sep='-',drop_first=True)

Unnamed: 0,price,weight,new-M,new-XL,new-green,new-red,new-male,new-yes
0,199.0,500,0,1,0,1,0,1
1,89.0,450,0,0,1,0,1,0
2,99.0,300,1,0,0,0,1,1
3,129.0,380,0,0,1,0,0,0
4,79.0,410,1,0,0,1,0,1


In [88]:
pd.get_dummies(df,dtype='int',prefix='new',prefix_sep='-',drop_first=True).T.duplicated()

Unnamed: 0,0
price,False
weight,False
new-M,False
new-XL,False
new-green,False
new-red,False
new-male,False
new-yes,False


Standaryzacja

In [90]:
from sklearn.preprocessing import StandardScaler
std=StandardScaler()
df=df_raw.copy()

In [91]:
standarized_numerical_values=std.fit_transform(df.select_dtypes('number'))

In [93]:
standarized_numerical_values_df=pd.DataFrame(data=standarized_numerical_values,
                                             columns=['standarized_price','standarized_weight'])

In [94]:
standarized_numerical_values_df

Unnamed: 0,standarized_price,standarized_weight
0,1.845062,1.366002
1,-0.691898,0.62361
2,-0.461266,-1.603567
3,0.230633,-0.41574
4,-0.922531,0.029696


In [95]:
(df['price']-df['price'].mean())/(df['price'].std())

Unnamed: 0,price
0,1.650274
1,-0.618853
2,-0.412568
3,0.206284
4,-0.825137


In [96]:
(df['weight']-df['weight'].mean())/(df['weight'].std())

Unnamed: 0,weight
0,1.221789
1,0.557773
2,-1.434274
3,-0.371849
4,0.026561


In [97]:
df['price'].std(),df['weight'].std()

(48.47679857416329, 75.2994023880668)

In [99]:
def standarized_columns(df,c):
  return (df[c]-df[c].mean())/(df[c].std())

In [100]:
standarized_columns(df,'price')

Unnamed: 0,price
0,1.650274
1,-0.618853
2,-0.412568
3,0.206284
4,-0.825137


In [101]:
standarized_columns(df,'weight')

Unnamed: 0,weight
0,1.221789
1,0.557773
2,-1.434274
3,-0.371849
4,0.026561


In [105]:
from sklearn.preprocessing import scale
scale(df['price']),std.fit_transform(df[['price']])

(array([ 1.84506242, -0.69189841, -0.4612656 ,  0.2306328 , -0.92253121]),
 array([[ 1.84506242],
        [-0.69189841],
        [-0.4612656 ],
        [ 0.2306328 ],
        [-0.92253121]]))

In [106]:
df[['price','weight']]=std.fit_transform(df[['price','weight']])

In [107]:
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,1.845062,1.366002,yes
1,L,green,male,-0.691898,0.62361,no
2,M,blue,male,-0.461266,-1.603567,yes
3,L,green,female,0.230633,-0.41574,no
4,M,red,female,-0.922531,0.029696,yes


In [108]:
df=df_raw.copy()

In [109]:
label_encoder=LabelEncoder()
standarder_scaller=StandardScaler()
df['bought']=label_encoder.fit_transform(df['bought'])
df[['price','weight']]=standarder_scaller.fit_transform(df[['price','weight']])

In [111]:
df=pd.get_dummies(df,drop_first=True,dtype='int',prefix='new',prefix_sep='-')

In [112]:
df

Unnamed: 0,price,weight,bought,new-M,new-XL,new-green,new-red,new-male
0,1.845062,1.366002,1,0,1,0,1,0
1,-0.691898,0.62361,0,0,0,1,0,1
2,-0.461266,-1.603567,1,1,0,0,0,1
3,0.230633,-0.41574,0,0,0,1,0,0
4,-0.922531,0.029696,1,1,0,0,1,0


In [113]:
features_data=df.drop(columns='bought',axis=1)
target_data=df['bought']