In [1]:
import pandas as pd
import numpy as np
import sklearn

### 1. Data

In [2]:
# Toy dataset for the missing-data examples: np.nan marks the gaps,
# which appear in both categorical (size, gender) and numeric (price, weight) columns.
data = dict(
    size=['XL', 'L', 'M', np.nan, 'M', 'M'],
    color=['red', 'green', 'blue', 'green', 'red', 'green'],
    gender=['female', 'male', np.nan, 'female', 'female', 'male'],
    price=[199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    weight=[500, 450, 300, np.nan, 410, np.nan],
    bought=['yes', 'no', 'yes', 'no', 'yes', 'no'],
)

# Keep the untouched original under its own name; later cells work on copies.
df_raw = pd.DataFrame(data)
df_raw

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


In [3]:
# Work on a copy so that df_raw stays untouched and can be used to reset df later.
df = df_raw.copy()

### 2. Sprawdzanie braków

In [6]:
# Boolean mask of the whole frame: True marks a missing (NaN) value

df.isnull()

Unnamed: 0,size,color,gender,price,weight,bought
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,True,True,False,False
3,True,False,False,False,True,False
4,False,False,False,False,False,False
5,False,False,False,False,True,False


In [7]:
# Count the missing values in each column
df.isnull().sum()

size      1
color     0
gender    1
price     1
weight    2
bought    0
dtype: int64

In [8]:
# Total number of missing values in the whole frame
df.isnull().sum().sum()

5

In [12]:
# Percentage of missing values per column.
# This matters in practice: when more than roughly 10-20% of a variable is
# missing, it is sometimes better to drop that variable altogether or to use
# a different method of filling in the gaps.

df.isnull().sum() / len(df) * 100

size      16.666667
color      0.000000
gender    16.666667
price     16.666667
weight    33.333333
bought     0.000000
dtype: float64

### 3. Uzupełnianie braków - klasa SimpleImputer

In [13]:
from sklearn.impute import SimpleImputer

In [17]:
# Inspect the 'weight' column before imputation
df['weight']

0    500.0
1    450.0
2    300.0
3      NaN
4    410.0
5      NaN
Name: weight, dtype: float64

In [19]:
# Available strategies: 'mean', 'median', 'most_frequent', 'constant'.
# Here np.nan entries are treated as missing and will be replaced by the column mean.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# fit() only learns the fill value from the column; nothing in df changes yet.
# A one-column DataFrame (double brackets) is passed because the imputer expects 2-D input.
imputer.fit(df[['weight']])

SimpleImputer()

In [18]:
# The mean value that was computed during fit():

imputer.statistics_

array([415.])

In [23]:
# Replace the NaN entries in 'weight' with the value learned by fit().
# transform() returns a new array, so the result has to be assigned back
# to the column for the imputation to take effect.
df['weight'] = imputer.transform(df[['weight']])

In [24]:
# Show the frame after imputation: 'weight' no longer contains NaN
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


#### - uzupełnianie braków - zmienna 'price', strategy = 'constant'

In [26]:
# Fill missing prices with a fixed value instead of a computed statistic.
imputer = SimpleImputer(
    missing_values=np.nan,
    strategy='constant',
    fill_value=99.0,
)

# The imputed array is only displayed; it is not written back to df.
imputer.fit_transform(df[['price']])

array([[199.],
       [ 89.],
       [ 99.],
       [129.],
       [ 79.],
       [ 89.]])

#### - uzupełnianie braków - zmienna 'size' - kategoryczna

In [28]:
# The 'constant' strategy also works for a categorical column: use a string fill value.
imputer = SimpleImputer(
    missing_values=np.nan,
    strategy='constant',
    fill_value='L',
)

# The imputed array is only displayed; it is not written back to df.
imputer.fit_transform(df[['size']])

array([['XL'],
       ['L'],
       ['M'],
       ['L'],
       ['M'],
       ['M']], dtype=object)

#### - uzupełnianie braków - strategy = 'most_frequent'

In [30]:
# Fill missing sizes with the value that occurs most often in the column.
imputer = SimpleImputer(
    missing_values=np.nan,
    strategy='most_frequent',
)

# The imputed array is only displayed; it is not written back to df.
imputer.fit_transform(df[['size']])

array([['XL'],
       ['L'],
       ['M'],
       ['M'],
       ['M'],
       ['M']], dtype=object)

### 4. Uzupełnianie braków - pandas

In [31]:
# Start again from the raw data so the pandas examples see the original NaN values.
df = df_raw.copy()

In [32]:
# Confirm the frame is back in its raw state, with the missing values present again
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


In [33]:
# Count the missing values in each column
df.isnull().sum()

size      1
color     0
gender    1
price     1
weight    2
bought    0
dtype: int64

In [36]:
# Fraction (0-1) of missing values in each column
df.isnull().sum() / len(df)

size      0.166667
color     0.000000
gender    0.166667
price     0.166667
weight    0.333333
bought    0.000000
dtype: float64

#### - zwracanie maski logicznej - można za pomocą isnull i notnull!!

In [37]:
# Boolean mask for a single column: True where 'weight' is missing
pd.isnull(df['weight'])

0    False
1    False
2    False
3     True
4    False
5     True
Name: weight, dtype: bool

#### - wycinanie wierszy z wartością null w zmiennej weight

In [38]:
# Use the mask to select only the rows where 'weight' is missing
df[pd.isnull(df['weight'])]

Unnamed: 0,size,color,gender,price,weight,bought
3,,green,female,129.0,,no
5,M,green,male,89.0,,no


#### - wycinanie wierszy bez NaN

In [40]:
# Select only the rows where 'weight' is present.
# pd.notnull is the element-wise complement of pd.isnull, so no explicit negation is needed.
df[pd.notnull(df['weight'])]

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
4,M,red,female,79.0,410.0,yes


### 5. Uzupełnianie braków w pandas - fillna

In [41]:
# Starting point for the fillna examples: the raw data with NaN still present
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


In [42]:
# Replace every NaN with the placeholder string 'brak' (Polish for "missing").
# Note that numeric columns then mix numbers and strings. The result is only displayed; df is not modified.
df.fillna('brak')

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,brak,brak,300.0,yes
3,brak,green,female,129.0,brak,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,brak,no


In [43]:
# Replace every NaN with 0. Note that this also puts 0 into the text columns ('size', 'gender').
# The result is only displayed; df is not modified.
df.fillna(0)

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,0,0.0,300.0,yes
3,0,green,female,129.0,0.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,0.0,no


In [44]:
# Fill NaN in a single column only: missing sizes become 'L'.
# This returns a new Series; df is not modified.
df['size'].fillna('L')

0    XL
1     L
2     M
3     L
4     M
5     M
Name: size, dtype: object

### 6. Usuwanie wierszy z brakami


In [45]:
# Drop every row that contains at least one NaN (returns a new frame)
df.dropna()

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
4,M,red,female,79.0,410.0,yes


In [52]:
# dropna() returned a new frame; df itself still contains all rows
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


In [53]:
# Drop rows that contain more than one NaN

df.dropna(thresh = 5) # thresh=5: a row needs at least 5 non-NaN values to be kept, so rows with 2 NaN are removed

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no
