In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Co jak nie ma wszystkich wartości danych? (sklearn)
1. Wypełnić brakujące dane (jak jesteśmy w stanie przewidzieć te dane).
2. Usunąć wiersze z brkującymi danymi (jak mamy wystarczojącą ilość danych żeby nie zepsuć uczenia).

In [2]:
# wczytanie danych z dysku
car_sales_missing = pd.read_csv('data/car-sales-extended-missing-data.csv')
car_sales_missing # dane niekompletne tam gdzie jest NaN to brakuje pozycji

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [3]:
# wyświetlenie ile i gdzie brakuje danych (kolumna i ilość)
car_sales_missing.isna().sum() 

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

## Usuwanie wierszy

In [4]:
# usunięcie wierszy tych w których brakuje ceny, i tak ich nie wymyślimy, a ze średniej nie ma sensu brać
# car_sales_missing.dropna(inplace=True) # bez wskazania na kolumnę, jeżeli były by dane we innych kolumnach to też je usunie
# usunięcie wierszy ale tylko tych gdzie brakuje w kolumnie 'Price' danych
car_sales_missing.dropna(subset=['Price'], inplace=True)

In [5]:
# wyświetlenie ile i gdzie brakuje danych (kolumna i ilość)
car_sales_missing.isna().sum() 

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [6]:
# tyle wierszy po usunięciu braków w cenach
len(car_sales_missing)

950

In [7]:
# rozdzielenie danych na wejściowe i wyjściowe (odzielenie kolumny wynikowej z ceną, ona jest numeryczna)
X = car_sales_missing.drop('Price', axis=1)
y = car_sales_missing['Price'] 

### Uzupełnienie danych (sklearn)

In [8]:
# wypełnianie przy pomocy Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
# fill categorial values with 'missing' & numerical values with mean
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
door_imputer = SimpleImputer(strategy='constant', fill_value=4)
num_imputer = SimpleImputer(strategy='mean')
# define columns
cat_features = ['Make', 'Colour']
door_features = ['Doors']
num_features = ["Odometer (KM)"]
# create imputer
imputer = ColumnTransformer([("cat_imputer", cat_imputer, cat_features), 
                             ('door_imputer', door_imputer, door_features),
                             ('num_imputer', num_imputer, num_features)])
# transform data
filled_X = imputer.fit_transform(X, dict)
X = pd.DataFrame(filled_X, columns=["Make", "Colour", "Doors", "Odometer (KM)"])
X

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4.0,35431.0
1,BMW,Blue,5.0,192714.0
2,Honda,White,4.0,84714.0
3,Toyota,White,4.0,154365.0
4,Nissan,Blue,3.0,181577.0
...,...,...,...,...
945,Toyota,Black,4.0,35820.0
946,missing,White,3.0,155144.0
947,Nissan,Blue,4.0,66604.0
948,Honda,White,4.0,215883.0


In [9]:
# wyświetlenie ile i gdzie brakuje danych (kolumna i ilość)
X.isna().sum() 

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [10]:
X

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4.0,35431.0
1,BMW,Blue,5.0,192714.0
2,Honda,White,4.0,84714.0
3,Toyota,White,4.0,154365.0
4,Nissan,Blue,3.0,181577.0
...,...,...,...,...
945,Toyota,Black,4.0,35820.0
946,missing,White,3.0,155144.0
947,Nissan,Blue,4.0,66604.0
948,Honda,White,4.0,215883.0


## Zamiana stringów na numeryki

In [11]:
# zamiana stringów 'Make', 'Colour', 'Doors' na dane numeryczne
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer

# Wybór kolumn
categorical_features = ['Make', 'Colour', 'Doors']
# Koduj cechy kategoryczne jako tablicę numeryczną z jednym punktem.
one_hot = OneHotEncoder()
# Stosuje transformatory do kolumn tablicy lub pandas DataFrame.
transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)], remainder='passthrough') 
# Dopasuj wszystkie transformatory, przekształć dane i połącz wyniki.
transformed_X = transformer.fit_transform(X, dict) 
# DataFrame gotowe do ML
X = pd.DataFrame.sparse.from_spmatrix(transformed_X) # można pozmieniać nazwy kolumn używając parametru 'column=['nazwa1', ....]'
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
945,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
946,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,155144.0
947,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
948,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,215883.0
