## Zaczynamy od zaimportowania potrzebnych bibliotek

In [None]:
import pandas as pd
import numpy as np

## Importujemy dane z pliku

In [2]:
# Load the raw CSV; skipinitialspace=True makes the parser skip the spaces
# that follow each delimiter, so field values arrive without leading blanks.
df = pd.read_csv(
    filepath_or_buffer='../data/01_raw/data.csv',
    skipinitialspace=True,
)

## Podstawowe informacje

In [3]:
# Report the frame's dimensions, then preview its first rows.
print(f"Liczba wierszy: {len(df)}")
print(f"Liczba kolumn: {len(df.columns)}")
print("\nPierwsze 5 wierszy:")
# Last expression of the cell, so the notebook renders it as a rich table.
df.head(5)

Liczba wierszy: 500
Liczba kolumn: 15

Pierwsze 5 wierszy:


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Informacje o typach danych w poszczególnych kolumnach
Jak możemy zobaczyć, z pozoru nie ma w tym zestawie danych żadnych wartości null, jednak to nieprawda. Takie wartości są w nim jak najbardziej obecne, lecz oznaczono je znakiem `?`.

In [4]:
# Print a per-column summary (non-null counts and dtypes) of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             500 non-null    int64 
 1   workclass       500 non-null    object
 2   fnlwgt          500 non-null    int64 
 3   education       500 non-null    object
 4   education-num   500 non-null    int64 
 5   marital-status  500 non-null    object
 6   occupation      500 non-null    object
 7   relationship    500 non-null    object
 8   race            500 non-null    object
 9   sex             500 non-null    object
 10  capital-gain    500 non-null    int64 
 11  capital-loss    500 non-null    int64 
 12  hours-per-week  500 non-null    int64 
 13  native-country  500 non-null    object
 14  income          500 non-null    object
dtypes: int64(6), object(9)
memory usage: 58.7+ KB


## Zamiana `?` na wartości `null`
Po zamianie możemy zobaczyć faktyczną liczbę brakujących danych.

In [5]:
# Convert every '?' placeholder into a real missing value (NaN) so that
# pandas' null-aware tooling can see it.
df = df.replace(to_replace='?', value=np.nan)

# Count the rows that have at least one missing field, then print the
# per-column summary again to show the updated non-null counts.
row_with_missing_data = (
    df.isna()
    .any(axis='columns')
    .sum()
)
print(f'Wiersze z brakującymi danymi: {row_with_missing_data}\n')
df.info()

Wiersze z brakującymi danymi: 39

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             500 non-null    int64 
 1   workclass       469 non-null    object
 2   fnlwgt          500 non-null    int64 
 3   education       500 non-null    object
 4   education-num   500 non-null    int64 
 5   marital-status  500 non-null    object
 6   occupation      469 non-null    object
 7   relationship    500 non-null    object
 8   race            500 non-null    object
 9   sex             500 non-null    object
 10  capital-gain    500 non-null    int64 
 11  capital-loss    500 non-null    int64 
 12  hours-per-week  500 non-null    int64 
 13  native-country  490 non-null    object
 14  income          500 non-null    object
dtypes: int64(6), object(9)
memory usage: 58.7+ KB


## Oczyszczanie danych
W celu oczyszczenia zbioru danych z brakujących danych możemy usunąć wiersze posiadające takie braki.

In [6]:
# Keep only the rows in which every column holds a value; the original
# frame is left untouched and the result gets its own name.
df_cleaned = df.dropna(axis='index', how='any')

# Number of rows discarded = row count before minus row count after.
removed_rows = df.shape[0] - df_cleaned.shape[0]
print(f'Usunięto {removed_rows} wierszy.\n')

# Summarise the cleaned frame to confirm that no nulls remain.
df_cleaned.info()

Usunięto 39 wierszy.

<class 'pandas.core.frame.DataFrame'>
Index: 461 entries, 0 to 498
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             461 non-null    int64 
 1   workclass       461 non-null    object
 2   fnlwgt          461 non-null    int64 
 3   education       461 non-null    object
 4   education-num   461 non-null    int64 
 5   marital-status  461 non-null    object
 6   occupation      461 non-null    object
 7   relationship    461 non-null    object
 8   race            461 non-null    object
 9   sex             461 non-null    object
 10  capital-gain    461 non-null    int64 
 11  capital-loss    461 non-null    int64 
 12  hours-per-week  461 non-null    int64 
 13  native-country  461 non-null    object
 14  income          461 non-null    object
dtypes: int64(6), object(9)
memory usage: 57.6+ KB
