# Predspracovanie dat

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import warnings
# Show every column when a DataFrame is displayed (None removes the column cap).
# Uses the documented top-level pd.set_option rather than the pd.pandas alias.
pd.set_option('display.max_columns', None)
# Silence all warnings for the rest of the session.
# NOTE(review): this hides every warning category, including ones raised by
# pandas and scikit-learn in the cells below - consider narrowing the filter.
warnings.simplefilter(action='ignore')

In [6]:
# Load the raw Titanic training data from a CSV file in the working directory.
data = pd.read_csv('train_titanic.csv')
# Print (rows, columns), then show the first rows as the cell output.
print(data.shape)
data.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Hold out 10% of the rows as the test split; random_state fixes the shuffle so
# the split is reproducible. The whole frame (including 'Survived') is passed as
# the feature matrix on purpose: later cells read X_train['Survived'] for the
# rare-label and target-ordered encoding steps, and exclude it again when the
# columns to scale are selected.
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    data['Survived'],
                                                    test_size=0.1,
                                                    random_state=0)  

# Show the shapes of both splits as the cell output.
X_train.shape, X_test.shape

((801, 12), (90, 12))

## Chybajuce hodnoty

### Nominalne atributy - nahradenie s 'missing'

In [11]:
# All nominal (object-dtype) columns of the training split. Stored under its own
# name so that `vars_with_na` only ever means "columns that contain missing values".
nominal_vars = [var for var in X_train.columns if X_train[var].dtype == 'O']
print('Nominalne atributy: ', nominal_vars)

Nominalne atributy:  ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [9]:
# Nominal (object-dtype) columns that have at least one missing value in the
# training split; the test split is reported for the same columns.
vars_with_na = [
    col for col in data.columns
    if X_train[col].dtypes == 'O' and X_train[col].isnull().any()
]
train_na_counts = X_train[vars_with_na].isnull().sum()
test_na_counts = X_test[vars_with_na].isnull().sum()
print('missing values in train: \n', train_na_counts)
print('missing values in test: \n', test_na_counts)

missing values in train: 
 Cabin       622
Embarked      2
dtype: int64
missing values in test: 
 Cabin       65
Embarked     0
dtype: int64


In [10]:
# Replace missing nominal values with the explicit category 'Missing' in both splits.
for frame in (X_train, X_test):
    frame[vars_with_na] = frame[vars_with_na].fillna('Missing')

# Sanity check: the listed columns should now report zero missing values.
print('\nCHECK - missing values in train: \n', X_train[vars_with_na].isnull().sum())
print('\nCHECK - missing values in test: \n', X_test[vars_with_na].isnull().sum())


CHECK - missing values in train: 
 Cabin       0
Embarked    0
dtype: int64

CHECK - missing values in test: 
 Cabin       0
Embarked    0
dtype: int64


### Numericke atributy - nahradenie s mode hodnotou

In [12]:
# All numeric (non-object-dtype) columns of the training split. Stored under its
# own name so that `vars_with_na` only ever means "columns that contain missing values".
numeric_vars = [var for var in X_train.columns if X_train[var].dtype != 'O']
print('Numericke atributy: ', numeric_vars)

Numericke atributy:  ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


In [13]:
# Numeric (non-object-dtype) columns that have at least one missing value in the
# training split; the test split is reported for the same columns.
vars_with_na = [
    col for col in data.columns
    if X_train[col].dtypes != 'O' and X_train[col].isnull().any()
]
train_na_counts = X_train[vars_with_na].isnull().sum()
test_na_counts = X_test[vars_with_na].isnull().sum()
print('missing values in train: \n', train_na_counts)
print('missing values in test: \n', test_na_counts)

missing values in train: 
 Age    158
dtype: int64
missing values in test: 
 Age    19
dtype: int64


In [14]:
for var in vars_with_na:
    # The most frequent value is learned from the training split only and is
    # reused for the test split.
    fill_value = X_train[var].mode()[0]
    for frame in (X_train, X_test):
        # Binary indicator column '<var>_na': 1 where the value was missing, else 0.
        # It must be built before the gaps are filled.
        frame[var + '_na'] = np.where(frame[var].isnull(), 1, 0)
        # Fill the gaps with the training mode.
        frame[var] = frame[var].fillna(fill_value)

In [16]:
# Sanity check: the imputed numeric columns should report zero missing values in both splits.
print('CHECK - missing values in train: \n', X_train[vars_with_na].isnull().sum())
print('CHECK - missing values in test: \n', X_test[vars_with_na].isnull().sum())

CHECK - missing values in train: 
 Age    0
dtype: int64
CHECK - missing values in test: 
 Age    0
dtype: int64


In [19]:
# Final check over every column of the training split, including the added '<var>_na' indicators.
print('\nAll data check: \n ', X_train.isnull().sum())


All data check: 
  PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Age_na         0
dtype: int64


## Nahradenie rare hodnot hodnotou 'Rare'

In [20]:
# Nominal (object-dtype) columns of the training split, in column order.
cat_vars = [col for col, col_dtype in X_train.dtypes.items() if col_dtype == 'O']
print('Nominalne atributy: ', cat_vars)

Nominalne atributy:  ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [23]:
def find_frequent_labels(df, var, rare_perc):
    """Return the labels of `var` whose relative frequency exceeds `rare_perc`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column `var`. It is only read, never modified.
    var : str
        Name of the categorical column to inspect.
    rare_perc : float
        Frequency threshold as a fraction of all rows; a label is kept only
        when its share is strictly greater than this value.

    Returns
    -------
    pandas.Index
        The frequent labels, sorted by label and named after `var`.
    """
    # Rows per label divided by the total number of rows. Counting rows with
    # size() removes the dependency on a 'Survived' column being present, and
    # no defensive copy is needed because `df` is not mutated.
    label_share = df.groupby(var).size() / len(df)

    return label_share[label_share > rare_perc].index

In [24]:
for var in cat_vars:
    # Labels that cover more than 1% of the training rows.
    frequent_ls = find_frequent_labels(X_train, var, 0.01)
    print('Frequent label: ', frequent_ls)
    print()

    # Keep frequent labels and collapse every other value into 'Rare'. The label
    # set learned on the training split is applied to both splits.
    for frame in (X_train, X_test):
        is_frequent = frame[var].isin(frequent_ls)
        frame[var] = np.where(is_frequent, frame[var], 'Rare')


Frequent label:  Index([], dtype='object', name='Name')

Frequent label:  Index(['female', 'male'], dtype='object', name='Sex')

Frequent label:  Index([], dtype='object', name='Ticket')

Frequent label:  Index(['Missing'], dtype='object', name='Cabin')

Frequent label:  Index(['C', 'Q', 'S'], dtype='object', name='Embarked')



## Zakodovanie nominalnych atributov na cisla

In [25]:
# Replace string categories with integers ordered by the mean of the target:
# the category with the lowest mean target gets 0, the next one 1, and so on.

def replace_categories(train, test, var, target):
    """Ordinal-encode column `var` of both frames in place, ranked by mean `target`.

    The ranking is computed from `train` only: categories are sorted by their
    mean `target` value in ascending order and numbered from 0. The same
    mapping is then applied to `train[var]` and `test[var]`. Values that are
    not keys of the mapping become NaN. Returns None.
    """
    # Mean target per category, ascending.
    target_means = train.groupby(var)[target].mean()
    ranked_labels = target_means.sort_values().index

    # Category -> rank lookup.
    label_to_rank = dict(zip(ranked_labels, range(len(ranked_labels))))

    # Overwrite the column in both frames with the learned ranks.
    for frame in (train, test):
        frame[var] = frame[var].map(label_to_rank)

In [26]:
# Encode every nominal column in place, ranking its labels by the mean of
# 'Survived' computed on the training split.
for var in cat_vars:
    replace_categories(X_train, X_test, var, 'Survived')

In [30]:
# Preview the training split after the nominal columns were replaced by integer ranks.
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_na
815,816,0,1,0,0,24.0,0,0,0,0.0,1,0,1
877,878,0,3,0,0,19.0,0,0,0,7.8958,0,0,0
193,194,1,2,0,0,3.0,1,1,0,26.0,1,0,0
523,524,1,1,0,1,44.0,0,1,0,57.9792,1,2,0
634,635,0,3,0,1,9.0,3,2,0,27.9,0,0,0


## Normalizacia rozsahu hodnot

In [33]:
# Columns to scale: everything except the row identifier and the target.
non_feature_cols = {'PassengerId', 'Survived'}
train_vars = [col for col in X_train.columns if col not in non_feature_cols]
print('trenovacie premenne: ', train_vars)
len(train_vars)

trenovacie premenne:  ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Age_na']


11

In [34]:
# Learn the per-column min/max from the training features only, then apply the
# same fitted scaler to both splits so the test split is scaled with training statistics.
scaler = MinMaxScaler().fit(X_train[train_vars])

for frame in (X_train, X_test):
    frame[train_vars] = scaler.transform(frame[train_vars])

In [35]:
# Preview the training split after min-max scaling of the feature columns.
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_na
815,816,0,0.0,0.0,0.0,0.294088,0.0,0.0,0.0,0.0,1.0,0.0,1.0
877,878,0,1.0,0.0,0.0,0.23106,0.0,0.0,0.0,0.015412,0.0,0.0,0.0
193,194,1,0.5,0.0,0.0,0.029371,0.125,0.166667,0.0,0.050749,1.0,0.0,0.0
523,524,1,0.0,0.0,1.0,0.546199,0.0,0.166667,0.0,0.113168,1.0,0.666667,0.0
634,635,0,1.0,0.0,1.0,0.105004,0.375,0.333333,0.0,0.054457,0.0,0.0,0.0


In [36]:
# Persist the preprocessed splits without the index. Both files still contain
# the 'PassengerId' and 'Survived' columns, which were never dropped from
# X_train / X_test.
X_train.to_csv('xtrain.csv', index=False)
X_test.to_csv('xtest.csv', index=False)