# Cardinality Exercise
* Variables with too many labels tend to dominate over those with only a few labels
* A big number of labels within a variable may introduce noise and underfitting
* High cardinality leadsa to uneven category distribution between training and test sets and some of the labels may be present only in the trainin data set but not the test

In [67]:
import pandas as pd
import numpy as np
import matplotlib as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split

In [68]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [69]:
print('Dataframe Shape:', df.shape)
print('Cardianlity of Name col:', len(df['Name'].unique()))
print('Cardianlity of Sex col', len(df['Sex'].unique()))
print('Cardinality of Cabin col:', len(df['Cabin'].unique()))

Dataframe Shape: (891, 13)
Cardianlity of Name col: 891
Cardianlity of Sex col 2
Cardinality of Cabin col: 148


In [70]:
c_df = df[['Cabin', 'Sex', 'Survived']]
c_df.head()

Unnamed: 0,Cabin,Sex,Survived
0,,male,0
1,C85,female,1
2,,female,1
3,C123,female,1
4,,male,0


In [71]:
c_df['Cabin'].str[0].unique()

array([nan, 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [72]:
c_df['Cabin'].str[0].fillna('n').unique()

array(['n', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [73]:
c_df['cabin_reduced'] = c_df['Cabin'].str[0].fillna('n')
c_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Cabin,Sex,Survived,cabin_reduced
0,,male,0,n
1,C85,female,1,C
2,,female,1,n
3,C123,female,1,C
4,,male,0,n


In [74]:
print(len(c_df['Cabin'].unique()))
print(len(c_df['cabin_reduced'].unique()))
print(len(c_df['Sex'].unique()))

148
9
2


## Exploring values missing between train and test sets

* Showing how values can be missing from the train and test data after the train test split.
* This can effect the model and cause underfitting leding to low train set accuracy.

In [75]:
use_cols = ['Cabin', 'cabin_reduced', 'Sex']
X_train, X_test, y_train, y_test = train_test_split(c_df[use_cols], 
                                                   c_df['Survived'],
                                                    test_size = 0.4,
                                                    random_state = 32) 

In [76]:
X_train.shape, X_test.shape

((534, 3), (357, 3))

In [77]:
train_cabin_unique = X_train['Cabin'].unique()
test_cabin_unique = X_test['Cabin'].unique()
train_cabin_unique, test_cabin_unique

(array([nan, 'C124', 'C22 C26', 'B28', 'G6', 'E77', 'D19', 'D9', 'C30',
        'A10', 'D11', 'A23', 'T', 'C52', 'C23 C25 C27', 'D35', 'B35',
        'B86', 'C68', 'F E69', 'C123', 'A16', 'C104', 'B37', 'D26', 'C110',
        'B30', 'C46', 'C126', 'F33', 'C106', 'B49', 'A20', 'D36', 'B77',
        'A36', 'C99', 'E101', 'C125', 'B82 B84', 'C91', 'E58', 'C128',
        'B19', 'C101', 'C78', 'C49', 'B78', 'B51 B53 B55', 'D30', 'E67',
        'C90', 'B102', 'A6', 'F G73', 'F2', 'D46', 'D49', 'A19', 'E24',
        'C50', 'C118', 'C47', 'E8', 'B18', 'E68', 'D56', 'D17', 'B22',
        'B42', 'A34', 'D37', 'E49', 'B5', 'E33', 'B38', 'B20', 'E36',
        'C82', 'D33', 'C93', 'F4', 'E10', 'D48', 'C7', 'E46', 'E34',
        'C148', 'D47', 'B4', 'E38', 'D', 'E25', 'C65', 'E63', 'F G63',
        'D20', 'C2', 'E40', 'E121', 'C83', 'B3', 'A32', 'E44', 'C87',
        'C54'], dtype=object),
 array([nan, 'C92', 'C2', 'B71', 'B80', 'F38', 'D', 'E31', 'D20', 'B101',
        'C95', 'B79', 'D35', 'C111', 

In [78]:
len(train_cabin_unique) - len(test_cabin_unique)

33

In [79]:
# 76 values in train set are not in the test 
len([x for x in train_cabin_unique if x not in test_cabin_unique])

76

In [80]:
# 43 values in test set are not in the train 
len([x for x in test_cabin_unique if x not in train_cabin_unique])

43

* just one value missing from the testing set from the reduced cardinality set

In [81]:
len([x for x in X_train['cabin_reduced'].unique() if x not in X_test['cabin_reduced'].unique()])

1

In [82]:
len([x for x in X_test['cabin_reduced'].unique() if x not in X_train['cabin_reduced'].unique()])

0

## Categorical Encoding 

In [83]:
X_train.isna().sum()

Cabin            408
cabin_reduced      0
Sex                0
dtype: int64

In [84]:
X_test.isna().sum()

Cabin            279
cabin_reduced      0
Sex                0
dtype: int64

In [85]:
X_train['Cabin'].fillna(0)
X_test['Cabin'].fillna(0)

837      0
334      0
849    C92
778      0
413      0
      ... 
383      0
752      0
124    D26
784      0
216      0
Name: Cabin, Length: 357, dtype: object

In [86]:
train = pd.get_dummies(X_train[['Cabin', 'Sex']], columns=['Cabin', 'Sex'])
test = pd.get_dummies(X_test[['Cabin', 'Sex']], columns=['Cabin', 'Sex'])

In [87]:
train.shape, test.shape

((534, 107), (357, 74))

### Balancing the columns

In [88]:
missing_cols = set(train.columns) - set(test.columns)

for c in missing_cols:
    test[c] = 0
    
test = test[train.columns]

In [89]:
clf = RandomForestClassifier(n_estimators = 200, random_state=32)
clf.fit(train, y_train)

RandomForestClassifier(n_estimators=200, random_state=32)

In [90]:
y_train_pred = clf.predict_proba(train)
y_test_pred = clf.predict_proba(test)

In [91]:
y_train_pred

array([[0.3935532 , 0.6064468 ],
       [0.95601304, 0.04398696],
       [0.87045605, 0.12954395],
       ...,
       [0.17895308, 0.82104692],
       [0.87045605, 0.12954395],
       [0.3935532 , 0.6064468 ]])

In [92]:
print(' High Cardinality Train Set', '\n', '{} roc_auc: {}'.format(
    'Random Forest', roc_auc_score(y_train, y_train_pred[:,1])))
      
print(' High Cardinality Test Set', '\n', '{} roc_auc: {}'.format(
    'Random Forest', roc_auc_score(y_test, y_test_pred[:,1])))

 High Cardinality Train Set 
 Random Forest roc_auc: 0.8697128814128494
 High Cardinality Test Set 
 Random Forest roc_auc: 0.7967273194882016


In [93]:
def run_models(data_train, data_test, y_train, y_test):

    lr = LogisticRegression(solver='lbfgs', random_state = 32)
    rf = RandomForestClassifier(n_estimators=200, random_state = 32)
    ada = AdaBoostClassifier(n_estimators=200, random_state = 32)
    gb = GradientBoostingClassifier(n_estimators=300, random_state = 32)
    
    models = {
        'LogisticRegression': lr,
        'Random Forest': rf,
        'Ada Boost': ada,
        'Gradient Boosting':gb 
          }
    
    train = pd.get_dummies(data_train, columns = data_train.columns)
    test = pd.get_dummies(data_test, columns = data_test.columns)
    
    missing_cols = set(train.columns) - set(test.columns)
    for col in missing_cols:
        test[col] = 0 
        
    test = test[train.columns]
    
    for label, clf in models.items():
        clf.fit(train, y_train)
        
        y_train_pred = clf.predict_proba(train)
        y_test_pred = clf.predict_proba(test)
        
        print(label)
        print('training set roc_auc: {}'.format(roc_auc_score(y_train, y_train_pred[:,1])))
        print('testing set roc_auc: {}'.format(roc_auc_score(y_test, y_test_pred[:,1])))
        print('\n')

In [94]:
cols = ['Cabin', 'Sex']
run_models(X_train[cols], X_test[cols], y_train, y_test)

LogisticRegression
training set roc_auc: 0.8415657961015371
testing set roc_auc: 0.798527615251077


Random Forest
training set roc_auc: 0.8697128814128494
testing set roc_auc: 0.7967273194882016


Ada Boost
training set roc_auc: 0.8651489017446918
testing set roc_auc: 0.7588568121905742


Gradient Boosting
training set roc_auc: 0.8574405079907805
testing set roc_auc: 0.7974185044685912




In [95]:
cols = ['cabin_reduced', 'Sex']
run_models(X_train[cols], X_test[cols], y_train, y_test)

LogisticRegression
training set roc_auc: 0.8133423900599881
testing set roc_auc: 0.8359801967466083


Random Forest
training set roc_auc: 0.814517729305634
testing set roc_auc: 0.8352407895582845


Ada Boost
training set roc_auc: 0.8130828995771832
testing set roc_auc: 0.8284896804475022


Gradient Boosting
training set roc_auc: 0.814517729305634
testing set roc_auc: 0.8352407895582845


