In [1]:
import pandas as pd
import numpy as np

In [95]:
titanic = pd.read_csv('data/titanic.csv')
titanic.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [82]:
#cleaning titanic dataset
def processing_titanic(dataframe):
    df= dataframe.copy()
    # Preprocessing
    df['family_size'] = df.SibSp + df.Parch + 1
    df['is_alone'] = (df.family_size == 0)
    df['has_cabin'] = df.Cabin.notnull()
    
    # Encoding Categorical Features
    df.drop('Cabin', axis=1, inplace=True)
    df.Sex = df.Sex.astype('category').cat.codes
    df.Embarked = df.Embarked.astype('category').cat.codes
    df.Name = df.Name.astype('category').cat.codes
    
    # Preprocessing Tickets
    df.Ticket = df.Ticket.str.split(' ').str[-1]
    
    # Handling Missing values
    df['Age_p'] = df.Age.isnull()
    df.loc[df.Age.isnull(), 'Age'] = df.Age.mean()
    df.Age = df.Age.astype('int')
    
    # Dropping records
    df.drop(df.loc[df.Ticket == 'LINE'].index, axis=0, inplace=True)
    
    return df.drop('Survived', axis = 1), df.Survived

In [83]:
x, y = processing_titanic(titanic)

In [84]:
x,y

(     PassengerId  Pclass  Name  Sex  Age  SibSp  Parch   Ticket     Fare  \
 0              1       3   108    1   22      1      0    21171   7.2500   
 1              2       1   190    0   38      1      0    17599  71.2833   
 2              3       3   353    0   26      0      0  3101282   7.9250   
 3              4       1   272    0   35      1      0   113803  53.1000   
 4              5       3    15    1   35      0      0   373450   8.0500   
 ..           ...     ...   ...  ...  ...    ...    ...      ...      ...   
 886          887       2   548    1   27      0      0   211536  13.0000   
 887          888       1   303    0   19      0      0   112053  30.0000   
 888          889       3   413    0   29      1      2     6607  23.4500   
 889          890       1    81    1   26      0      0   111369  30.0000   
 890          891       3   220    1   32      0      0   370376   7.7500   
 
      Embarked  family_size  is_alone  has_cabin  Age_p  
 0           2  

In [9]:
x.drop('Name', axis=1, inplace=True)

In [10]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
clf = RandomForestClassifier(100)

In [19]:
clf.fit(X_train, y_train)

RandomForestClassifier()

In [20]:
clf.score(X_test, y_test)

0.8202247191011236

In [21]:
clf.score(X_train, y_train)

1.0

In [22]:
clf.feature_importances_

array([0.1311207 , 0.06099808, 0.23449872, 0.12008865, 0.02518692,
       0.01932229, 0.13670859, 0.14481634, 0.02325973, 0.04499352,
       0.        , 0.04519836, 0.01380809])

In [24]:
feature_importance = pd.DataFrame({'importance' : clf.feature_importances_}, index = x.columns).sort_values('importance')

In [25]:
feature_importance

Unnamed: 0,importance
is_alone,0.0
Age_p,0.013808
Parch,0.019322
Embarked,0.02326
SibSp,0.025187
family_size,0.044994
has_cabin,0.045198
Pclass,0.060998
Age,0.120089
PassengerId,0.131121


In [26]:
to_keep = feature_importance[feature_importance.importance > 0.05].index

In [27]:
to_keep

Index(['Pclass', 'Age', 'PassengerId', 'Ticket', 'Fare', 'Sex'], dtype='object')

In [29]:
X_train, X_test, y_train, y_test = train_test_split(x[to_keep], y, test_size=0.2)

In [31]:
clf.fit(X_train, y_train)

RandomForestClassifier()

In [32]:
clf.score(X_test, y_test)

0.8314606741573034

In [33]:
x,y

(     PassengerId  Pclass  Sex  Age  SibSp  Parch   Ticket     Fare  Embarked  \
 0              1       3    1   22      1      0    21171   7.2500         2   
 1              2       1    0   38      1      0    17599  71.2833         0   
 2              3       3    0   26      0      0  3101282   7.9250         2   
 3              4       1    0   35      1      0   113803  53.1000         2   
 4              5       3    1   35      0      0   373450   8.0500         2   
 ..           ...     ...  ...  ...    ...    ...      ...      ...       ...   
 886          887       2    1   27      0      0   211536  13.0000         2   
 887          888       1    0   19      0      0   112053  30.0000         2   
 888          889       3    0   29      1      2     6607  23.4500         2   
 889          890       1    1   26      0      0   111369  30.0000         0   
 890          891       3    1   32      0      0   370376   7.7500         1   
 
      family_size  is_alon

In [38]:
#LOFO

result = {}

for col in x.columns:
    temp_x = x.drop(col, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(temp_x, y, test_size=0.2)
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    result[col] = [clf.score(X_train, y_train), clf.score(X_test, y_test)]

In [44]:
keep = pd.DataFrame(result).T.sort_values(by=1)
keep

Unnamed: 0,0,1
Sex,1.0,0.724719
family_size,1.0,0.775281
has_cabin,1.0,0.775281
Parch,1.0,0.797753
Fare,1.0,0.808989
Pclass,1.0,0.814607
Ticket,1.0,0.814607
Age_p,1.0,0.814607
PassengerId,0.99859,0.825843
SibSp,1.0,0.825843


In [57]:
to_keep = keep[keep[1] < 0.8].index

In [58]:
to_keep

Index(['Sex', 'family_size', 'has_cabin', 'Parch'], dtype='object')

In [59]:
X_train, X_test, y_train, y_test = train_test_split(x[to_keep], y, test_size=0.2)
clf.fit(X_train, y_train)

RandomForestClassifier()

In [60]:
clf.score(X_test, y_test)

0.7865168539325843

In [62]:
! pip install lofo-importance

Collecting lofo-importance
  Downloading lofo_importance-0.2.6-py3-none-any.whl (10 kB)
Collecting lightgbm
  Downloading lightgbm-3.1.0-py2.py3-none-win_amd64.whl (751 kB)
Installing collected packages: lightgbm, lofo-importance
Successfully installed lightgbm-3.1.0 lofo-importance-0.2.6


In [85]:
from lofo.lofo_importance import LOFOImportance

In [65]:
x, y = processing_titanic(titanic)
x,y

(     PassengerId  Pclass  Name  Sex  Age  SibSp  Parch   Ticket     Fare  \
 0              1       3   108    1   22      1      0    21171   7.2500   
 1              2       1   190    0   38      1      0    17599  71.2833   
 2              3       3   353    0   26      0      0  3101282   7.9250   
 3              4       1   272    0   35      1      0   113803  53.1000   
 4              5       3    15    1   35      0      0   373450   8.0500   
 ..           ...     ...   ...  ...  ...    ...    ...      ...      ...   
 886          887       2   548    1   27      0      0   211536  13.0000   
 887          888       1   303    0   19      0      0   112053  30.0000   
 888          889       3   413    0   29      1      2     6607  23.4500   
 889          890       1    81    1   26      0      0   111369  30.0000   
 890          891       3   220    1   32      0      0   370376   7.7500   
 
      Embarked  family_size  is_alone  has_cabin  Age_p  
 0           2  

In [88]:
titanic.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [94]:
from sklearn.model_selection import KFold
%matplotlib inline

cv = KFold(n_splits=4, shuffle=False, random_state=0)

target = "Survived"

lofo_imp = LOFOImportance(X_train, x.columns, target, cv=cv, scoring="roc_auc")

TypeError: __init__() got multiple values for argument 'scoring'

In [92]:
imp = lofo.get_importance()

AttributeError: 'DataFrame' object has no attribute 'getX'

In [70]:
from lofo import plot_importance

In [72]:
plot_importance(imp, figsize=(12,12))

AttributeError: 'function' object has no attribute 'copy'