In [12]:
# Data handling
import numpy as np
import pandas as pd

# For visualization
import altair as alt

# Feature Selection
from sklearn.feature_selection import RFE

# Models
from sklearn import tree 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Split
from sklearn.model_selection import train_test_split

# Data

In [29]:
# Training data
train = pd.read_csv("../data/processed_data/home_train.csv") 
X = train.drop(columns = ['CLAIM', 'Unnamed: 0'])
y = train['CLAIM']

# Splitting the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=1234)

In [30]:
print(f"X train shape: {X_train.shape}")
print(f"X valid shape: {X_valid.shape}")
print(f"y train shape: {y_train.shape}")
print(f"y valid shape: {y_valid.shape}")

X train shape: (74009, 5)
X valid shape: (18503, 5)
y train shape: (74009,)
y valid shape: (18503,)


# Feature selection

In this section I will select the features

In [31]:
def fit_and_report(model, X, y, Xv, yv, mode = 'regression'):
    """
    Fits a given model and calculates its score in the training and the validation set.
    ----------------------------------------------------
    
    Paramaters:
    ----------------------------------------------------
    model: model to fit
    X: Training X matrix
    y: Training response vector
    Xv: Validation X matrix
    yv: Validation response vector
    mode: Type of estimation classification and
    
    Returns:
    ---------------------------------------------------
    errors: list with the training and validation error
    
    Example:
    --------------------------------------------------
    fit_and_report(LogisticRegression(), X, y, Xv, yv, mode = 'classification')
    """
    model.fit(X, y)
    if mode.lower().startswith('regress'):
        errors = [mean_squared_error(y, model.predict(X)), mean_squared_error(yv, model.predict(Xv))]
    if mode.lower().startswith('classif'):
        errors = [1 - model.score(X,y), 1 - model.score(Xv,yv)]        
    return errors

In [37]:
n_features = 3
results = {'Stage':[],
           'N_features':[],
           'Score':[]}

for i in range(1, n_features + 1):
    
    lr = LogisticRegression()
    rfe = RFE(estimator = lr, n_features_to_select = i)
    rfe.fit(X_train, y_train)
    
    selected_features = rfe.support_
    print(selected_features)
    scores = fit_and_report(lr, 
                            X_train[selected_features], 
                            y_train, 
                            X_valid[selected_features], 
                            y_valid,
                            mode="classification")
    
    results['Stage'].append('Train')
    results['N_features'].append(i)
    results['Score'].append(scores[0])
    results['Stage'].append('Validation')
    results['N_features'].append(i)
    results['Score'].append(scores[1])

[ True False False False False]


ValueError: Item wrong length 5 instead of 74009.

In [36]:
X_train

Unnamed: 0,SUM_INSURED_BUILDINGS,SUM_INSURED_CONTENTS,BUS_USE_Y,CLERICAL_N,CLERICAL_Y
51258,0.586421,-4.618236,0.0,0.0,0.0
37319,0.586421,0.197910,0.0,0.0,0.0
90024,0.586421,0.197910,0.0,0.0,0.0
90976,-1.705260,0.197910,0.0,0.0,0.0
40715,0.586421,0.197910,0.0,0.0,0.0
...,...,...,...,...,...
82584,0.586421,-4.618236,0.0,0.0,0.0
89460,0.586421,0.197910,0.0,0.0,0.0
60620,0.586421,0.197910,0.0,0.0,0.0
34086,0.586421,0.197910,1.0,0.0,1.0
