## Importing modules

In [61]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import  AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import  CountVectorizer
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

## Feature Engineering

You are tasked to predict whether a new cohort of loan applicants are likely to default on their loans. You have a historical dataset and wish to train a classifier on it. You notice that many features are in string format, which is a problem for your classifiers. You hence decide to encode the string columns numerically using LabelEncoder(). The function has been preloaded for you from the preprocessing submodule of sklearn. The dataset credit is also preloaded, as is a list of all column names whose data types are string, stored in non_numeric_columns.

In [2]:
# Load the historical loan data from the CSV file in the working directory
credit_csv_path = 'credit.csv'
credit = pd.read_csv(credit_csv_path)

In [7]:
# Preview the first five rows of the loaded data
credit.head(n=5)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,'<0',6,'critical/other existing credit',buy_radio_tv,1169,'no known savings','>=7',4,'male single',none,...,'real estate',67,none,own,2,skilled,1,yes,yes,good
1,'0<=X<200',48,'existing paid',buy_radio_tv,5951,'<100','1<=X<4',2,'female div/dep/mar',none,...,'real estate',22,none,own,1,skilled,1,none,yes,bad
2,'no checking',12,'critical/other existing credit',education,2096,'<100','4<=X<7',2,'male single',none,...,'real estate',49,none,own,1,'unskilled resident',2,none,yes,good
3,'<0',42,'existing paid',buy_furniture_equipment,7882,'<100','4<=X<7',2,'male single',guarantor,...,'life insurance',45,none,'for free',1,skilled,2,none,yes,good
4,'<0',24,'delayed previously',buy_new_car,4870,'<100','1<=X<4',3,'male single',none,...,'no known property',53,none,'for free',2,skilled,2,none,yes,bad


In [5]:
# Names of the string-typed columns that need a numeric encoding
non_numeric_columns = [
    'checking_status', 'credit_history', 'purpose',
    'savings_status', 'employment', 'personal_status',
    'other_parties', 'property_magnitude', 'other_payment_plans',
    'housing', 'job', 'own_telephone',
    'foreign_worker',
]

In [9]:
# Give every string column its own label encoder: learn the categories
# first, then replace the column with the integer codes.
for column in non_numeric_columns:
    le = LabelEncoder().fit(credit[column])
    credit[column] = le.transform(credit[column])

# Show the resulting dtype of each column in the data frame
print(credit.dtypes)

checking_status            int64
duration                   int64
credit_history             int64
purpose                    int64
credit_amount              int64
savings_status             int64
employment                 int64
installment_commitment     int64
personal_status            int64
other_parties              int64
residence_since            int64
property_magnitude         int64
age                        int64
other_payment_plans        int64
housing                    int64
existing_credits           int64
job                        int64
num_dependents             int64
own_telephone              int64
foreign_worker             int64
class                     object
dtype: object


In [33]:
# Target: the 'class' column
y = credit.loc[:, 'class']

# Features: the first 20 columns of the data frame, selected by position
n_feature_columns = 20
X = credit.iloc[:, :n_feature_columns]

In [35]:
# Preview the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,1,6,1,4,1169,4,3,4,3,2,4,2,67,1,1,2,3,1,1,1
1,0,48,3,4,5951,2,0,2,0,2,2,2,22,1,1,1,3,1,0,1
2,3,12,1,6,2096,2,1,2,3,2,3,2,49,1,1,1,2,2,0,1
3,1,42,3,2,7882,2,1,2,3,1,4,0,45,1,0,1,3,2,0,1
4,1,24,2,3,4870,2,0,3,3,2,4,1,53,1,0,2,3,2,0,1


## Your first pipeline

Your colleague has used AdaBoostClassifier for the credit scoring dataset. You also want to try out a random forest classifier. In this exercise, you will fit this classifier to the data and compare it to AdaBoostClassifier. Make sure to use train/test data splitting to avoid overfitting. The data is preloaded and transformed so that all features are numeric. The features are available as X and the labels as y. The module RandomForestClassifier has also been preloaded.

In [36]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split stable
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [47]:
# Create a random forest classifier, fixing the seed to 2, and fit it on
# the training split only
rf_model = RandomForestClassifier(random_state=2).fit(
  X_train, y_train)

# Use it to predict the labels of the held-out test data
rf_predictions = rf_model.predict(X_test)

# Assess the accuracy of both classifiers. The AdaBoost figure is the value
# reported by the colleague; the random forest figure is measured here.
accuracies = {'ab': 0.75}
accuracies['rf'] = accuracy_score(y_test, rf_predictions)

# The original cell went on to overwrite accuracies['rf'] with
# AdaBoostClassifier(y, rf_predictions), i.e. an unfitted estimator object
# built from nonsensical arguments. That assignment is removed so the
# dictionary holds the two accuracy scores being compared.
accuracies

## Model fitting and complexity

### Grid search CV for model complexity

Most classifiers have one or more hyperparameters that control their complexity. You have also learned to tune them using GridSearchCV(). In this exercise, you will perfect this skill. You will experiment with:

The number of trees, n_estimators, in a RandomForestClassifier.
The number of boosting rounds, n_estimators, in an AdaBoostClassifier (the code below tunes n_estimators rather than the max_depth of the underlying decision trees).
The number of nearest neighbors, n_neighbors, in KNeighborsClassifier

In [50]:
# First: define the parameter grid as described and create a grid search
# object around a random forest classifier.

# Set a range for n_estimators from 10 to 40 in steps of 10
param_grid = {'n_estimators': list(range(10, 41, 10))}

# Optimize for a RandomForestClassifier using GridSearchCV with 3-fold CV.
# The forest is seeded so that the selected value is reproducible when the
# notebook is re-run.
grid = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=3)
grid.fit(X, y)
grid.best_params_

{'n_estimators': 40}

In [51]:
# Second: adapt the grid search to an AdaBoostClassifier.
# NOTE: this cell tunes n_estimators (the number of boosting rounds); it
# does not tune the max_depth of the underlying decision trees.

# Define a grid for n_estimators ranging from 1 to 10
param_grid = {'n_estimators': list(range(1, 11))}

# Optimize for an AdaBoostClassifier using GridSearchCV with 3-fold CV.
# The estimator is seeded so that the selected value is reproducible.
grid = GridSearchCV(AdaBoostClassifier(random_state=1), param_grid, cv=3)
grid.fit(X, y)
grid.best_params_

{'n_estimators': 10}

In [54]:
# Third: tune the neighbourhood size of a k-nearest-neighbours classifier.
# Candidate values for n_neighbors are 10, 50 and 100.
param_grid = dict(n_neighbors=[10, 50, 100])

# Search the candidates with 3-fold cross-validation
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=3,
)
grid.fit(X, y)

# Best-scoring setting found by the search
grid.best_params_

{'n_neighbors': 50}

## Feature engineering and overfitting

### Categorical encodings

Your colleague has converted the columns in the credit dataset to numeric values using LabelEncoder(). He left one out: credit_history, which records the credit history of the applicant. You want to create two versions of the dataset. One will use LabelEncoder() and the other one-hot encoding, for comparison purposes. The feature matrix is available to you as credit. You have LabelEncoder() preloaded and pandas as pd.

In [56]:
# Create numeric encoding for credit_history
credit_history_num = LabelEncoder().fit_transform(
  credit['credit_history'])

# NOTE(review): 'credit_history' is listed in non_numeric_columns, so if the
# earlier encoding cell has run, X already contains a label-encoded copy of
# this column - TODO confirm whether X should exclude it here.

# Create a new feature matrix including the numeric encoding. The Series is
# given X's index so rows align, and a name so the new column is identifiable.
# `axis` is passed by keyword: pandas 2.0 rejects it as a positional argument.
X_num = pd.concat(
  [X, pd.Series(credit_history_num, index=X.index,
                name='credit_history_num')],
  axis=1)

# Create new feature matrix with dummies for credit_history. The prefix
# keeps the dummy column names as strings that cannot clash with X's columns.
X_hot = pd.concat(
  [X, pd.get_dummies(credit['credit_history'], prefix='credit_history')],
  axis=1)

# Compare the number of features of the resulting DataFrames
X_hot.shape[1] > X_num.shape[1]

True

### Feature transformation

You are discussing the credit dataset with the bank manager. She suggests that the safest loan applications tend to request mid-range credit amounts. Values that are either too low or too high suggest high risk. This means that a non-linear relationship might exist between this variable and the class. You want to test this hypothesis. You will construct a non-linear transformation of the feature. Then, you will compare its association with the class to the original feature. You will use the f_classif scoring function from the last lesson to measure association strength.

The data is available as a pandas DataFrame called credit, with the class contained in the column class. You have preloaded f_classif, pandas as pd and numpy as np

In [59]:
# Distance of every value from the mean of its column
def abs_diff(x):
    """Return the element-wise absolute deviation of ``x`` from its mean."""
    centre = np.mean(x)
    deviation = x - centre
    return np.abs(deviation)

# Apply it to the credit amount and store to new column
credit['credit_amount_diff'] = abs_diff(credit['credit_amount'])

# Score old and new versions of this feature with f_classif(); element 0 of
# its result is taken as the per-feature scores.
candidate_features = ['credit_amount', 'credit_amount_diff']
scores = f_classif(credit[candidate_features], credit['class'])[0]

# Drop whichever candidate actually scored lowest instead of hard-coding the
# column. `columns=` is used because pandas 2.0 no longer accepts the axis
# as a positional argument to DataFrame.drop.
lowest_scoring = candidate_features[int(np.argmin(scores))]
credit_new = credit.drop(columns=[lowest_scoring])

# Inspect the scores that drove the choice
dict(zip(candidate_features, scores))

### Bringing it all together

You just joined an arrhythmia detection startup and want to train a model on the arrhythmias dataset. You noticed that random forests tend to win quite a few Kaggle competitions, so you want to try that out with a maximum depth of 2, 5, or 10, using grid search. You also observe that the dimension of the dataset is quite high so you wish to consider the effect of a feature selection method.

To make sure you don't overfit by mistake, you have already split your data. You will use X_train and y_train for the grid search, and X_test and y_test to decide if feature selection helps. All four dataset folds are preloaded in your environment. You also have access to GridSearchCV(), train_test_split(), SelectKBest(), f_classif() and RandomForestClassifier.

In [65]:
# Find the best value for max_depth among values 2, 5 and 10, using the
# training split only
grid_search = GridSearchCV(RandomForestClassifier(random_state=1), param_grid={'max_depth': [2, 5, 10]})
best_value = grid_search.fit(X_train, y_train).best_params_['max_depth']

# Using the best value from above, fit a random forest on all features
clf = RandomForestClassifier(random_state=1, max_depth=best_value).fit(X_train, y_train)

# Apply SelectKBest with f_classif and pick the top 100 features. The count
# is capped at the number of available columns so the cell also works on
# data with fewer than 100 features (where it keeps all of them).
n_selected = min(100, X_train.shape[1])
vt = SelectKBest(f_classif, k=n_selected).fit(X_train, y_train)

# Refit the classifier using best_value on the reduced data
clf_vt = RandomForestClassifier(random_state=1, max_depth=best_value).fit(vt.transform(X_train), y_train)

# Use the held-out test split to decide whether feature selection helps
selection_accuracies = {
    'all_features': accuracy_score(y_test, clf.predict(X_test)),
    'selected_features': accuracy_score(
        y_test, clf_vt.predict(vt.transform(X_test))),
}
selection_accuracies