# **Overfit and cross validation**

In [417]:
import pandas as pd
import numpy as np
import itertools 
from multiprocessing.dummy import Pool
import sklearn
import matplotlib.pyplot as plt

- In this class we are going to use a credit default data base, to try to predict if a person is going to enter in default, given the person caracteristics. You can find the databes [here](https://www.kaggle.com/c/credit-default-prediction-ai-big-data/rules)

In [428]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [429]:
test.isna().sum()

Id                                 0
Home Ownership                     0
Annual Income                    513
Years in current job              86
Tax Liens                          0
Number of Open Accounts            0
Years of Credit History            0
Maximum Open Credit                0
Number of Credit Problems          0
Months since last delinquent    1358
Bankruptcies                       3
Purpose                            0
Term                               0
Current Loan Amount                0
Current Credit Balance             0
Monthly Debt                       0
Credit Score                     513
dtype: int64

In [430]:
train.isna().sum()

Id                                 0
Home Ownership                     0
Annual Income                   1557
Years in current job             371
Tax Liens                          0
Number of Open Accounts            0
Years of Credit History            0
Maximum Open Credit                0
Number of Credit Problems          0
Months since last delinquent    4081
Bankruptcies                      14
Purpose                            0
Term                               0
Current Loan Amount                0
Current Credit Balance             0
Monthly Debt                       0
Credit Score                    1557
Credit Default                     0
dtype: int64

In [431]:
la=[train,test]
for i in la:
    i['Years in current job']=i['Years in current job'].replace(np.nan,'mis')
    i['Annual Income']=i['Annual Income'].replace(np.nan,0)
    i['Credit Score']=i['Credit Score'].replace(np.nan,0)
  
    

In [432]:
train=train.drop(columns=['Months since last delinquent'])
train=train[train['Bankruptcies'].isnull()==False]
test=test.drop(columns=['Months since last delinquent'])
test=test[test['Bankruptcies'].isnull()==False]

In [436]:
train.columns

Index(['Id', 'Home Ownership', 'Annual Income', 'Years in current job',
       'Tax Liens', 'Number of Open Accounts', 'Years of Credit History',
       'Maximum Open Credit', 'Number of Credit Problems', 'Bankruptcies',
       'Purpose', 'Term', 'Current Loan Amount', 'Current Credit Balance',
       'Monthly Debt', 'Credit Score', 'Credit Default'],
      dtype='object')

In [437]:
dummies= pd.get_dummies(train[['Home Ownership', 'Years in current job','Bankruptcies','Number of Credit Problems','Purpose','Term','Credit Default','Number of Open Accounts']])
train=train.drop(columns=['Home Ownership', 'Years in current job','Bankruptcies','Number of Credit Problems','Purpose','Term','Credit Default','Number of Open Accounts'])
train = pd.concat([train, dummies], axis=1)

In [438]:
def dummies(categoricas,dataframe):
    dummies= pd.get_dummies(dataframe[categoricas])
    dataframe=dataframe.drop(columns=categoricas)
    dataframe = pd.concat([dataframe, dummies], axis=1)
    return dataframe
    

In [439]:
test=dummies(['Home Ownership', 'Years in current job','Bankruptcies','Number of Credit Problems','Purpose','Term','Number of Open Accounts'],test)

## **70%-30% validation approach**

- Sklearn logistic [regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

-[Accuracy metric](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score)

In [441]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [442]:
x_train, x_test, y_train, y_test = train_test_split(train.drop(columns=['Credit Default']),train['Credit Default'], test_size=0.30,
                                                    random_state=200,
                                                    shuffle=True)

In [444]:
clf = LogisticRegression(max_iter=20000).fit(x_train, y_train)

In [147]:
clf.predict_proba(x_test)

array([[0.51162559, 0.48837441],
       [0.76363346, 0.23636654],
       [0.62475843, 0.37524157],
       ...,
       [0.81345159, 0.18654841],
       [0.76360154, 0.23639846],
       [0.65627916, 0.34372084]])

In [445]:
y_pred=clf.predict(x_test)

In [446]:
accuracy_score(y_test, y_pred)

0.7600178094390027

## **Cross Validation approach**

- Supouse we only had the following feautures **('Annual Income', 'Tax Liens', 'Years of Credit History','Maximum Open Credit', 'Current Loan Amount', 'Current Credit Balance','Monthly Debt', 'Credit Score', 'Bankruptcies')**, and we wanted to know which linear combination of this feautures gets the best out of sample error.

In [449]:
train2=train[['Annual Income', 'Tax Liens', 'Years of Credit History',
       'Maximum Open Credit', 'Current Loan Amount', 'Current Credit Balance',
       'Monthly Debt', 'Credit Score', 'Bankruptcies','Credit Default']]

In [237]:
lista=['Annual Income', 'Tax Liens', 'Years of Credit History','Maximum Open Credit', 'Current Loan Amount', 'Current Credit Balance','Monthly Debt', 'Credit Score', 'Bankruptcies']

In [492]:
sublists=[]
for L in range(0, len(lista)+1):
    for sublist in itertools.combinations(lista, L):
        sublists.append(list(sublist))
        
        

In [454]:
la=int(train2.shape[0]/5)
a=0
cortes=[]
for i in range(0,k):
    y=[]
    y.append(a)
    a+=la
    y.append(a)
    cortes.append(y)

In [None]:
!ipynb-py-convert function.ipynb function1.py
from function1 import fit_model_cross_approach

In [495]:
def fit_model_cross_approach(la):
    global sublists,train2 
    k=5
    x=sublists[la]
    dataframe=train2
    dataframe = dataframe.sample(frac=1).reset_index(drop=True)
    la=int(dataframe.shape[0]/k)
    a=0
    cortes=[]
    for i in range(0,k):
        y=[]
        y.append(a)
        a+=la
        y.append(a)
        cortes.append(y)
    metrics=[]
    for i in cortes:
        test=dataframe[i[0]:i[1]]
        train=dataframe.drop(index=list(range(i[0],i[1])))
        clf = LogisticRegression(max_iter=20000).fit(train[x], train['Credit Default'])
        y_pred=clf.predict(test[x])
        metrics.append(accuracy_score(test['Credit Default'], y_pred))
    return sum(metrics)/k


In [488]:
errores=[]

In [496]:
fit_model_cross_approach(1)

0.7183700734802938

- Let´s run the function for all the positions of the list sublists anda appende the k-fold error un the list errores

In [497]:

for i in range(1,len(sublists)):
    errores.append(fit_model_cross_approach(i))

    
    

In [501]:
max_value = max(errores)
al=errores.index(max_value)



-The model that give us the best out of sample prediction is:
    

In [502]:
sublists[al-1]

['Tax Liens', 'Years of Credit History', 'Current Credit Balance']

- **Overfitting and Lasso**

- common notation l1=Lasso l2=ridge

In [476]:
from sklearn.linear_model import SGDClassifier
sgd_clf = LogisticRegression(max_iter=200000,penalty='l1', solver='liblinear',tol=0.3,random_state = 100)

In [477]:
clf1 =sgd_clf.fit(x_train, y_train)

In [478]:
y_pred1=clf1.predict(x_test)

In [479]:
accuracy_score(y_test, y_pred1)

0.7662511130899377

In [None]:
sklearn.metrics.SCORERS.keys()
LogisticRegression(max_iter=200000).get_params().keys()

In [482]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [483]:
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)
hyper_params = [{'tol': [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]}]
sgd_clf = LogisticRegression(max_iter=200000,penalty='l1',solver='liblinear',random_state = 1005)

            
model_cv = GridSearchCV(estimator = sgd_clf, 
                        param_grid = hyper_params, 
                        scoring= 'accuracy', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=True
                       ,n_jobs=10)      
bla=model_cv.fit(x_train, y_train)  

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  45 out of  45 | elapsed:    2.1s finished


In [484]:
model_cv.best_estimator_

LogisticRegression(max_iter=200000, penalty='l1', random_state=1005,
                   solver='liblinear', tol=0.2)

In [485]:
y_pred2=bla.predict(x_test)
accuracy_score(y_test, y_pred2)

0.767586821015138

In [487]:
pd.DataFrame(model_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.01516,0.002632,0.0,0.0,0.0,{'tol': 0.0},,,,,...,,,9,,,,,,,
1,0.020545,0.000488,0.01097,0.001091,0.1,{'tol': 0.1},0.780534,0.765267,0.789122,0.770038,...,0.777099,0.008462,2,0.77791,0.779819,0.775286,0.779819,0.777672,0.778101,0.001676
2,0.018772,0.00223,0.012141,0.000771,0.2,{'tol': 0.2},0.783397,0.76813,0.788168,0.770038,...,0.778053,0.007743,1,0.777433,0.781727,0.774332,0.78125,0.778626,0.778674,0.002696
3,0.018151,0.001164,0.005984,0.00309,0.3,{'tol': 0.3},0.785305,0.767176,0.783397,0.771947,...,0.776718,0.006828,3,0.775763,0.780773,0.773616,0.778387,0.777195,0.777147,0.002411
4,0.021744,0.005896,0.006182,0.00391,0.4,{'tol': 0.4},0.785305,0.767176,0.783397,0.771947,...,0.776718,0.006828,3,0.775763,0.780773,0.773616,0.778387,0.777195,0.777147,0.002411
5,0.020742,0.003179,0.004788,0.000747,0.5,{'tol': 0.5},0.785305,0.767176,0.783397,0.771947,...,0.776718,0.006828,3,0.775763,0.780773,0.773616,0.778387,0.777195,0.777147,0.002411
6,0.019148,0.001164,0.005984,0.002601,0.6,{'tol': 0.6},0.785305,0.767176,0.783397,0.771947,...,0.776718,0.006828,3,0.775763,0.780773,0.773616,0.778387,0.777195,0.777147,0.002411
7,0.018151,0.000746,0.004014,0.00067,0.7,{'tol': 0.7},0.785305,0.767176,0.783397,0.771947,...,0.776718,0.006828,3,0.775763,0.780773,0.773616,0.778387,0.777195,0.777147,0.002411
8,0.016853,0.002676,0.002792,0.000977,0.8,{'tol': 0.8},0.785305,0.767176,0.783397,0.771947,...,0.776718,0.006828,3,0.775763,0.780773,0.773616,0.778387,0.777195,0.777147,0.002411
