In [1]:
#Import all the needed libraries and the training data from a csv file

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import roc_curve, auc

credit_rf  = pd.read_csv('cs-training.csv')

In [2]:
#Get top 5 rows from the dataframe
credit_rf.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120,13,0,6,0,2
1,2,0,0.957151,40,0,0.121876,2600,4,0,0,0,1
2,3,0,0.65818,38,1,0.085113,3042,2,1,0,0,0
3,4,0,0.23381,30,0,0.03605,3300,5,0,0,0,0
4,5,0,0.907239,49,1,0.024926,63588,7,0,1,0,0


In [3]:
#Get botton 5 rows from the dataframe
credit_rf.tail()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
149995,149996,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0
149996,149997,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2
149997,149998,0,0.246044,58,0,3870.0,,18,0,1,0,0
149998,149999,0,0.0,30,0,0.0,5716.0,4,0,0,0,0
149999,150000,0,0.850283,64,0,0.249908,8158.0,8,0,2,0,0


In [4]:
#Remove unnamed column
credit_rf = credit_rf.drop('Unnamed: 0', 1)

In [5]:
#Import library to get correlatio
import statsmodels.formula.api as smf

In [7]:
#Get correlation information from the data frame
credit_rf.corr()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
SeriousDlqin2yrs,1.0,-0.001802,-0.115386,0.125587,-0.007602,-0.019746,-0.029669,0.117175,-0.007038,0.102261,0.046048
RevolvingUtilizationOfUnsecuredLines,-0.001802,1.0,-0.005898,-0.001314,0.003961,0.007124,-0.011281,-0.001061,0.006235,-0.001048,0.001557
age,-0.115386,-0.005898,1.0,-0.062995,0.024188,0.037717,0.147705,-0.061005,0.03315,-0.057159,-0.213303
NumberOfTime30-59DaysPastDueNotWorse,0.125587,-0.001314,-0.062995,1.0,-0.006542,-0.010217,-0.055312,0.983603,-0.030565,0.987005,-0.00268
DebtRatio,-0.007602,0.003961,0.024188,-0.006542,1.0,-0.028712,0.049565,-0.00832,0.120046,-0.007533,-0.040673
MonthlyIncome,-0.019746,0.007124,0.037717,-0.010217,-0.028712,1.0,0.091455,-0.012743,0.124959,-0.011116,0.062647
NumberOfOpenCreditLinesAndLoans,-0.029669,-0.011281,0.147705,-0.055312,0.049565,0.091455,1.0,-0.079984,0.433959,-0.071077,0.065322
NumberOfTimes90DaysLate,0.117175,-0.001061,-0.061005,0.983603,-0.00832,-0.012743,-0.079984,1.0,-0.045205,0.992796,-0.010176
NumberRealEstateLoansOrLines,-0.007038,0.006235,0.03315,-0.030565,0.120046,0.124959,0.433959,-0.045205,1.0,-0.039722,0.124684
NumberOfTime60-89DaysPastDueNotWorse,0.102261,-0.001048,-0.057159,0.987005,-0.007533,-0.011116,-0.071077,0.992796,-0.039722,1.0,-0.010922


In [8]:
#Get the number of nulls per column
credit_rf.isnull().sum() 

SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [9]:
#Fill the null values with the mean of the column
credit_rf.MonthlyIncome.fillna(credit_rf.MonthlyIncome.mean(), inplace=True)    

In [10]:
#Fill the null values with the mean of the column
credit_rf.NumberOfDependents.fillna(credit_rf.NumberOfDependents.mean(), inplace=True)  

In [11]:
credit_rf.isnull().sum() 

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

In [12]:
#Split data into train and validate
credit_rf['is_train'] = np.random.uniform(0, 1, len(credit_rf)) <= .75
credit_rf, validate = credit_rf[credit_rf['is_train']==True], credit_rf[credit_rf['is_train']==False]

In [13]:
#Get the top 10 rows from the validate data
validate.head(10)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,is_train
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0,False
8,0,0.116951,27,0,46.0,6670.221237,2,0,0,0,0.757222,False
10,0,0.644226,30,0,0.309476,2500.0,5,0,0,0,0.0,False
13,1,0.964673,40,3,0.382965,13700.0,9,3,1,1,2.0,False
15,0,0.548458,64,0,0.209892,11362.0,7,0,1,0,2.0,False
22,0,1.0,39,0,0.241104,2500.0,4,0,0,0,0.0,False
28,0,0.452516,24,0,0.011761,3400.0,1,0,0,0,0.0,False
31,0,1.0,24,0,0.472703,750.0,1,0,0,0,0.0,False
33,0,0.186869,57,0,0.313812,7000.0,9,0,2,0,0.0,False
34,0,0.69333,42,2,0.257732,2230.0,7,0,0,0,0.0,False


In [14]:
#Get the frequency of SeriousDlqin2yrs from the validate data
validate.SeriousDlqin2yrs.value_counts()


0    34991
1     2492
Name: SeriousDlqin2yrs, dtype: int64

In [15]:
#Assigning the feature columns for the model
feature_cols =  ['RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse','DebtRatio', 'MonthlyIncome','NumberOfOpenCreditLinesAndLoans','NumberOfTimes90DaysLate','NumberRealEstateLoansOrLines','NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']

In [16]:
#Running Logistic Regression
x_train = credit_rf[feature_cols]
y_train = credit_rf.SeriousDlqin2yrs
lg = LogisticRegression()
lg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
x_train.head(10)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0.766127,45,2,0.802982,9120.0,13,0,6,0,2
1,0.957151,40,0,0.121876,2600.0,4,0,0,0,1
3,0.23381,30,0,0.03605,3300.0,5,0,0,0,0
4,0.907239,49,1,0.024926,63588.0,7,0,1,0,0
5,0.213179,74,0,0.375607,3500.0,3,0,1,0,1
6,0.305682,57,0,5710.0,6670.221237,8,0,3,0,0
7,0.754464,39,0,0.20994,3500.0,8,0,0,0,0
9,0.189169,57,0,0.606291,23684.0,9,0,4,0,2
11,0.018798,51,0,0.531529,6501.0,7,0,2,0,2
12,0.010352,46,0,0.298354,12454.0,13,0,2,0,2


In [18]:
y_train.head(10)

0     1
1     0
3     0
4     0
5     0
6     0
7     0
9     0
11    0
12    0
Name: SeriousDlqin2yrs, dtype: int64

In [19]:
x_validate = validate[feature_cols]
y_validate = validate.SeriousDlqin2yrs

In [20]:
Disbursed_lg=lg.predict_proba(x_validate)

In [21]:
#Printing accuracy for the model
fpr, tpr, _ = roc_curve(y_validate, Disbursed_lg[:,1])
roc_auc = auc(fpr, tpr)
print roc_auc

0.699618843745


In [22]:
#Running Random Forest on train data
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
disbursed = rf.predict_proba(x_validate)
fpr, tpr, _ = roc_curve(y_validate, disbursed[:,1])
roc_auc = auc(fpr, tpr)
print roc_auc

0.783534827094


In [23]:
#Running random forest with different parameters
rfo = RandomForestClassifier(n_estimators=1000, 
                             max_features='auto',
                             oob_score=True,
                             random_state=1)
rfo.fit(x_train, y_train)
disbursed = rfo.predict_proba(x_validate)
fpr, tpr, _ = roc_curve(y_validate, disbursed[:,1])
roc_auc = auc(fpr, tpr)
print roc_auc

0.853211027481


In [24]:
# compute the feature importances
pd.DataFrame({'feature':feature_cols, 'importance':rf.feature_importances_})

Unnamed: 0,feature,importance
0,RevolvingUtilizationOfUnsecuredLines,0.191921
1,age,0.126553
2,NumberOfTime30-59DaysPastDueNotWorse,0.057723
3,DebtRatio,0.180117
4,MonthlyIncome,0.146129
5,NumberOfOpenCreditLinesAndLoans,0.08857
6,NumberOfTimes90DaysLate,0.092629
7,NumberRealEstateLoansOrLines,0.033556
8,NumberOfTime60-89DaysPastDueNotWorse,0.037651
9,NumberOfDependents,0.045152


In [26]:
# compute the out-of-bag classification accuracy
rfo.oob_score_

0.93548530444288414

In [27]:
disbursed

array([[ 0.648,  0.352],
       [ 0.97 ,  0.03 ],
       [ 0.985,  0.015],
       ..., 
       [ 0.93 ,  0.07 ],
       [ 0.969,  0.031],
       [ 0.593,  0.407]])

In [28]:
#Saving the predict values into "Output"
output = rf.predict(x_validate)

In [29]:
output

array([0, 0, 0, ..., 0, 0, 0])

In [31]:
#Getting test data from csv file
credit_test  = pd.read_csv('cs-test.csv')

In [32]:
x_test = credit_test[feature_cols]


In [33]:
credit_test.head(10)

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,,0.885519,43,0,0.177513,5700.0,4,0,0,0,0
1,2,,0.463295,57,0,0.527237,9141.0,15,0,4,0,2
2,3,,0.043275,59,0,0.687648,5083.0,12,0,1,0,2
3,4,,0.280308,38,1,0.925961,3200.0,7,0,2,0,0
4,5,,1.0,27,0,0.019917,3865.0,4,0,0,0,1
5,6,,0.509791,63,0,0.342429,4140.0,4,0,0,0,1
6,7,,0.587778,50,0,1048.0,0.0,5,0,0,0,3
7,8,,0.046149,79,1,0.36917,3301.0,8,0,1,0,1
8,9,,0.013527,68,0,2024.0,,4,0,1,0,0
9,10,,1.0,23,98,0.0,0.0,0,98,0,98,0


In [34]:
x_test.head(10)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0.885519,43,0,0.177513,5700.0,4,0,0,0,0
1,0.463295,57,0,0.527237,9141.0,15,0,4,0,2
2,0.043275,59,0,0.687648,5083.0,12,0,1,0,2
3,0.280308,38,1,0.925961,3200.0,7,0,2,0,0
4,1.0,27,0,0.019917,3865.0,4,0,0,0,1
5,0.509791,63,0,0.342429,4140.0,4,0,0,0,1
6,0.587778,50,0,1048.0,0.0,5,0,0,0,3
7,0.046149,79,1,0.36917,3301.0,8,0,1,0,1
8,0.013527,68,0,2024.0,,4,0,1,0,0
9,1.0,23,98,0.0,0.0,0,98,0,98,0


In [35]:
credit_test.isnull().sum() 

Unnamed: 0                                   0
SeriousDlqin2yrs                        101503
RevolvingUtilizationOfUnsecuredLines         0
age                                          0
NumberOfTime30-59DaysPastDueNotWorse         0
DebtRatio                                    0
MonthlyIncome                            20103
NumberOfOpenCreditLinesAndLoans              0
NumberOfTimes90DaysLate                      0
NumberRealEstateLoansOrLines                 0
NumberOfTime60-89DaysPastDueNotWorse         0
NumberOfDependents                        2626
dtype: int64

Filling null values with the mean value of its respective column

In [36]:
credit_test.MonthlyIncome.fillna(credit_test.MonthlyIncome.mean(), inplace=True)    

In [37]:
credit_test.NumberOfDependents.fillna(credit_test.NumberOfDependents.mean(), inplace=True)    

In [38]:
x_test = credit_test[feature_cols]

In [40]:
#Saving into Output_test the predicted values for test data 
output_test = rf.predict(x_test)

In [41]:
output_test

array([0, 0, 0, ..., 0, 0, 0])

In [42]:
print output_test

[0 0 0 ..., 0 0 0]


In [43]:
#Saving probabilities into Pro
Prob_Test = rf.predict_proba(x_test)

In [44]:
print Prob_Test

[[ 1.   0. ]
 [ 1.   0. ]
 [ 1.   0. ]
 ..., 
 [ 1.   0. ]
 [ 0.9  0.1]
 [ 1.   0. ]]


In [None]:
#Saving the Prob_Test data into a dataframe - Probability1 = 0 and Probabillity2 = 1
df = pd.DataFrame(Prob_Test, columns = ['Probability1','Probability2'])


In [46]:
df

Unnamed: 0,Probability1,Probability2
0,1.00,0.00
1,1.00,0.00
2,1.00,0.00
3,1.00,0.00
4,0.80,0.20
5,0.90,0.10
6,0.80,0.20
7,0.90,0.10
8,1.00,0.00
9,0.77,0.23


In [None]:
#Saving data into csv file
df.to_csv('Output_test.csv')

##Trying grid search with more parameters

In [None]:
from __future__ import print_function

from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

print(__doc__)

#Getting X features and Y target feature

#n_samples = len(digits.images)
X = credit_rf[feature_cols]
y = credit_rf.SeriousDlqin2yrs

# Split the dataset in two equal parts
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_weighted' % score)
    clf.fit(X_train1, y_train1)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test1, clf.predict(X_test1)
    print(classification_report(y_true, y_pred))
    print()

Automatically created module for IPython interactive environment
# Tuning hyper-parameters for precision