In [1]:
# NOTE(review): %pylab populates the interactive namespace from numpy and matplotlib
# (see the kernel message printed below); the bare plot()/arange() call in the next cell relies on it.
%pylab

Using matplotlib backend: Qt4Agg
Populating the interactive namespace from numpy and matplotlib


In [None]:
# Quick plotting check: plots the values 0..4.
# plot and arange are bare names supplied by the %pylab magic, not by an explicit import.
plot(arange(5))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
try:
    from sklearn.cross_validation import KFold   #For K-fold cross validation
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20; KFold now lives in model_selection.
    from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics

In [None]:
# Read the training dataset into a pandas DataFrame.
# NOTE(review): this is an absolute UNC path to a personal network share, so the notebook
# only runs where that share is reachable - consider a configurable data directory.
df = pd.read_csv("//OHFS07/Home/dbasak/My Documents/Analytics Vidhya/Loans Data/train.csv")

In [None]:
# Preview the first 10 rows.
df.head(10)

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# Frequency of each Property_Area value.
df['Property_Area'].value_counts()

In [None]:
# Histogram of ApplicantIncome using 50 bins.
df['ApplicantIncome'].hist(bins=50)

In [None]:
# Box plots of ApplicantIncome, one per Education value.
df.boxplot(column='ApplicantIncome', by='Education')

In [None]:
# Histogram of LoanAmount using 50 bins.
df['LoanAmount'].hist(bins=50)

In [None]:
# Box plot of LoanAmount.
df.boxplot(column='LoanAmount')

In [None]:
# NOTE(review): stray assignment - this name is not used by any other visible cell.
# It looks like a leftover boxplot keyword argument; confirm and remove.
return_type='dict'

In [None]:
# Number of applicants per Credit_History value, smallest count first.
temp1 = df['Credit_History'].value_counts(ascending=True)

In [None]:
temp1

In [None]:
# Share of approved loans per Credit_History value:
# Loan_Status is mapped to Y -> 1 and N -> 0, then averaged within each group.
temp2 = df.pivot_table(values='Loan_Status', index=['Credit_History'],aggfunc=lambda x: x.map({'Y':1,'N':0}).mean())

In [None]:
# Print both tables with a heading each.
print ('Frequency Table for Credit History:')
print (temp1)
print ('\nProbability of getting loan for each credit class:')
print (temp2)

In [None]:
# Side-by-side bar charts for each Credit_History value:
# applicant counts on the left, share of approved loans on the right.
fig = plt.figure(figsize=(8,4))

ax1 = fig.add_subplot(121)
# Pass the target axes explicitly instead of relying on matplotlib's "current axes".
temp1.plot(kind='bar', ax=ax1)
# Label after plotting so the plot call cannot overwrite the axis labels.
ax1.set_xlabel('Credit_History')
ax1.set_ylabel('Count of Applicants')
ax1.set_title("Applicants by Credit_History")


ax2 = fig.add_subplot(122)
# temp2 comes from pivot_table; without ax= a DataFrame plot is drawn on a brand-new figure,
# which would leave this subplot empty.
temp2.plot(kind='bar', ax=ax2)
ax2.set_xlabel('Credit_History')
ax2.set_ylabel('Probability of getting loan')
ax2.set_title("Probability of getting loan by credit history")

In [None]:
# Cross-tabulate Credit_History against Loan_Status and draw the counts as a stacked bar chart.
temp3 = pd.crosstab(df['Credit_History'], df['Loan_Status'])
temp3.plot(kind='bar', stacked=True, color=['red','blue'], grid=False)

In [None]:
#Data Munging - Check for missing values
#Number of missing entries per column, computed with the vectorised DataFrame methods
#rather than a Python-level sum() over every column.
df.isnull().sum()

In [None]:
#Imputing missing values for Self_Employed
#Key hypothesis: Education and Self_Employed together can give a good estimate of the loan amount.

#First let's look at the distribution of LoanAmount by Education and Self_Employed
df.boxplot(column='LoanAmount', by=['Education','Self_Employed'])


In [None]:
#The median varies across the 4 groups, so it can be used to impute missing LoanAmount values.
#However, as seen earlier, Self_Employed itself has missing values and has to be imputed first.

#Frequency table of Self_Employed
df['Self_Employed'].value_counts()

In [None]:
#Since ~86% values are "No", it is safe to impute the missing values as "No" as there is a high probability of success.
#Assign the filled column back to df: fillna(inplace=True) on a selected column acts on an
#intermediate object and is not guaranteed to update df (it does not under pandas Copy-on-Write).
df['Self_Employed'] = df['Self_Employed'].fillna('No')

In [None]:
#Pivot table holding the median LoanAmount for every (Self_Employed, Education) combination.
#The string 'median' selects pandas' own NaN-skipping group median; the groups still contain
#the missing LoanAmount values that are about to be imputed.
table = df.pivot_table(values='LoanAmount', index='Self_Employed', columns='Education', aggfunc='median')

#Lookup helper used row by row on the records that need imputing

def fage(x):
    """Return the median LoanAmount of the (Self_Employed, Education) group that row ``x`` belongs to."""
    return table.loc[x['Self_Employed'],x['Education']]

#Replace missing values.
#Write through df.loc so the DataFrame itself is updated (a chained fillna(inplace=True) may only
#touch a temporary object), and skip the step when nothing is missing, because apply() on an
#empty selection does not return a column of values.

missing_loan = df['LoanAmount'].isnull()
if missing_loan.any():
    df.loc[missing_loan, 'LoanAmount'] = df[missing_loan].apply(fage, axis=1)

In [None]:
#How to treat outliers in the distributions of LoanAmount and ApplicantIncome:
#the extreme values are practically possible, i.e. some people might apply for high-value loans due to specific needs.
#So instead of treating them as outliers, apply a log transformation to dampen their effect.

df['LoanAmount_log'] = np.log(df['LoanAmount'])
# Histogram of the log-transformed loan amount using 20 bins.
df['LoanAmount_log'].hist(bins=20)

In [None]:
#Coming to ApplicantIncome: one intuition is that some applicants have a lower income but strong co-applicant support.
#So it might be a good idea to combine both incomes into a total income and take a log transformation of it.

df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df['TotalIncome_log'] = np.log(df['TotalIncome'])
#Show the distribution of the feature that was just created (TotalIncome_log, not LoanAmount_log).
df['TotalIncome_log'].hist(bins=20)

In [None]:
#Building the predictive model
#Convert the categorical columns listed in var_mod into integer codes

var_mod = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']
# One encoder object is refit for every column, so after the loop it only holds the
# classes of the last column processed.
le = LabelEncoder()
for i in var_mod:
    # NOTE(review): astype('str') also stringifies any remaining missing values, which would
    # then be encoded as a category of their own - TODO confirm this is intended.
    df[i] = le.fit_transform(df[i].astype('str'))
# Show the resulting column dtypes.
df.dtypes 

In [None]:
#Generic function for making a classification model and assessing its performance:

def _kfold_indices(n_samples, n_folds):
    """Yield (train, test) positional index arrays for an unshuffled K-fold split."""
    try:
        # scikit-learn >= 0.18
        from sklearn.model_selection import KFold
    except ImportError:
        # Legacy API: the KFold object is itself iterable over (train, test) pairs.
        from sklearn.cross_validation import KFold
        return KFold(n_samples, n_folds=n_folds)
    return KFold(n_splits=n_folds).split(np.arange(n_samples))


def classification_model(model, data, predictors, outcome, n_folds=5):
    """Fit ``model`` and print its training accuracy and K-fold cross-validation score.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)``, ``predict(X)`` and ``score(X, y)``.
    data : DataFrame containing the predictor and outcome columns.
    predictors : list of column names used as features.
    outcome : name of the target column.
    n_folds : number of cross-validation folds (default 5).

    Returns
    -------
    None. The two scores are printed. On return ``model`` has been refit on all
    of ``data`` so it can be used outside the function.
    """
    features = data[predictors]
    target = data[outcome]

    #Fit the model on the full data and measure accuracy on that same (training) data:
    model.fit(features, target)
    predictions = model.predict(features)

    #accuracy_score expects (y_true, y_pred)
    accuracy = metrics.accuracy_score(target, predictions)
    print ("Accuracy : %s" %  "{0:.3%}".format(accuracy))

    #K-fold cross-validation: refit on each training split, score on the held-out split
    fold_scores = []
    for train, test in _kfold_indices(data.shape[0], n_folds):
        model.fit(features.iloc[train,:], target.iloc[train])
        fold_scores.append(model.score(features.iloc[test,:], target.iloc[test]))

    print ("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(fold_scores)))

    #The last CV fit only saw part of the data, so fit once more on everything
    #so that the model can be referred to outside the function:
    model.fit(features, target)

In [1]:
# Logistic regression model:

# Target column name; the disabled call below would pass it as `outcome`.
outcome_var = 'Loan_Status'
# NOTE(review): the model fit below is commented out, so this cell currently only defines outcome_var.
#model = LogisticRegression()
#predictor_var = ['Credit_History']
#classification_model(model, df,predictor_var,outcome_var)