In [3]:
path="/content/drive/My Drive/Colab Notebooks/"


In [None]:
# coding: utf-8

# # Home Credit Default Risk

# ## Predicting how capable each applicant is of repaying a loan?

# ![home%20credit.jpg](attachment:home%20credit.jpg)

# Introduction: Many people struggle to get loans due to insufficient or non-existent credit histories. And, unfortunately, this population is often taken advantage of by untrustworthy lenders.
# 
# Home Credit strives to broaden financial inclusion for the unbanked population by providing a positive and safe borrowing experience. In order to make sure this underserved population has a positive loan experience, Home Credit makes use of a variety of alternative data--including telco and transactional information--to predict their clients' repayment abilities.
# 
# While Home Credit is currently using various statistical and machine learning methods to make these predictions, they're challenging Kagglers to help them unlock the full potential of their data. Doing so will ensure that clients capable of repayment are not rejected and that loans are given with a principal, maturity, and repayment calendar that will empower their clients to be successful.
# 
# https://www.kaggle.com/c/home-credit-default-risk
# 
# The objective of this competition is to use historical loan application data to predict whether or not an applicant will be able to repay a loan. This is a standard supervised classification task:
# 
# Supervised: The labels are included in the training data and the goal is to train a model to learn to predict the labels from the features.
# 
# Classification: The label is a binary variable, 0 (will repay loan on time), 1 (will have difficulty repaying loan)

# ## Import necessary libraries.

# In[2]:


# import numpy for math calculations
import numpy as np

# import pandas for data (csv) manipulation
import pandas as pd

# import matplotlib for plotting
import matplotlib.pyplot as plt

# import seaborn for more plotting options(built on top of matplotlib)
import seaborn as sns

import pandas as pd
# Supress unnecessary warnings so that the presentation looks clean
import warnings
warnings.filterwarnings("ignore")

# display plots on the notebook itself; run_line_magic is the supported
# replacement for the deprecated InteractiveShell.magic() helper
get_ipython().run_line_magic('matplotlib', 'inline')


# ## Read the data files

# In[3]:


train = pd.read_csv(path+"application_train.csv")
test = pd.read_csv(path+"application_test.csv")


# In[4]:


train.info()


# ## How is the statistic?

# In[5]:


train.describe()


# ## How are the target labels spread?

# In[5]:


# pass the column by keyword: recent seaborn releases treat the first positional
# argument as `data`, so a bare Series is no longer read as the x variable
sns.countplot(x='TARGET', data=train)


# In[6]:


train['TARGET'].value_counts()


# ### This is clearly an imbalanced target. Far more applicants repaid their loan (TARGET = 0) than had difficulties repaying (TARGET = 1). About 91.92 % of applicants repaid!

# ## What are the dimensions of Train and Test dataset?

# In[7]:


print("The train dataset dimensions are as follows: {}".format(train.shape))
print("The test dataset dimensions are as follows: {}".format(test.shape))


# ## Look at the train dataset

# In[8]:


train.head()


# ## Look at the test dataset

# In[9]:


test.head()


# ## Look at the New Test dataset

# In[10]:





# #### As expected, test dataset contains all the columns except the target label.

# ## What are the missing values and their column names?

# In[11]:


def missing_columns(dataframe):
    """
    Summarise the missing values of a dataframe, one row per column.

    dataframe: the dataframe whose columns are inspected for nulls

    Returns a dataframe indexed by column name, ordered by descending null
    count, with the columns 'Missing Count', 'Missing Count Ratio' and
    'Missing Count %' (rounded to one decimal). Columns whose ratio is 0
    are left out.
    """
    row_count = len(dataframe)

    # nulls per column, worst offenders first
    null_counts = dataframe.isnull().sum().sort_values(ascending=False)

    # the same information as a fraction and as a rounded percentage
    null_ratio = null_counts/row_count
    null_pct = (100 * null_counts/row_count).round(1)

    # put the three series side by side and label them
    summary = pd.concat([null_counts, null_ratio, null_pct],axis=1)
    summary.columns = ['Missing Count','Missing Count Ratio','Missing Count %']

    # keep only the columns that actually have something missing
    has_missing = summary['Missing Count Ratio'] != 0
    return summary[has_missing]
    


# In[12]:


missing_columns(train)


# In[13]:


missing_columns(test)


# In[14]:



# We will have to handle these missing values (known as imputation). Other option would be to drop all those columns where there are large number of missing values. Unless we know the feature importance, it is not possible to make a call on which columns to keep which ones to drop.

# ## What are the different datatypes of columns? - How many floats, integers, categoricals?

# In[15]:


print("Train dataset: \n{}".format(train.dtypes.value_counts()))
print()
print("Test dataset: \n{}".format(test.dtypes.value_counts())) 
print()


# #### Turn every column data type of testing set similar to training set. Match datatypes of test in alignment with train. 

# In[6]:


def match_dtypes(training_df,testing_df,target_name='TARGET'):
    """
    Cast every feature column of the testing dataframe to the dtype of the
    same-named column in the training dataframe.

    training_df: dataframe whose column dtypes are the reference
    testing_df: dataframe to convert; it is modified in place
    target_name: label column of training_df, which is skipped

    Returns the (mutated) testing_df.
    Raises KeyError if target_name is not a column of training_df or if a
    training feature column is absent from testing_df.
    """
    for column_name in training_df.drop([target_name],axis=1).columns:
        # read the reference dtype from the argument, not from a notebook global
        testing_df[column_name] = testing_df[column_name].astype(training_df[column_name].dtype)

    return testing_df
    


# In[7]:



# In[18]:


print("Train dataset: \n{}".format(train.dtypes.value_counts()))
print()
print("Test dataset: \n{}".format(test.dtypes.value_counts())) 
print()


# ### In test dataset, 40 int64 indicates that the target label is missing - which is obvious.

# ### What are the different kinds of classes in every categorical column?

# In[19]:


# Number of unique classes in each object column
train.select_dtypes('object').apply(pd.Series.nunique)


# In[20]:


test.select_dtypes('object').apply(pd.Series.nunique)


# In[21]:



# ## Handling Categorical variables - Label Encoding and One Hot Encoding.

# Some machine learning models can't learn if provided with text categories. The categorical variables are to be converted into
# numerical equivalent, which is done by Label encoding and One hot encoding.
# 
# <b>Label encoding:</b> It is the process of assigning each unique category in a categorical variable with an integer. No new columns are created. 

# ![label_encoding.png](attachment:label_encoding.png)

# In[8]:


# Label-encode the categorical columns that have at most two distinct categories
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
transform_counter = 0

# iterate through all the categorical columns
for col in train.select_dtypes('object').columns:

    # select only those columns where number of unique values in the category is less than or equal to 2
    # (nunique ignores NaN; astype(str) below turns NaN into its own 'nan' label)
    if pd.Series.nunique(train[col]) <= 2:
        train_labels = train[col].astype(str)
        test_labels = test[col].astype(str)

        # fit ONE mapping on the labels of both frames so a category gets the same
        # integer in train and test; fitting on each frame separately can assign
        # different codes when the two frames do not contain the same label set
        le.fit(pd.concat([train_labels, test_labels]))
        train[col] = le.transform(train_labels)
        test[col] = le.transform(test_labels)

        transform_counter+=1

print("Label encoded {} columns.".format(transform_counter))


# <b>One-hot encoding:</b> create a new column for each unique category in a categorical variable. Each observation receives a 1 in the column for its corresponding category and a 0 in all other new columns.

# ![one%20hot1.jpg](attachment:one%20hot1.jpg)
# Credit : https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f

# In[9]:


# one-hot encode of categorical variables
train = pd.get_dummies(train,drop_first=True)
test = pd.get_dummies(test,drop_first=True)



# One-hot encoding adds more columns; let us check how many there are now:

# In[10]:


print('Training Features shape: ', train.shape)
print('Testing Features shape: ', test.shape)


# There is a mismatch in the count of columns for test and train. This can be fixed by aligning them.

# In[11]:


# collect the target labels to support the aligning 

target = train['TARGET']


# ## Ensure train and test have the same number of columns by aligning.

# In[12]:


train, test = train.align(test,axis=1,join='inner')


# Add the stored target column back into the train dataset.

# In[13]:


train['TARGET'] = target


# Since there are extra columns in the training set and those columns are missing in the new_testing set, let us add those columns and assign them to dummy value of 0.

# In[14]:


def match_columns(training_set,testing_set,target_label='TARGET'):
    """
    Give the testing set every feature column of the training set.

    Each training column (other than target_label) that the testing set lacks
    is added to testing_set in place and filled with 0. The same testing_set
    object is returned.
    """
    feature_names = training_set.drop([target_label],axis=1).columns

    # work out what is absent first, then create those columns
    absent = [name for name in feature_names if name not in testing_set.columns]
    for name in absent:
        testing_set[name] = 0

    return testing_set


# In[15]:





# In[16]:


print('Training Features shape: ', train.shape)
print('Testing Features shape: ', test.shape)



# <h3>On the look for Anomalies</h3> 
# </br>
# 
# One problem we always want to be on the lookout for is anomalies within the data. These may be due to mis-typed numbers, errors in measuring equipment, or they could be valid but extreme measurements. One way to support anomalies checking is by looking at the statistics of a column using the describe method. The numbers in the DAYS_BIRTH column are negative because they are recorded relative to the current loan application. To see these stats in years, we can multiply by -1 and divide by the number of days in a year:

# ## How old are clients?

# In[31]:


(train['DAYS_BIRTH']/-365).describe()


# Ages seem to be fine; nothing in particular seems to be off.

# In[32]:


fig, ax = plt.subplots(figsize =(12,7))
# histplot replaces the deprecated distplot; draw on the axes created above
# instead of relying on pyplot's implicit "current axes"
sns.histplot(train['DAYS_BIRTH']/-365, bins=5, kde=False, ax=ax)
ax.set_xlabel("Age of the client (Years)")


# Most applicants are in the 30-40 years age range, which seems pretty normal.

# ### How many years has it been since the applicant started working? 
# The DAYS_EMPLOYED column is negative because the days are counted relative to the time of the application. A negative value means the client started working that many days before the application. A positive value would mean the client starts working that many days after the application. Only negative values are meaningful here: a positive value could mean anything from "the client is about to start working" to "the client was fired and will resume working", which does not make sense either way, because a loan is unlikely to be given to a client without any work.

# In[33]:


(train['DAYS_EMPLOYED']/365).describe()


# This doesn't seem right, the maximum value (besides being positive) is about 1000 years!

# ### Who are these special people who got employed 1000 years after issuance of the loan? 

# In[34]:


fig, ax = plt.subplots(figsize=(12,7))
# histplot replaces the deprecated distplot; the bin count is fixed explicitly so
# the extreme values in this column cannot blow up automatic bin selection
sns.histplot(train['DAYS_EMPLOYED']/365, bins=50, kde=False, ax=ax)
ax.set_xlabel("Time before the loan application the persons started current employment(in years)")


# So, how many of these 1000 year anomalies?

# In[17]:


# count the records whose DAYS_EMPLOYED, expressed in years, lies in [900, 1100]
years_employed = train['DAYS_EMPLOYED']/365
thousand_anomalies = train[years_employed.between(900, 1100)]
len(thousand_anomalies)


# ## Lets look their ability to repay.

# In[36]:


fig, ax = plt.subplots(figsize=(12,7))
sns.countplot(x='TARGET',data=thousand_anomalies)


# ## Most anomalies were able to repay on time. But how can they be contrasted in relation to non anomalies?

# In[18]:


# get the index of anomalies and non anomalies
anomalies_index = pd.Index(thousand_anomalies.index)
non_anomalies_index = train.index.difference(anomalies_index)


# In[19]:


# get the non-anomalous records; non_anomalies_index holds index LABELS, so the
# rows are selected with label-based .loc rather than position-based .iloc
non_anomalies = train.loc[non_anomalies_index]


# In[20]:


# get the anomaly targets
anomalies_target = thousand_anomalies['TARGET'].value_counts()
non_anomalies_target = non_anomalies['TARGET'].value_counts()


# In[21]:


# default rate (share of TARGET == 1, in percent) for each of the two groups
anomalies_total = anomalies_target[1]+anomalies_target[0]
anomalies_default_rate = 100*anomalies_target[1]/anomalies_total

non_anomalies_total = non_anomalies_target[1]+non_anomalies_target[0]
non_anomalies_default_rate = 100*non_anomalies_target[1]/non_anomalies_total

print("Anomalies have a default rate of {}%".format(anomalies_default_rate))
print("Non Anomalies have a default rate of {}%".format(non_anomalies_default_rate))


# So surprisingly anomalies have lesser default rate!

# Handling the anomalies depends on the exact situation, with no set rules. One of the safest approaches is just to set the anomalies to a missing value and then have them filled in (using Imputation) before machine learning. In this case, since all the anomalies have the exact same value, we want to fill them in with the same value in case all of these loans share something in common. The anomalous values seem to have some importance, so we want to tell the machine learning model if we did in fact fill in these values. As a solution, we will fill in the anomalous values with not a number (np.nan) and then create a new boolean column indicating whether or not the value was anomalous.

# In[22]:


# Create an anomalous flag column
train['DAYS_EMPLOYED_ANOM'] = train["DAYS_EMPLOYED"] == 365243

# Replace the anomalous values with nan
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].replace({365243: np.nan})


# In[23]:


# Looking at the years employed for anomalies

plt.figure(figsize=(12,8))
(train['DAYS_EMPLOYED']/-365).plot.hist(title = 'Years Employment Histogram')
plt.xlabel("Years worked before application")


# Now it all seems normal!

# In[24]:


# Create an anomalous flag column
test['DAYS_EMPLOYED_ANOM'] = test["DAYS_EMPLOYED"] == 365243

# Replace the anomalous values with nan
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].replace({365243: np.nan})

# Create an anomalous flag column



# ## Finding out the most correlated features for the TARGET variable. 

# ## Understanding Correlation
# 
# Correlation is a statistical measure that indicates the extent to which two or more variables fluctuate together. A positive correlation indicates the extent to which those variables increase or decrease in parallel; a negative correlation indicates the extent to which one variable increases as the other decreases.
# 
# A correlation coefficient is a statistical measure of the degree to which changes to the value of one variable predict change to the value of another. When the fluctuation of one variable reliably predicts a similar fluctuation in another variable, there’s often a tendency to think that means that the change in one causes the change in the other. However, correlation does not imply causation. There may be, for example, an unknown factor that influences both variables similarly.
# 
# ![correlation.png](attachment:correlation.png)
# 
# To describe the strength of the
# correlation using the guide that Evans (1996) suggests for the absolute value of r:
# <br/>
#  .00-.19 “very weak”
#  <br/>
#  .20-.39 “weak”
#  <br/>
#  .40-.59 “moderate”
#  <br/>
#  .60-.79 “strong”
#  <br/>
#  .80-1.0 “very strong”
# 
# 
# 
# http://www.statstutor.ac.uk/resources/uploaded/pearsons.pdf <br/>
# https://whatis.techtarget.com/definition/correlation

# In[25]:


corr_train = train.corr()['TARGET']


# ## Looking at the top 10 most positively and negatively correlated features we get:

# In[26]:


print(corr_train.sort_values().tail(10))
corr_train.sort_values().head(10)


# ### Since EXT_SOURCE_3, EXT_SOURCE_2, EXT_SOURCE_1 and DAYS_BIRTH are highly correlated (Relatively), let us also explore the possibility of having them as interaction variables.
# 

# ## Initially filling up the missing values for the most correlated variables.

# In[27]:




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB
The train dataset dimensions are as follows: (307511, 122)
The test dataset dimensions are as follows: (48744, 121)
Train dataset: 
float64    65
int64      41
object     16
dtype: int64

Test dataset: 
float64    65
int64      40
object     16
dtype: int64

Train dataset: 
float64    65
int64      41
object     16
dtype: int64

Test dataset: 
float64    65
int64      40
object     16
dtype: int64

Label encoded 4 columns.
Training Features shape:  (307511, 230)
Testing Features shape:  (48744, 226)
Training Features shape:  (307511, 227)
Testing Features shape:  (48744, 226)
Anomalies have a default rate of 5.399646043269405%
Non Anomalies have a default rate of 8.659974537652149%


In [None]:
from sklearn.impute import SimpleImputer


# In[28]:


poly_fitting_vars = ['EXT_SOURCE_3', 'EXT_SOURCE_2', 'EXT_SOURCE_1','DAYS_BIRTH']

imputer = SimpleImputer(missing_values=np.nan, strategy='median')


# In[30]:

train[poly_fitting_vars]=imputer.fit_transform(train[poly_fitting_vars])

# In[31]:


train[poly_fitting_vars].shape

# In[32]:

# reuse the medians learned from the training set; re-fitting on test would
# fill test with different values than the ones used for train
test[poly_fitting_vars]=imputer.transform(test[poly_fitting_vars])


# In[33]:


test[poly_fitting_vars].shape


from sklearn.preprocessing import PolynomialFeatures


# In[37]:


poly_feat = PolynomialFeatures(degree=4)


# #### I also tried for polynomial degree of order 10. Couldn't find much improvement from degree 4 to 10. Fun fact: order of 10 created over 1000 interaction variables! 

# In[39]:


poly_interaction_train = poly_feat.fit_transform(train[poly_fitting_vars])


# In[40]:


poly_interaction_train.shape


# In[41]:


# the transformer was already fitted on the training columns; only transform test
poly_interaction_test = poly_feat.transform(test[poly_fitting_vars])


# In[42]:


poly_interaction_test.shape


# In[43]:





# In[44]:


# ## Build a dataframe out of interaction variables only!

# In[45]:


# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); use the new method when present, else the old one
if hasattr(poly_feat, 'get_feature_names_out'):
    poly_feature_names = poly_feat.get_feature_names_out(poly_fitting_vars)
else:
    poly_feature_names = poly_feat.get_feature_names(poly_fitting_vars)

# carry over train's index so the later index-based join lines up row for row
poly_interaction_train = pd.DataFrame(poly_interaction_train,columns=poly_feature_names,index=train.index)


# In[46]:


poly_interaction_train.shape


# In[47]:


# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); use the new method when present, else the old one
if hasattr(poly_feat, 'get_feature_names_out'):
    poly_feature_names = poly_feat.get_feature_names_out(poly_fitting_vars)
else:
    poly_feature_names = poly_feat.get_feature_names(poly_fitting_vars)

# carry over test's index so the later index-based join lines up row for row
poly_interaction_test =  pd.DataFrame(poly_interaction_test,columns=poly_feature_names,index=test.index)


# In[48]:


poly_interaction_test.shape



# ## Add the 'TARGET' column which is later used for looking up correlations with the interaction variables.

# In[51]:


poly_interaction_train['TARGET'] = train['TARGET']


# In[52]:


interaction = poly_interaction_train.corr()['TARGET'].sort_values()


# ## Which are the most correlated interaction variables?

# In[53]:


# looking at the top 15 most positive and negative correlated interaction variables.
print(interaction.tail(15))
(interaction.head(15))


# ## Get the names of the columns which have the highest correlation - '1' and 'TARGET' can be dropped.

# In[54]:


set(interaction.head(15).index).union(interaction.tail(15).index).difference(set({'1','TARGET'}))


# ## Choose the selected columns which have highest correlation to 'TARGET'. Columns '1' and 'TARGET' are not necessary!

# In[55]:


selected_inter_variables = list(set(interaction.head(15).index).union(interaction.tail(15).index).difference(set({'1','TARGET'})))


# In[56]:


# look at the selected features
poly_interaction_train[selected_inter_variables].head()


# In[57]:


poly_interaction_test[selected_inter_variables].head()


# In[58]:





# ## Get a list of unselected columns that are to be dropped.

# In[59]:


unselected_cols = [element for element in poly_interaction_train.columns if element not in selected_inter_variables]


# ##  Drop the unselected columns of the interaction dataframes - train and test versions both.

# In[60]:


poly_interaction_train = poly_interaction_train.drop(unselected_cols,axis=1)


# In[61]:


poly_interaction_test = poly_interaction_test.drop(list(set(unselected_cols).difference({'TARGET'})),axis=1)


# In[62]:





# ## Merge polynomial features into the original dataframes using their indices.

# #### Dropping columns 'EXT_SOURCE_2' and 'EXT_SOURCE_3' since they're already present in the source dataset.

# In[63]:


# drop whichever polynomial columns already exist in train (the plain degree-1
# terms such as EXT_SOURCE_2 / EXT_SOURCE_3) instead of a hard-coded pair, so the
# join neither fails on a missing name nor on an overlapping column
duplicate_cols = [col for col in poly_interaction_train.columns if col in train.columns]
train = train.join(poly_interaction_train.drop(duplicate_cols,axis=1))


# In[64]:


# drop whichever polynomial columns already exist in test (the plain degree-1
# terms such as EXT_SOURCE_2 / EXT_SOURCE_3) instead of a hard-coded pair, so the
# join neither fails on a missing name nor on an overlapping column
duplicate_cols = [col for col in poly_interaction_test.columns if col in test.columns]
test = test.join(poly_interaction_test.drop(duplicate_cols,axis=1))


# In[65]:



# ## What are their merged dataframe dimensions?

# In[66]:


print("The train dataset dimensions are as follows: {}".format(train.shape))
print("The test dataset dimensions are as follows: {}".format(test.shape))

# ratio features: new column name -> (numerator column, denominator column)
ratio_features = {
    'DIR': ('AMT_CREDIT', 'AMT_INCOME_TOTAL'),
    'AIR': ('AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
    'ACR': ('AMT_ANNUITY', 'AMT_CREDIT'),
    'DAR': ('DAYS_EMPLOYED', 'DAYS_BIRTH'),
}
for feature_name, (numerator, denominator) in ratio_features.items():
    train[feature_name] = train[numerator]/train[denominator]


# In[68]:


# ratio features: new column name -> (numerator column, denominator column)
ratio_features = {
    'DIR': ('AMT_CREDIT', 'AMT_INCOME_TOTAL'),
    'AIR': ('AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
    'ACR': ('AMT_ANNUITY', 'AMT_CREDIT'),
    'DAR': ('DAYS_EMPLOYED', 'DAYS_BIRTH'),
}
for feature_name, (numerator, denominator) in ratio_features.items():
    test[feature_name] = test[numerator]/test[denominator]

# In[69]:



# ## Look at the correlation of the newly added variables in relation to the 'TARGET'

# In[70]:


corr_vals = train.corr()['TARGET']


# In[71]:


corr_vals.tail(4)


# ## Hmmm, not much correlation - Linear!

# # Preparing the dataset for feeding into the model.

# ## Feature Imputing

# Feature imputation is the process of filling up missed/NAN values for those columns where 
# certain cells are not filled by default due to reasons such as outlier replacement / unavailable data 
# or incorrect entries during capturing the data.   

# In[72]:

from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer


# In[73]:


features = list(set(train.columns).difference({'TARGET'}))


# Imputation is done for the median value of every column.

# In[74]:


imputer = SimpleImputer(strategy="median")


# ## Feature Scaling
# 
# Feature scaling is a method used to standardize the range of independent variables or features of data. 
# In data processing, it is also known as data normalization and is generally performed during the data preprocessing step.
#  
# 
# Since the range of values of raw data varies widely, in some machine learning algorithms, objective functions will not work properly without normalization. 
# For example, the majority of classifiers calculate the distance between two points by the Euclidean distance. If one of the features has a broad range of values, the distance will be governed by this particular feature. Therefore, the range of all features should be normalized so that each feature contributes approximately proportionately to the final distance.
# Another reason why feature scaling is applied is that gradient descent converges much faster with feature scaling than without it.

# In[76]:


scaler = MinMaxScaler(feature_range = (0, 1))


# In[77]:


# learn the per-column medians from the training features only
train_features = train.drop(['TARGET'],axis=1)
train_transformed = imputer.fit_transform(train_features)

# select the test columns in the training order so every column is filled with
# the median learned for it, rather than relying on the two frames happening to
# share the same column order
test_transformed = imputer.transform(test[train_features.columns])

# learn the min/max scaling from the imputed training data
train_transformed = scaler.fit_transform(train_transformed)


# In[82]:


test_transformed = scaler.transform(test_transformed)


# In[83]:



# In[84]:


# new_test[new_test.isnull().any(axis=1)]


# In[85]:


print("The train dataset dimensions are as follows: {}".format(train_transformed.shape))
print("The test dataset dimensions are as follows: {}".format(test_transformed.shape))



# In[89]:


from sklearn.decomposition import PCA
pca = PCA(n_components=2)


# # Split the dataset into training set and validation set

# In[104]:


from sklearn.model_selection import train_test_split

# hold out 33% of the rows for validation; the fixed random_state keeps the split repeatable
split_parts = train_test_split(train_transformed, target, test_size=0.33, random_state=42)
X_training_set, X_validation_set, y_training_set, y_validation_set = split_parts

In [None]:
# Starting with Logistic Regression.

from sklearn.linear_model import LogisticRegression

logistic_regressor = LogisticRegression(C = 2)


# In[106]:


logistic_regressor.fit(X_training_set,y_training_set)


# In[107]:


log_regression_pred = logistic_regressor.predict(X_validation_set)


# In[165]:


# In[180]:
# # Understanding Accuracy metrics
# 
# <b>1.True Positives (TP):</b> True positives are the cases when the actual class of the data point was 1(True) and the predicted is also 1(True)
# Ex: The case where a person is actually having cancer(1) and the model classifying his case as cancer(1) comes under True positive.
# 
# <b>2.True Negatives (TN):</b> True negatives are the cases when the actual class of the data point was 0(False) and the predicted is also 0(False)
# 
# Ex: The case where a person NOT having cancer and the model classifying his case as Not cancer comes under True Negatives.
# 
# <b>3.False Positives (FP):</b> False positives are the cases when the actual class of the data point was 0(False) and the predicted is 1(True). False is because the model has predicted incorrectly and positive because the class predicted was a positive one. (1)
# 
# Ex: A person NOT having cancer and the model classifying his case as having cancer comes under False Positives.
# 
# <b>4.False Negatives (FN):</b> False negatives are the cases when the actual class of the data point was 1(True) and the predicted is 0(False). False is because the model has predicted incorrectly and negative because the class predicted was a negative one. (0)
# 
# Ex: A person having cancer and the model classifying his case as No-cancer comes under False Negatives.
# 
# ### Minimization and Trade offs :
# 
# We know that there will be some error associated with every model that we use for predicting the true class of the target variable. This will result in False Positives and False Negatives(i.e Model classifying things incorrectly as compared to the actual class).
# 
# There’s no hard and fast rule that says what should be minimised in all the situations. It purely depends on the business needs and the context of the problem you are trying to solve. Based on that, we might want to minimise either False Positives or False negatives.
# 
# ### Accuracy:
# Accuracy in classification problems is the number of correct predictions made by the model divided by the total number of predictions made.
# ![accuracy.png](attachment:accuracy.png)
# 
# 
# ### Precision:
# Precision talks about how precise/accurate the model is out of those predicted positive, how many of them are actual positive.
# ![precision.png](attachment:precision.png)
# 
# 
# ### Recall - True Positive Rate:
# What percent of the positive cases did the model catch (predicted positive) amongst all positive cases. Recall actually calculates how many of the Actual Positives our model capture through labeling it as Positive.
# ![recall.png](attachment:recall.png)
# 
# 
# ### False Positive Rate:
# <b>False Positive Rate = False Positives / (False Positives + True Negatives) </b>
# 
# 
# ### F-1 Score:
# F1 score (also F-score or F-measure) is a measure of a test's accuracy. It considers both the precision p and the recall r of the test to compute the score: p is the number of correct positive results divided by the number of all positive results returned by the classifier, and r is the number of correct positive results divided by the number of all relevant samples (all samples that should have been identified as positive). The F1 score is the harmonic average of the precision and recall, where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0.
# ![f-1%20score.png](attachment:f-1%20score.png)
# 
# 
# ### ROC (receiver operating characteristic) Curve:
# A curve of true positive rate vs. false positive rate at different classification thresholds.
# 
# ### AUROC (Area under ROC):
# An evaluation metric that considers all possible classification thresholds.
# 
# The Area Under the ROC curve is the probability that a classifier will be more confident that a randomly chosen positive example is actually positive than that a randomly chosen negative example is positive.
# 
# image source : https://medium.com/greyatom/performance-metrics-for-classification-problems-in-machine-learning-part-i-b085d432082b
# 
# https://en.wikipedia.org/wiki/F1_score
# 
# https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc

# In[109]:


from sklearn.metrics import accuracy_score,classification_report, roc_auc_score
print("The accuracy in general is : ", accuracy_score(y_validation_set,log_regression_pred))
print("\n")
print("The classification report is as follows:\n", classification_report(y_validation_set,log_regression_pred))

# ROC AUC measures how well the scores RANK the examples, so it must be given the
# predicted probability of class 1; hard 0/1 labels collapse the curve to one point
log_regression_proba = logistic_regressor.predict_proba(X_validation_set)[:,1]
print("ROC AUC score is: ",roc_auc_score(y_validation_set,log_regression_proba))


# We want to predict the probability of not repaying a loan, so we use the model's predict_proba method.
# This returns an m x 2 array where m is the number of datapoints.
# The first column is the probability of the target being 0 and the second column is the probability of the 
# target being 1. We want the probability the loan is not repaid, so we will select the second column.

# In[110]:


log_regression_pred_test = logistic_regressor.predict_proba(test_transformed)


# In[111]:


# selecting the second column
log_regression_pred_test[:,1]


# In[112]:


# take an explicit copy so that adding the prediction column writes to an
# independent frame instead of a slice of `test` (chained assignment)
submission_log_regression = test[['SK_ID_CURR']].copy()
submission_log_regression['TARGET'] = log_regression_pred_test[:,1]


# In[113]:


submission_log_regression.head(10)


# In[114]:


submission_log_regression.to_csv("log_regression.csv",index=False)