# src

## Data

### Download

#### Libraries

In [1]:
import pandas as pd
import gdown  # to download data from google drive
import gzip   # to decompress downloaded data
import shutil
from scipy import stats
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
def download_data(folder_id):
    '''
    Downloads all content in a given google drive folder.
    
    Args:
    folder_id (str): google drive folder id which the content is to be downloaded from.
    '''
    print("downloading")
    url = f"https://drive.google.com/drive/folders/{folder_id}"
    # gdown URL formate
    gdown.download_folder(url=url, output="./downloaded_folder", quiet=False, use_cookies=False)

In [3]:
def decompress_gz(file_path, output_file_name):
    '''
    Decompresses given gzip file.
    
    Args:
    file_path (str): path of gzip file to be decompressed.
    output_file_name (str): name to be save of file after decompression.
    '''
    print("decompressing")
    with gzip.open(file_path, 'rb') as f_in:
        with open(output_file_name, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

In [4]:
def raw_data_downloading(file_path, downloaded_directory, decompressed_file_name):
    '''
    Downloads raw dataset.
    
    Args:
    file_path (str): path of dataset gzip file on google drive.
    downloaded_directory (str): path of dataset gzip file after download.
    decompressed_file_name (str): name to be save of file after decompression.
    '''
    download_data(file_path)
    decompress_gz(downloaded_directory, decompressed_file_name)

### Cleaning

In [5]:
def reading_data_into_dataframe(file_name):
    '''
    Saves raw dataset as a dataframe.
    
    Args:
    file_path (str): Dataset directory.

    Returns:
    dataframe: created dataframe
    '''
    print("reading")
    accepted_df = pd.read_csv(file_name, low_memory=False)
    return accepted_df

In [6]:
def high_null_drop(accepted_df, important_cols):
    '''
    Drops unimportant columns with high null count.
    
    Args:
    accepted_df (dataframe): Dataset dataframe.
    important_cols (list(str)): Columns of importance with high null count to avoid dropping.
    '''
    print("col high null drop")
    number_of_entries = accepted_df.shape[0] 
    for col in accepted_df.columns.tolist():
        if col in important_cols:
            continue
        tot_col_nulls = accepted_df[col].isna().sum()
        if tot_col_nulls >= (0.2 * number_of_entries):
            accepted_df.drop(col, axis=1, inplace=True)

In [7]:
def fix_null(accepted_df, col_2b_fixed, col_used_to_fix):
    '''
    Adjusts null values of a given column to 0 based on another related column.
    
    Args:
    accepted_df (dataframe): Dataset dataframe.
    col_2b_fixed (str): Name of column that needs adjustement.
    col_used_to_fix (str): Name of column used in fixing errored column.
    '''
    print("fixing delinq")
    wrong_col_index = accepted_df.columns.get_loc(col_2b_fixed)
    fixer_col_index = accepted_df.columns.get_loc(col_used_to_fix)
    for i in range (accepted_df.shape[0]):
        if accepted_df.iat[i, fixer_col_index] == 0 and accepted_df.iat[i, wrong_col_index]:
            accepted_df.iat[i, wrong_col_index] = 0

In [8]:
def drop_excessive_nulls(accepted_df):
    '''
    Drops all rows with null values.
    
    Args:
    accepted_df (dataframe): Dataset dataframe.
    '''
    print("nulled rows drop")
    for col in accepted_df.columns.tolist():
        accepted_df = accepted_df[accepted_df[col].notna()]
    return accepted_df

In [9]:
def drop_unecessary_cols(accepted_df, columns):
    '''
    Drops columns unecessary for prediction.
    
    Args:
    accepted_df (dataframe): Dataset dataframe.
    columns (list(str)): Columns' names that are unecessary.
    '''
    print("unnecessary cols drop")
    for col in columns:
        if col in accepted_df.columns.tolist():
            accepted_df.drop(col, axis=1, inplace=True)

In [10]:
def encoding_target_var(df):
    '''
    Encodes the target variable categories into "1" for good credit & "0" for bad credit.
    
    Args:
    df (dataframe): Dataset dataframe.
    '''
    print("encoding trgt var")
    df['loan_status'] = df['loan_status'].replace('Current', "1")
    df['loan_status'] = df['loan_status'].replace('Fully Paid', "1")
    df['loan_status'] = df['loan_status'].replace('Charged Off', "0")
    df['loan_status'] = df['loan_status'].replace('Late (31-120 days)', "0")
    df['loan_status'] = df['loan_status'].replace('In Grace Period', "1")
    df['loan_status'] = df['loan_status'].replace('Late (16-30 days)', "1")
    df['loan_status'] = df['loan_status'].replace('Default', "0")

In [11]:
def sampling(df):
    '''
    Randomly chooses rows with target "1" to match number of target "0" instances.
    
    Args:
    df (dataframe): Dataset dataframe.

    Returns:
    dataframe: Adjusted dataframe.
    '''
    print("sampling")
    subset_good = df[df['loan_status'] == "1"].sample(n=df[df['loan_status'] == "0"].shape[0], 
                                                      random_state=737) 
    subset_bad = df[df['loan_status'] == "0"] 
    df = pd.concat([subset_good, subset_bad])
    df = df.sample(frac = 1)
    df.reset_index(inplace=True)
    return df

In [12]:
def outlier_removal(df):
    '''
    Determines outliers using z-score then drop them.
    
    Args:
    df (dataframe): Dataset dataframe.

    Returns:
    dataframe: Adjusted dataframe.
    '''
    print("removing outliers")
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    numeric_cols = [col for col in numeric_cols if col != 'loan_status']
    
    z_scores = np.abs(stats.zscore(df[numeric_cols]))
    
    threshold_z = 3  # thershold to determine outlier
    
    outlier_rows = (z_scores > threshold_z).any(axis=1)
    
    df = df[~outlier_rows]  # removal of outliers
    
    df.reset_index(drop=True, inplace=True)
    
    return df

In [13]:
def standardizing(df):
    '''
    Standardizing all numeric columns of dataframe.
    
    Args:
    df (dataframe): Dataset dataframe.
    '''
    print("standardizing")
    scaler = StandardScaler()
    for col in df.columns:
        if df[col].dtype != 'O' and col != "loan_status":
            col_array = np.array(df[col])
            col_array = col_array.reshape(-1, 1)
            scaler.fit(col_array)
            df[col] = scaler.transform(col_array)

In [14]:
def label_encoding(df):
    '''
    Label encodes all object columns of dataframe.
    
    Args:
    df (dataframe): Dataset dataframe.
    '''
    print("label encoding")
    le = LabelEncoder()
    for col in df.columns:
        if df[col].dtype == 'O':
            le.fit(df[col])
            df[col] = le.transform(df[col])

In [15]:
def data_cleaning(csv_file):
    '''
    Performs all data cleaning.

    Args:
    csv_file (str): CSV file of dataset

    Returns:
    dataframe: Cleaned dataframe.
    '''
    df = reading_data_into_dataframe(csv_file)
    high_null_drop(df, ["mths_since_last_delinq", "all_util"])
    fix_null(df, "mths_since_last_delinq", "delinq_amnt")
    df = drop_excessive_nulls(df)
    drop_unecessary_cols(df, ["acceptD", "application_type", "creditPullD", "desc", 
                   "emp_title", "expD", "id", "listD", "mthsSinceMostRecentInq",
                   "reviewStatusD", "title", "url", "zip_code", "sec_app_inq_last_6mths"])
    encoding_target_var(df)
    df = sampling(df)
    df = outlier_removal(df)
    standardizing(df)
    label_encoding(df)
    df.drop("index", axis=1, inplace=True)
    df = df[["last_pymnt_d", "total_rec_prncp", "last_pymnt_amnt", "out_prncp", 
         "total_rec_late_fee", "last_fico_range_high", "installment", "loan_amnt", 
         "total_rec_int", "out_prncp_inv", "total_pymnt", "funded_amnt_inv", 
         "recoveries", "debt_settlement_flag", "hardship_flag", "mo_sin_old_rev_tl_op", 
         "revol_util", "dti", "all_util", "annual_inc", "loan_status"]]   # features as selected from ُEDA
    
    return df

In [16]:
raw_data_downloading("14ZG8utOf0Ry76w_T9rqrk5kdhhPIK84P", '/kaggle/working/downloaded_folder/accepted_2007_to_2018Q4.csv.gz',
                "accepted.csv")
df = data_cleaning("accepted.csv")
print(df.shape)
df.head()

downloading


Retrieving folder contents


Retrieving folder 1UGhBnl-1KamEMI_jeSrYZC6Zp7HtmTKf extras
Processing file 1X-OG1OToiQEcHXcBMWWlS0t83FwdzsKm decision_tree.png
Processing file 1thyWynkaoLHijDDfSxD_SEWSbOMONvai accepted_2007_to_2018Q4.csv.gz
Processing file 1DiuIw9k_kMdaPu_DFyXj3yaLjd_wRxx4 df_cleaned.csv


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1X-OG1OToiQEcHXcBMWWlS0t83FwdzsKm
To: /kaggle/working/downloaded_folder/extras/decision_tree.png
100%|██████████| 282k/282k [00:00<00:00, 75.4MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1thyWynkaoLHijDDfSxD_SEWSbOMONvai
From (redirected): https://drive.google.com/uc?id=1thyWynkaoLHijDDfSxD_SEWSbOMONvai&confirm=t&uuid=275cc20a-64f5-47c8-8756-b04def49177a
To: /kaggle/working/downloaded_folder/accepted_2007_to_2018Q4.csv.gz
100%|██████████| 393M/393M [00:01<00:00, 298MB/s] 
Downloading...
From (original): https://drive.google.com/uc?id=1DiuIw9k_kMdaPu_DFyXj3yaLjd_wRxx4
From (redirected): https://drive.google.com/uc?id=1DiuIw9k_kMdaPu_DFyXj3yaLjd_wRxx4&confirm=t&uuid=e9165509-9730-495c-887b-19acc6b78efe
To: /kaggle/working/downloaded_folder/df_cleaned.csv
100%|██████████| 157M/157M [00:00<00:00, 274MB/s] 
Downl

decompressing
reading
col high null drop
fixing delinq
nulled rows drop
unnecessary cols drop
encoding trgt var
sampling
removing outliers


  z_scores = np.abs(stats.zscore(df[numeric_cols]))
  outlier_rows = (z_scores > threshold_z).any(axis=1)


standardizing
label encoding
(119286, 21)


Unnamed: 0,last_pymnt_d,total_rec_prncp,last_pymnt_amnt,out_prncp,total_rec_late_fee,last_fico_range_high,installment,loan_amnt,total_rec_int,out_prncp_inv,...,funded_amnt_inv,recoveries,debt_settlement_flag,hardship_flag,mo_sin_old_rev_tl_op,revol_util,dti,all_util,annual_inc,loan_status
0,17,-0.332581,-0.372797,0.42008,-0.261895,0.281611,-0.473027,-0.461473,0.413942,0.420298,...,-0.461004,-0.445598,0,0,-0.307356,0.608376,0.558953,0.91582,-1.203816,0
1,31,-0.84056,-0.473603,-0.62922,-0.261895,1.244049,-1.608631,-1.60151,-1.081823,-0.629167,...,-1.601271,-0.445598,0,0,-0.187187,-1.880835,-1.233491,-0.049124,0.274631,1
2,27,-0.25072,-0.222228,2.263937,-0.261895,0.883135,1.252073,1.058575,-0.380182,2.264445,...,1.059353,-0.445598,0,0,-0.980297,0.431193,-0.223328,0.37974,2.227297,1
3,39,-0.458023,0.161605,-0.62922,-0.261895,0.161306,-1.364264,-1.373503,-0.909433,-0.629167,...,-1.373218,-0.445598,0,0,-1.472987,0.314511,-0.337836,0.433348,-1.231711,1
4,5,-0.415519,0.220569,-0.62922,-0.261895,0.221459,-1.317547,-1.348168,-0.83444,-0.629167,...,-1.347878,-0.445598,0,0,0.738107,1.273894,0.117928,1.559116,-0.841178,1


In [17]:
df.to_csv('df_cleaned.csv', index=False)