# Part 3

### ALWAYS run the code below first

In [2]:
import pandas as pd
import os
from sys import platform
import matplotlib.pyplot as plt
import datetime
import numpy as np
import pickle
import seaborn

# Helper functions
def is_integer(x):
    """Return True when *x* can be read as a whole number.

    The value must be accepted by both ``int()`` and ``float()`` and the two
    conversions must agree, so ``"12"`` and ``4.0`` pass while ``"12.0"``
    (rejected by ``int()``), ``3.5``, ``None`` and free text do not.
    """
    try:
        return int(x) == float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type such as None
        # ValueError: non-numeric text or NaN
        # OverflowError: infinity, or an integer too large for float()
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return False

def ingest_files(directory: str):
    """Load every file found in *directory* into its own DataFrame.

    Each file is parsed with ``pd.read_csv`` (all columns read as strings,
    first line skipped). Rows whose ``id`` is not an integer according to
    ``is_integer`` are dropped, and progress is reported with ``print``.

    Returns a dict mapping each file name to its DataFrame.
    """
    # Normalise the path so that it always ends with a slash
    if directory[-1] != "/":
        directory += "/"

    file_names = os.listdir(directory)
    frames = {}

    print("Directory ", directory, " has ", len(file_names), " files:", sep="")
    for file_name in file_names:
        print(" Reading file ", file_name, sep="")
        frame = pd.read_csv(directory + file_name, dtype=str, skiprows=1)

        # Flag the rows whose id is not an integer and drop them
        bad_id = frame.id.apply(lambda value: not is_integer(value))
        bad_count = bad_id.sum()
        if bad_count > 0:
            print(" Found ", bad_count, " invalid rows which were removed", sep="")
            frame = frame[~bad_id]

        frames[file_name] = frame

    return frames

def clean_perc(x):
    """Convert a percentage string such as ``"13.5%"`` to a float.

    Null values (as judged by ``pd.isnull``) yield ``np.nan``. Surrounding
    whitespace is ignored and a trailing ``%`` is dropped when present; a
    value without the sign is parsed as-is instead of losing its last
    character.

    Raises ValueError if the remaining text is not a valid float.
    """
    if pd.isnull(x):
        return np.nan

    text = x.strip()
    # Only strip the sign when it is really there, so "13.56" stays 13.56
    if text.endswith("%"):
        text = text[:-1]
    return float(text)

def clean_date(x):
    """Parse a ``"Mon-YYYY"`` string (e.g. ``"Dec-2015"``) into a date.

    Null values (as judged by ``pd.isnull``) return None; otherwise the
    result is the ``datetime.date`` produced by parsing with ``"%b-%Y"``.
    """
    if not pd.isnull(x):
        parsed = datetime.datetime.strptime(x, "%b-%Y")
        return parsed.date()
    return None

def visualize_columns(data, float_cols, cat_cols, date_cols):
    '''
    Visualize the given columns of a DataFrame
      - Box-and-whisker plots for continuous variables, annotated with
        the (up to) three highest values
      - Number of distinct values and value counts for categorical columns
      - A line plot of per-month counts for date columns

    Parameters:
      data       : DataFrame holding the columns
      float_cols : names of the continuous columns
      cat_cols   : names of the categorical columns
      date_cols  : names of the date columns; non-null entries must
                   expose .year and .month

    Returns nothing; output goes to matplotlib figures and print.
    '''

    # Float columns
    for col in float_cols:
        values = data[col]

        # Pass the column by keyword: newer seaborn releases take `data`
        # as the first positional argument rather than `x`, and the
        # annotations below are positioned along the x axis.
        seaborn.boxplot(x=values)

        # Print the (up to) three highest values. Missing values are
        # dropped first so they cannot disturb the ordering, and columns
        # with fewer than three values no longer raise an IndexError.
        present = values.dropna()
        if len(present) > 0:
            highest_vals = sorted(present, reverse=True)[:3]
            smallest_val = min(present)
            for y_pos, val in zip((-0.3, -0.2, -0.1), highest_vals):
                plt.text(smallest_val, y_pos, val)

        plt.show()

    # Categorical columns
    for col in cat_cols:
        print(col)
        # nunique(dropna=False) counts all missing values as one value,
        # whereas set() may count every NaN separately
        print(str(data[col].nunique(dropna=False)) + " distinct values")
        print(data[col].value_counts())
        print("")
        print("")

    # Date columns
    for col in date_cols:
        non_null = data[col].dropna()
        month_labels = non_null.apply(lambda d: str(d.year) + "-" + str(d.month))
        # NOTE(review): points are ordered by ascending count, not
        # chronologically - confirm this is the intended "timeline".
        month_labels.value_counts(ascending=True).plot()
        plt.title(col + " (" + str(data[col].isnull().sum()) + " null values)")
        plt.show()

# Identify the columns we'll be keeping from the dataset
cols_to_pick = [
    "id",
    "loan_amnt",
    "funded_amnt",
    "revol_util",
    "revol_bal",
    "fico_range_low",
    "fico_range_high",
    "pub_rec",
    "open_acc",
    "earliest_cr_line",
    "delinq_2yrs",
    "dti",
    "purpose",
    "issue_d",
    "annual_inc",
    "home_ownership",
    "emp_length",
    "grade",
    "term",
    "int_rate",
    "installment",
    "verification_status",
    "recoveries",
    "loan_status",
    "last_pymnt_d",
    "total_pymnt",
]

# Identify the type of each of these columns

float_cols = [
    "loan_amnt",
    "funded_amnt",
    "revol_bal",
    "fico_range_low",
    "fico_range_high",
    "pub_rec",
    "open_acc",
    "delinq_2yrs",
    "dti",
    "annual_inc",
    "installment",
    "recoveries",
    "total_pymnt",
]

cat_cols = [
    "purpose",
    "home_ownership",
    "emp_length",
    "grade",
    "term",
    "verification_status",
    "loan_status",
]

perc_cols = [
    "revol_util",
    "int_rate",
]

date_cols = [
    "earliest_cr_line",
    "issue_d",
    "last_pymnt_d",
]


# Ensure that we have types for every column: "id" is the only picked
# column that is allowed to be missing from the typed lists
typed_cols = set(float_cols) | set(cat_cols) | set(perc_cols) | set(date_cols)
assert set(cols_to_pick) - typed_cols == {"id"}

# Define the names of the return columns we'll be calculating
ret_cols = ["ret_PESS", "ret_OPT", "ret_INTa", "ret_INTb", "ret_INTc"]

# Some of the columns selected will not be used directly in the model, but will be used to generate other features.
# Create variables specifying the features that will be used

# All categorical columns other than "loan_status" will be used as
# discrete features. The list is built in cat_cols order (rather than via
# set subtraction) so that the feature order is the same on every run.
discrete_features = [col for col in cat_cols if col != "loan_status"]

# All numeric columns will be used as continuous features
continuous_features = float_cols + perc_cols

# Question 1

# Question 2

# Question 3