# Experiments
This is where all data cleaning, feature selection, feature engineering, model building, and model evaluation will happen.

In [92]:
import pandas as pd
import numpy as np
import ydata_profiling as pp
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from typing import List

# Load the training and test splits from the local data directory.
df_train, df_test = (
    pd.read_csv("data/train.csv"),
    pd.read_csv("data/test.csv"),
)

## Functions
Here I will define all the functions that are to be used in this project. There will be 2 categories of functions:
1) Data cleaning functions: These are functions that help handle inconsistencies as discovered in analysis.ipynb.
2) Flagging functions: These are functions that return a list of ids of applicants who potentially violate a logical constraint, as also discovered in analysis.ipynb.

In [None]:
def clean_age(df: pd.DataFrame, age_threshold: int = 100) -> pd.DataFrame:
    """Reduce ages above ``age_threshold`` by ``age_threshold``, in place.

    Every ``person_age`` strictly greater than ``age_threshold`` has
    ``age_threshold`` subtracted from it (123 becomes 23 with the default).
    All other rows are left untouched.

    Args:
        df: Frame with a ``person_age`` column; modified in place.
        age_threshold: Ages above this value are reduced by this value.

    Returns:
        The same ``df`` object that was passed in.
    """
    over_threshold = df['person_age'] > age_threshold
    corrected_ages = df.loc[over_threshold, 'person_age'] - age_threshold
    df.loc[over_threshold, 'person_age'] = corrected_ages
    return df

def clean_emp_length_by_age(df: pd.DataFrame, age_threshold: int = 100) -> pd.DataFrame:
    """Drop applicants with an implausibly long employment length, in place.

    A row is removed when its ``person_emp_length`` is greater than or equal
    to ``age_threshold``. The index is not reset, so the surviving rows keep
    their original labels.

    Args:
        df: Frame with a ``person_emp_length`` column; modified in place.
        age_threshold: Employment lengths at or above this value are treated
            as invalid. Previously this argument was ignored and a literal
            ``100`` was used; the default keeps that behaviour.

    Returns:
        The same ``df`` object that was passed in, with the rows removed.
    """
    violaters = df.index[df['person_emp_length'] >= age_threshold]
    df.drop(violaters, axis=0, inplace=True)
    return df

def clean_underage_or_impossible_labor(df: pd.DataFrame, min_age_of_employment: int) -> pd.DataFrame:
    """Shorten employment lengths that imply work before the minimum age.

    For every applicant whose implied starting age
    (``person_age - person_emp_length``) is below ``min_age_of_employment``,
    ``person_emp_length`` is reduced so that the implied starting age becomes
    exactly ``min_age_of_employment``. The frame is modified in place and no
    helper column is added to it.

    Args:
        df: Frame with ``person_age`` and ``person_emp_length`` columns.
        min_age_of_employment: Youngest age at which employment may start.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        ValueError: If, after the correction, some applicant still appears to
            have started working before ``min_age_of_employment``.
    """
    # Age at which each applicant would have had to start working.
    age_of_first_emp = df['person_age'] - df['person_emp_length']

    # Years worked "too early", measured against the caller's minimum age
    # (not a fixed 14, which mis-corrected any other minimum).
    started_too_early = age_of_first_emp[age_of_first_emp < min_age_of_employment]
    excess_years = min_age_of_employment - started_too_early

    # Remove the excess years from the reported employment length.
    df.loc[excess_years.index, 'person_emp_length'] = (
        df.loc[excess_years.index, 'person_emp_length'] - excess_years
    )

    # Verify results on the corrected data.
    age_of_first_emp = df['person_age'] - df['person_emp_length']
    if (age_of_first_emp < min_age_of_employment).any():
        raise ValueError(f"There exist an applicant that started working before they turned {min_age_of_employment}")

    return df

def clean_loan_pct_income(df: pd.DataFrame) -> pd.DataFrame:
    """Recompute ``loan_percent_income`` from the loan amount and income.

    The column is overwritten, in place, with ``loan_amnt / person_income``
    rounded to two decimal places.

    Args:
        df: Frame with ``loan_amnt`` and ``person_income`` columns.

    Returns:
        The same ``df`` object that was passed in.
    """
    loan_to_income = df['loan_amnt'] / df['person_income']
    df['loan_percent_income'] = loan_to_income.round(2)
    return df

def clean_credit_history(df: pd.DataFrame, min_age_for_credit: int) -> pd.DataFrame:
    """Cap credit-history lengths at the years elapsed since the minimum age.

    Any ``cb_person_cred_hist_length`` larger than
    ``person_age - min_age_for_credit`` is replaced by that difference. The
    frame is modified in place; a helper column is added while the function
    runs and dropped again before it returns.

    Args:
        df: Frame with ``person_age`` and ``cb_person_cred_hist_length``.
        min_age_for_credit: Youngest age at which credit history may start.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        ValueError: If, after the correction, some applicant's credit history
            still starts before ``min_age_for_credit``.
    """
    helper_col = "years_from_min_credit_age"

    # Longest credit history each applicant could possibly have.
    df[helper_col] = df['person_age'] - min_age_for_credit

    # Labels of applicants whose reported history exceeds that maximum.
    overlong = df.index[df[helper_col] < df['cb_person_cred_hist_length']]

    # Replace the reported length with the maximum possible one.
    df.loc[overlong, 'cb_person_cred_hist_length'] = df.loc[overlong, helper_col]

    # Verify results.
    age_at_first_credit = df['person_age'] - df['cb_person_cred_hist_length']
    if (age_at_first_credit < min_age_for_credit).any():
        raise ValueError(f"There exist an applicant that started their credit history before they turned {min_age_for_credit}")

    # Remove the helper column again.
    df.drop(columns=helper_col, inplace=True)

    return df

def flag_no_emp_high_income(df: pd.DataFrame, income_threshold: float = 0) -> List[int]:
    """Return ids of applicants with no employment history but an income.

    An applicant is flagged when ``person_emp_length`` equals 0 and
    ``person_income`` is strictly greater than ``income_threshold``.

    Args:
        df: Frame with ``id``, ``person_emp_length`` and ``person_income``.
        income_threshold: Income above which an unemployed applicant is
            considered suspicious. The default of 0 flags every applicant
            with zero employment length and any positive income.

    Returns:
        The flagged ids as a plain list, in row order.
    """
    never_employed = df['person_emp_length'] == 0
    earns_above_threshold = df['person_income'] > income_threshold
    return df.loc[never_employed & earns_above_threshold, 'id'].tolist()

def flag_min_emp_age_violation(df: pd.DataFrame, min_age_of_employment: int = 14) -> List[int]:
    """Return ids of applicants who appear to have started working too young.

    An applicant is flagged when ``person_age - person_emp_length`` is
    strictly below ``min_age_of_employment``.

    Args:
        df: Frame with ``id``, ``person_age`` and ``person_emp_length``.
        min_age_of_employment: Youngest plausible starting age; defaults to
            the previously hard-coded value of 14.

    Returns:
        The flagged ids as a plain list, in row order, matching the declared
        ``List[int]`` return type rather than a pandas Series.
    """
    age_of_first_emp = df['person_age'] - df['person_emp_length']
    return df.loc[age_of_first_emp < min_age_of_employment, 'id'].tolist()

def flag_inconsistent_loan_percentage(df: pd.DataFrame) -> List[int]:
    """Return ids whose ``loan_percent_income`` disagrees with the raw data.

    The expected value is ``loan_amnt / person_income`` rounded to two
    decimal places. A row is flagged when the stored value differs from it;
    rows where both values are missing are treated as consistent.

    Args:
        df: Frame with ``id``, ``loan_amnt``, ``person_income`` and
            ``loan_percent_income`` columns.

    Returns:
        The flagged ids as a plain list, in row order.
    """
    reported = df['loan_percent_income']
    expected = (df['loan_amnt'] / df['person_income']).round(2)

    # NaN != NaN is True, so exclude rows where both sides are missing.
    both_missing = reported.isna() & expected.isna()
    inconsistent = (reported != expected) & ~both_missing

    # Select with a boolean mask: positional lookup by index label breaks as
    # soon as the frame's index is not 0..n-1 (e.g. after rows were dropped).
    return df.loc[inconsistent, 'id'].tolist()

def flag_credit_history_violation(df: pd.DataFrame, min_age_for_credit: int) -> List[int]:
    df['years_from_min_age'] = df['person_age'] - min_age_for_credit
    credit_history_violaters_df = df[df['years_from_min_age'] < df['cb_person_cred_hist_length']]

    df.drop("years_from_min_age", axis=1, inplace=True)

    return credit_history_violaters_df