In [8]:
"""
This is the final version of the data-cleaning code
"""
import pyreadr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import matplotlib.dates as mdates
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score
from sklearn.metrics import confusion_matrix

# **First, we clean the unwanted data**

In [16]:
# Load the raw table, then discard the columns judged to have no logical
# relation to the acceptance rate.
# NOTE(review): the CSV path below is absolute and machine-specific.
unrelated_columns = [
    'state_code', 'county_code', 'msa', 'census_tract', 'denial_reason', 'lei',
    'applicant_age_above_62', 'sex', 'aus', 'conforming_loan_limit', 'purchaser_type',
    'discount_points', 'interest_rate', 'lender_credits', 'total_loan_costs', 'race',
    'construction_method', 'ethnicity', 'loan_type', 'lien_status', 'occupancy_type',
]
required_columns = ['income', 'debt_to_income_ratio', 'combined_loan_to_value_ratio', 'property_value']
df = (
    pd.read_csv('/Users/kiwibird/StudyFile/2022Spring/Datathon/track.csv')
    .drop(columns=unrelated_columns)
    # Keep only the rows in which none of the required fields is missing.
    .dropna(subset=required_columns)
)

In [17]:
# Store the target column as booleans.
df['accepted'] = df['accepted'].astype('bool')

# Then we encode all non-numerical values

In [18]:
# applicant_age: list the distinct labels before giving them an ordinal encoding.
pd.unique(df['applicant_age'])


array(['45-54', '55-64', '65-74', '25-34', '35-44', '>74', '<25'],
      dtype=object)

In [19]:
# Age brackets listed in the order the encoder should rank them
# (youngest bracket first, oldest last).
age_sort = ['<25', '25-34', '35-44', '45-54', '55-64', '65-74', '>74']
age_enc = OrdinalEncoder(categories=[age_sort])
# The encoder is fed a single-feature 2-D array, hence the reshape to one column.
age_labels = df['applicant_age'].to_numpy().reshape(-1, 1)
df['applicant_age'] = age_enc.fit_transform(age_labels)

In [20]:
# debt_to_income_ratio: list the distinct labels before giving them an ordinal encoding.
pd.unique(df['debt_to_income_ratio'])


array(['>60%', '41%-<46%', '36%-<41%', '20%-<30%', '46%-<50%', '30%-<36%',
       '<20%', '50%-60%'], dtype=object)

In [21]:
# Debt-to-income brackets listed in the order the encoder should rank them
# (lowest ratio first, highest last).
debt_to_income_sort = [
    '<20%', '20%-<30%', '30%-<36%', '36%-<41%',
    '41%-<46%', '46%-<50%', '50%-60%', '>60%',
]
debt_to_income_enc = OrdinalEncoder(categories=[debt_to_income_sort])
# The encoder is fed a single-feature 2-D array, hence the reshape to one column.
debt_to_income_labels = df['debt_to_income_ratio'].to_numpy().reshape(-1, 1)
df['debt_to_income_ratio'] = debt_to_income_enc.fit_transform(debt_to_income_labels)

**See the correlation**

In [22]:
# Print the correlation of every column of df with the `accepted` target.
# Series.corr excludes missing values by itself, so no per-column dropna is
# needed: the earlier version built a filtered copy of the whole frame on each
# iteration and never used it.
columns = df.columns
target = df['accepted']
for column in columns:
    print("The correlation of ",column," with accepted is: ", df[column].corr(target))

The correlation of  year  with accepted is:  0.0080960733424657
The correlation of  applicant_age  with accepted is:  -0.04658008636365649
The correlation of  income  with accepted is:  0.0019681608247615646
The correlation of  debt_to_income_ratio  with accepted is:  -0.20327093104312555
The correlation of  tract_one_to_four_family_homes  with accepted is:  0.011788633099217901
The correlation of  tract_median_age_of_hu  with accepted is:  -0.01873419286493982
The correlation of  tract_minority_pop_percent  with accepted is:  -0.07742771938369988
The correlation of  tract_population  with accepted is:  -0.00015726529375702372
The correlation of  tract_to_msa_income_percentage  with accepted is:  0.058183130609042594
The correlation of  ffiec_msa_md_median_fam_income  with accepted is:  0.004361259040702438
The correlation of  balloon_payment  with accepted is:  -0.01336691158162228
The correlation of  business_or_commercial_purpose  with accepted is:  -0.021855196854123224
The correla

**Delete columns that have little relationship with `accepted`**

In [None]:
# Remove, in place, the columns selected for deletion after the correlation check.
weak_features = [
    'year',
    'income',
    'tract_population',
    'ffiec_msa_md_median_fam_income',
    'combined_loan_to_value_ratio',
]
df.drop(columns=weak_features, inplace=True)

In [28]:
# Separate the target column (y) from the remaining predictor columns (X).
y = df['accepted']
X = df.drop(columns=['accepted'])

# Split test and train data

In [29]:
# Two successive 90/10 splits that share the same fixed seed.
split_options = {'test_size': 0.1, 'random_state': 0}
# First hold out 10% of all rows as the test set ...
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, **split_options)
# ... then hold out 10% of what remains as the validation set.
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, **split_options)

# Try to oversample the data

In [30]:
from imblearn.over_sampling import SMOTE

In [31]:
# Oversample the minority class of the training split with SMOTE.
# random_state is pinned (0, the same seed the train/test splits use) because
# SMOTE generates its synthetic rows randomly; without a seed the resampled
# data, and the CSV exported from it, would differ on every run.
sm = SMOTE(sampling_strategy='minority', random_state=0)
X_resample, y_resample = sm.fit_resample(X_train, y_train)

In [32]:
#Export the data after smote() as a .csv

In [33]:
# Build the export table as a NEW DataFrame. Binding smote_df directly to
# X_resample and then adding the column would also insert `accepted` into
# X_resample itself, i.e. put the target inside the feature matrix.
smote_df = X_resample.assign(accepted=y_resample)
# Write the oversampled features plus target to disk.
smote_df.to_csv("dataset.csv")