# Defining settings

In [1]:
# Mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Setting up GitHub
import os
from getpass import getpass #Secure token storage

# Username and email:
os.system("git config --global user.name 'AJLR888'")
os.system("git config --global user.email 'roldan.analytics@gmail.com'")

# Storing GitHub token and repository details
GITHUB_TOKEN = getpass("Enter GitHub Token:")
REPO_OWNER = "AJLR888"
REPO_NAME = "hmda_ny_2017_preprocessing"
BRANCH_NAME = "main"

#Setting GitHub remot URL with authentcation
GIT_REMOTE_URL = f"https://{GITHUB_TOKEN}@github.com/{REPO_OWNER}/{REPO_NAME}.git"
os.system(f"git remote set-url origin {GIT_REMOTE_URL}")



Enter GitHub Token:··········


32768

In [3]:
# Importing working space
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
#Load dataset
%ls "/content/drive/My Drive/Colab Notebooks/hmda_2007_ny_all-records_labels.csv"

'/content/drive/My Drive/Colab Notebooks/hmda_2007_ny_all-records_labels.csv'


# EDA

In [5]:
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/hmda_2007_ny_all-records_labels.csv')

df = pd.DataFrame(df)


  df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/hmda_2007_ny_all-records_labels.csv')


## Sample

In [6]:
df_sample = df.sample(n=12000, random_state=42)

print(df.shape, '\n')


(1009451, 78) 



In [7]:
pd.set_option('display.max_rows', None)

print(df_sample.isnull().sum())

as_of_year                            0
respondent_id                         0
agency_name                           0
agency_abbr                           0
agency_code                           0
loan_type_name                        0
loan_type                             0
property_type_name                    0
property_type                         0
loan_purpose_name                     0
loan_purpose                          0
owner_occupancy_name                  0
owner_occupancy                       0
loan_amount_000s                      0
preapproval_name                      0
preapproval                           0
action_taken_name                     0
action_taken                          0
msamd_name                         1087
msamd                              1087
state_name                            0
state_abbr                            0
state_code                            0
county_name                          11
county_code                          11


In [8]:
print(df_sample.dtypes)

as_of_year                          int64
respondent_id                      object
agency_name                        object
agency_abbr                        object
agency_code                         int64
loan_type_name                     object
loan_type                           int64
property_type_name                 object
property_type                       int64
loan_purpose_name                  object
loan_purpose                        int64
owner_occupancy_name               object
owner_occupancy                     int64
loan_amount_000s                    int64
preapproval_name                   object
preapproval                         int64
action_taken_name                  object
action_taken                        int64
msamd_name                         object
msamd                             float64
state_name                         object
state_abbr                         object
state_code                          int64
county_name                       

In [9]:
pd.set_option('display.max_columns', None)
print(df_sample.head(5))

        as_of_year respondent_id                                  agency_name  \
400869        2007    0000018039                 Office of Thrift Supervision   
25128         2007    0000501105                       Federal Reserve System   
73622         2007    0001881185                       Federal Reserve System   
637936        2007    4216200005  Department of Housing and Urban Development   
570628        2007    56-0811711    Office of the Comptroller of the Currency   

       agency_abbr  agency_code loan_type_name  loan_type  \
400869         OTS            4   Conventional          1   
25128          FRS            2   Conventional          1   
73622          FRS            2   Conventional          1   
637936         HUD            7   Conventional          1   
570628         OCC            1   Conventional          1   

                                       property_type_name  property_type  \
400869  One-to-four family dwelling (other than manufa...             

## Selection of only "No Co-aplicant" records

In [10]:
df = df[df['co_applicant_ethnicity_name'] == 'No co-applicant']

In [11]:
print(df.dtypes)

as_of_year                          int64
respondent_id                      object
agency_name                        object
agency_abbr                        object
agency_code                         int64
loan_type_name                     object
loan_type                           int64
property_type_name                 object
property_type                       int64
loan_purpose_name                  object
loan_purpose                        int64
owner_occupancy_name               object
owner_occupancy                     int64
loan_amount_000s                    int64
preapproval_name                   object
preapproval                         int64
action_taken_name                  object
action_taken                        int64
msamd_name                         object
msamd                             float64
state_name                         object
state_abbr                         object
state_code                          int64
county_name                       

## Selection of relevant features

In [12]:
df = df[[
    "loan_type_name",
    "property_type_name",
    "loan_purpose_name",
    "loan_amount_000s",
    "action_taken_name",
    "msamd_name",
    "census_tract_number",
    "applicant_ethnicity_name",
    "applicant_race_name_1",
    "applicant_sex_name",
    "applicant_income_000s",
    "denial_reason_name_1",
    "rate_spread",
    "lien_status_name",
    "hud_median_family_income",
    "tract_to_msamd_income"
]]

In [13]:
print(df.dtypes)

loan_type_name               object
property_type_name           object
loan_purpose_name            object
loan_amount_000s              int64
action_taken_name            object
msamd_name                   object
census_tract_number         float64
applicant_ethnicity_name     object
applicant_race_name_1        object
applicant_sex_name           object
applicant_income_000s       float64
denial_reason_name_1         object
rate_spread                 float64
lien_status_name             object
hud_median_family_income    float64
tract_to_msamd_income       float64
dtype: object


In [14]:
print(df.shape)

(601274, 16)


In [15]:
print(df.describe())

       loan_amount_000s  census_tract_number  applicant_income_000s  \
count     601274.000000        600501.000000          558351.000000   
mean         257.485403          1434.016365             115.918044   
std          425.800429          2516.739575             215.812326   
min            1.000000             1.000000               1.000000   
25%           76.000000           132.000000              50.000000   
50%          176.000000           374.000000              80.000000   
75%          368.000000          1351.010000             123.000000   
max        93000.000000          9929.000000            9999.000000   

        rate_spread  hud_median_family_income  tract_to_msamd_income  
count  46312.000000             600473.000000          600246.000000  
mean       5.119319              66569.953853             106.660651  
std        1.658133              14311.443551              46.883337  
min        3.000000              50900.000000               5.050000  
25%  

# Data Cleaning

In [16]:
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
  print(f"Value counts for: {col}")
  print(df[col].value_counts(), "\n")

Value counts for: loan_type_name
loan_type_name
Conventional          579399
FHA-insured            19791
VA-guaranteed           1697
FSA/RHS-guaranteed       387
Name: count, dtype: int64 

Value counts for: property_type_name
property_type_name
One-to-four family dwelling (other than manufactured housing)    590101
Manufactured housing                                               7094
Multifamily dwelling                                               4079
Name: count, dtype: int64 

Value counts for: loan_purpose_name
loan_purpose_name
Refinancing         288173
Home purchase       237915
Home improvement     75186
Name: count, dtype: int64 

Value counts for: action_taken_name
action_taken_name
Loan originated                                        231939
Application denied by financial institution            179429
Application withdrawn by applicant                      61678
Application approved but not accepted                   56103
Loan purchased by the institution          

## Excluding: irrelevant records from the following columns:

*   applicant_ethnicity_name
*   applicant_race_name_1
*   applicant_sex_name
*   action_taken_name





In [17]:
df = df[
    ~df["action_taken_name"].isin([
        "File closed for incompleteness",
        "Preapproval request denied by financial institution",
        "Preapproval request approved but not accepted",
        "Application withdrawn by applicant"
    ]) &
    ~df["applicant_ethnicity_name"].isin([
        "Information not provided by applicant in mail, Internet, or telephone application",
        "Not applicable"
    ]) &
    ~df["applicant_race_name_1"].isin([
        "Information not provided by applicant in mail, Internet, or telephone application",
        "Not applicable"
    ]) &
    ~df["applicant_sex_name"].isin([
        "Information not provided by applicant in mail, Internet, or telephone application",
        "Not applicable"
    ])
]


In [18]:
print(df.shape)

(410858, 16)


## Addressing missing values

In [19]:
print(df.isnull().sum())

loan_type_name                   0
property_type_name               0
loan_purpose_name                0
loan_amount_000s                 0
action_taken_name                0
msamd_name                   36465
census_tract_number            483
applicant_ethnicity_name         0
applicant_race_name_1            0
applicant_sex_name               0
applicant_income_000s        22625
denial_reason_name_1        313621
rate_spread                 372712
lien_status_name                 0
hud_median_family_income       508
tract_to_msamd_income          638
dtype: int64


In [20]:
df = df.assign(
    msamd_name=df['msamd_name'].fillna("Unknown"),
    denial_reason_name_1=df['denial_reason_name_1'].fillna("Unknown"),
    rate_spread=df['rate_spread'].fillna(0)
)

df = df.dropna(subset=['hud_median_family_income', 'tract_to_msamd_income', 'applicant_income_000s'])


In [21]:
print(df.shape)
print(df.isnull().sum())

(387610, 16)
loan_type_name              0
property_type_name          0
loan_purpose_name           0
loan_amount_000s            0
action_taken_name           0
msamd_name                  0
census_tract_number         0
applicant_ethnicity_name    0
applicant_race_name_1       0
applicant_sex_name          0
applicant_income_000s       0
denial_reason_name_1        0
rate_spread                 0
lien_status_name            0
hud_median_family_income    0
tract_to_msamd_income       0
dtype: int64


# Creation of new columns


## ethnicity_race_sex

In [22]:

df['ethnicity_race_sex'] = df['applicant_ethnicity_name'].str.lower() + "_" + df['applicant_race_name_1'].str.lower() + "_" + df['applicant_sex_name'].str.lower()

# Checking column created
print(df[['ethnicity_race_sex']].value_counts())

ethnicity_race_sex                                                     
not hispanic or latino_white_male                                          152266
not hispanic or latino_white_female                                         99758
not hispanic or latino_black or african american_female                     33890
not hispanic or latino_black or african american_male                       27917
hispanic or latino_white_male                                               23592
not hispanic or latino_asian_male                                           16565
hispanic or latino_white_female                                             14471
not hispanic or latino_asian_female                                         10629
not hispanic or latino_american indian or alaska native_male                 1325
hispanic or latino_black or african american_male                            1199
not hispanic or latino_native hawaiian or other pacific islander_male        1152
hispanic or latino_black o

In [23]:
print("test")

test


In [24]:
print("this is another test")

this is another test


# Commit to GitHub

In [25]:
#!git push --force origin main


In [26]:
#!git log --oneline --graph --decorate -n 5


In [27]:
#!git ls-files --others --exclude-standard


In [28]:
%cd "/content/drive/My Drive/Colab Notebooks/hmda_ny_2007_preprocessing/"

/content/drive/My Drive/Colab Notebooks/hmda_ny_2007_preprocessing


In [29]:
#!git add .
#!git commit -m "Adding missing files"
#!git push origin main

In [49]:
#!git init #Run only once.

Reinitialized existing Git repository in /content/drive/MyDrive/Colab Notebooks/hmda_ny_2007_preprocessing/.git/


In [31]:
#!git branch -M main


In [32]:
!git status


Refresh index:  66% (2/3)Refresh index: 100% (3/3)Refresh index: 100% (3/3), done.
On branch main
Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mdeleted:    ny_2007_data_preprocesing.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mny-2007-data-preprocesing.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [46]:
#!git remote add origin https://github.com/AJLR888/hmda-ny-2007-loan-default.git (only run once)

error: remote origin already exists.


In [47]:
!git remote -v

origin	https://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@github.com/AJLR888/hmda-ny-2007-loan-default.git (fetch)
origin	https://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@github.com/AJLR888/hmda-ny-2007-loan-default.git (push)


In [50]:
!git add ny_2007_data-preprocesing.ipynb

#Error message:fatal: pathspec 'ny_2007_data-preprocesing.ipynb' did not match any files


fatal: pathspec 'ny_2007_data-preprocesing.ipynb' did not match any files


In [51]:
!ls -l

total 75
-rw------- 1 root root    28 Mar 14 18:54 data_preprocessing
-rw------- 1 root root 18092 Mar 14 18:54 LICENSE
-rw------- 1 root root 56840 Mar 14 19:20 ny-2007-data-preprocesing.ipynb


In [36]:
#!git commit -m "Updated preprocessing script"

In [37]:
#!git push origin main

In [38]:
#!git fetch origin main
#!git reset --hard origin/main


In [39]:
#!git remote remove origin

In [40]:
##from getpass import getpass

#GITHUB_TOKEN = getpass("Enter Token: ")

#!git remote add origin https://{GITHUB_TOKEN}@github.com/AJLR888/hmda-ny-2007-loan-default.git

In [41]:
#!git remote -v

In [42]:
#!git push origin main

In [43]:
#!git pull origin main --allow-unrelated-histories


In [44]:
#!git push --force origin main


In [45]:
#!git config --global credential.helper store