# Defining settings

In [1]:
# Mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Setting up GitHub
import os
from getpass import getpass #Secure token storage

# Username and email:
os.system("git config --global user.name 'AJLR888'")
os.system("git config --global user.email 'roldan.analytics@gmail.com'")

# Storing GitHub token and repository details
GITHUB_TOKEN = getpass("Enter GitHub Token:")
REPO_OWNER = "AJLR888"
REPO_NAME = "hmda_ny_2017_preprocessing"
BRANCH_NAME = "main"

#Setting GitHub remot URL with authentcation
GIT_REMOTE_URL = f"https://{GITHUB_TOKEN}@github.com/{REPO_OWNER}/{REPO_NAME}.git"
os.system(f"git remote set-url origin {GIT_REMOTE_URL}")



Enter GitHub Token:··········


32768

In [3]:
# Importing working space
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
#Load dataset
%ls "/content/drive/My Drive/Colab Notebooks/hmda_2007_ny_all-records_labels.csv"

'/content/drive/My Drive/Colab Notebooks/hmda_2007_ny_all-records_labels.csv'


# EDA

In [None]:
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/hmda_2007_ny_all-records_labels.csv')

df = pd.DataFrame(df)


## Sample

In [None]:
df_sample = df.sample(n=12000, random_state=42)

print(df.shape, '\n')


In [None]:
pd.set_option('display.max_rows', None)

print(df_sample.isnull().sum())

In [None]:
print(df_sample.dtypes)

In [None]:
pd.set_option('display.max_columns', None)
print(df_sample.head(5))

## Selection of only "No Co-aplicant" records

In [None]:
df = df[df['co_applicant_ethnicity_name'] == 'No co-applicant']

In [None]:
print(df.dtypes)

## Selection of relevant features

In [None]:
df = df[[
    "loan_type_name",
    "property_type_name",
    "loan_purpose_name",
    "loan_amount_000s",
    "action_taken_name",
    "msamd_name",
    "census_tract_number",
    "applicant_ethnicity_name",
    "applicant_race_name_1",
    "applicant_sex_name",
    "applicant_income_000s",
    "denial_reason_name_1",
    "rate_spread",
    "lien_status_name",
    "hud_median_family_income",
    "tract_to_msamd_income"
]]

In [None]:
print(df.dtypes)

In [None]:
print(df.shape)

In [None]:
print(df.describe())

# Data Cleaning

In [None]:
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
  print(f"Value counts for: {col}")
  print(df[col].value_counts(), "\n")

## Excluding: irrelevant records from the following columns:

*   applicant_ethnicity_name
*   applicant_race_name_1
*   applicant_sex_name
*   action_taken_name





In [None]:
df = df[
    ~df["action_taken_name"].isin([
        "File closed for incompleteness",
        "Preapproval request denied by financial institution",
        "Preapproval request approved but not accepted",
        "Application withdrawn by applicant"
    ]) &
    ~df["applicant_ethnicity_name"].isin([
        "Information not provided by applicant in mail, Internet, or telephone application",
        "Not applicable"
    ]) &
    ~df["applicant_race_name_1"].isin([
        "Information not provided by applicant in mail, Internet, or telephone application",
        "Not applicable"
    ]) &
    ~df["applicant_sex_name"].isin([
        "Information not provided by applicant in mail, Internet, or telephone application",
        "Not applicable"
    ])
]


In [None]:
print(df.shape)

## Addressing missing values

In [None]:
print(df.isnull().sum())

In [None]:
df = df.assign(
    msamd_name=df['msamd_name'].fillna("Unknown"),
    denial_reason_name_1=df['denial_reason_name_1'].fillna("Unknown"),
    rate_spread=df['rate_spread'].fillna(0)
)

df = df.dropna(subset=['hud_median_family_income', 'tract_to_msamd_income', 'applicant_income_000s'])


In [None]:
print(df.shape)
print(df.isnull().sum())

# Creation of new columns


## ethnicity_race_sex

In [None]:

df['ethnicity_race_sex'] = df['applicant_ethnicity_name'].str.lower() + "_" + df['applicant_race_name_1'].str.lower() + "_" + df['applicant_sex_name'].str.lower()

# Checking column created
print(df[['ethnicity_race_sex']].value_counts())

In [None]:
print("test")

In [None]:
print("this is another test")

# Commit to GitHub

In [None]:
#!git push --force origin main


In [None]:
#!git log --oneline --graph --decorate -n 5


In [None]:
#!git ls-files --others --exclude-standard


In [None]:
%cd "/content/drive/My Drive/Colab Notebooks/hmda_ny_2007_preprocessing/"

In [None]:
#!git add .
#!git commit -m "Adding missing files"
#!git push origin main

In [None]:
#!git init #Run only once.

In [None]:
#!git branch -M main


In [None]:
!git status


In [None]:
#!git remote add origin https://github.com/AJLR888/hmda-ny-2007-loan-default.git (only run once)

In [None]:
#!git remote -v

In [None]:
!git add ny-2007-data-preprocesing.ipynb

#Error message:fatal: pathspec 'ny_2007_data-preprocesing.ipynb' did not match any files


In [None]:
#Solution, check if the file exists in the current directory using the code !ls -l  File didn't appear as I changed the name.
#!ls -l

In [None]:
# We need to update the name: !git mv old_filename.ipynb new_filename.ipynb
#!git mv ny_2007_data-preprocesing.ipynb.ipynb new_filename.ipynb

In [None]:
!git commit -m "Updated preprocessing script"

In [None]:
!git push origin main

In [None]:
#!git fetch origin main
#!git reset --hard origin/main


In [None]:
!git remote remove origin

In [None]:
#from getpass import getpass

#GITHUB_TOKEN = getpass("Enter Token: ")

#!git remote add origin https://{GITHUB_TOKEN}@github.com/AJLR888/hmda-ny-2007-loan-default.git

In [None]:
#!git remote -v

In [None]:
#!git push origin main

In [None]:
!git pull origin main --allow-unrelated-histories


In [None]:
!git push --force origin main


In [None]:
!git config --global credential.helper store