In [1]:
import os.path as path # used for easily finding the csvs in other directories
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from datetime import datetime

In [2]:
# loading csv from local directory function
def load_csv_within_directory(directory_string):
    """
    Load a csv located inside the repo folder into a dataframe using pandas
    read_csv.

    The repo folder is taken to be two directory levels above the current
    working directory ("data/../../.." normalises to "../.."), so the result
    depends on where the notebook kernel was started.

    example directory string: '/data/raw_data/CIE/client_needs_table.csv'
    :param directory_string: string containing the csv's path relative to the
        repo folder; the leading separator is optional
    :return: a dataframe
    """
    # two levels above the working directory is treated as the repo folder
    repo_root = path.abspath(path.join("data", "../../.."))

    # Strip leading separators before joining: plain string concatenation
    # glued "data/..." straight onto the last folder name when the slash was
    # missing, and path.join would discard repo_root if the slash were kept.
    relative_csv = directory_string.lstrip("/\\")
    csv_path = path.join(repo_root, relative_csv)

    return pd.read_csv(csv_path)  # creates df with csv

In [12]:
# loading demographics for some basic wrangling
demographics = load_csv_within_directory("/data/raw_data/CIE/client_demographics_table.csv")

# renaming columns: blanks become "_" and "/" becomes "__"
old_column_names = list(demographics.columns.values)
new_column_names = [
    name.replace(" ", "_").replace("/", "__") for name in old_column_names
]

# mapping of old column name -> new column name, applied in one rename call
rename_dict = dict(zip(old_column_names, new_column_names))
demographics = demographics.rename(columns=rename_dict)

# converting columns to numeric where possible.
# The earlier version called .to_numeric() on the column *name* (a str), which
# always raised and was hidden by a bare except, so nothing was ever converted.
for column in demographics.columns:
    try:
        demographics[column] = pd.to_numeric(demographics[column])
    except (ValueError, TypeError):
        # column holds non-numeric values (ids, categories, date strings): keep as is
        continue

# convert the account creation date to datetime, e.g. 11/18/18
# (remember: uppercase %Y is the full 4-digit year, lowercase %y is the short 2-digit year)
# Vectorised so it does not rely on the index being 0..n-1 and so that missing
# dates are kept as missing instead of raising inside strptime.
demographics["Account_Created_Date"] = pd.to_datetime(
    demographics["Account_Created_Date"], format="%m/%d/%y"
)
demographics.head()

# wrangling done. NaNs are left in for replication purposes,
# Now proceeding to cleaning/prepping data for modeling

Unnamed: 0,AccountID,Account_Created_Date,Record_Type,CIE_Consent,Zip,Neighborhood,HHSA_Region,County,Housing_Needs,Utilities_Needs,...,TANF,Rental_Assistance,Military__Veteran,Employment,Education,Health_Insurance,Health_Insurance_Type,Health_Plan,Homeless,Housing_Type
0,0012M000021P37VQAS,2018-11-18 00:00:00,Client,No Consent,92040.0,LAKESIDE,East,San Diego,No,Yes,...,Missing,Missing,Not Military/Veteran,Full-Time,Some College No Degree,Yes,Medi-Cal,Molina,Missing,Missing
1,0012M000021P9NyQAK,2018-11-18 00:00:00,Client,No Consent,92064.0,POWAY,North Inland,San Diego,Yes,Yes,...,No,No,Not Military/Veteran,Unable to work,Associate Degree,Yes,Medi-Cal,Other,Yes,Unsheltered
2,0012M000021Pa72QAC,2018-11-19 00:00:00,Client,Consent,92071.0,SANTEE,East,San Diego,No,Yes,...,Missing,Missing,Not Military/Veteran,Full-Time,High School Degree,Yes,Employer Provided,Missing,Missing,Missing
3,0012M000021Pa8AQAS,2018-11-19 00:00:00,Client,Consent,92110.0,"SD, OLD TOWN",North Central,San Diego,No,No,...,Missing,Missing,Not Military/Veteran,Missing,Missing,No,No Insurance,Missing,Yes,Homeless Unspecified
4,0012M000021PalPQAS,2018-11-19 00:00:00,Client,Consent,91910.0,CHULA VISTA,South,San Diego,No,Yes,...,Missing,Missing,Not Military/Veteran,Full-Time,Bachelor's Degree,Yes,Employer Provided,Missing,Missing,Missing


In [14]:
# Load the monthly "tenants reached" csv into a DataFrame.
# In the output shown below there is one row per org, with columns
# Sum_tenants_1 ... Sum_tenants_12 (presumably one per calendar month -- TODO confirm)
# and many NaN cells.
monthly_tenants = load_csv_within_directory("/data/raw_data/monthly_tenants_reached/monthly_tenants_reached.csv")
# will most likely need to normalize if logit is desired

Unnamed: 0,org,Sum_tenants_1,Sum_tenants_2,Sum_tenants_3,Sum_tenants_4,Sum_tenants_5,Sum_tenants_6,Sum_tenants_7,Sum_tenants_8,Sum_tenants_9,Sum_tenants_10,Sum_tenants_11,Sum_tenants_12
0,ACCE,,,1868.0,1255.0,1052.0,15924.0,834.0,4048.0,2650.0,764.0,3123.0,
1,Casa Familiar,84.0,261.0,80.0,279.0,628.0,634.0,1177.0,1034.0,504.0,536.0,329.0,207.0
2,Chicano Federation,65.0,311.0,217.0,106.0,95.0,64.0,78.0,77.0,66.0,137.0,356.0,315.0
3,Logan Heights CDC,,125.0,,,,,,,,,,
4,Refugee Coalition - Haitian Bridge Alliance,,,,,,50.0,,,,,,
5,Refugee Coalition - Horn of Africa,,,77.0,,1732.0,2513.0,2316.0,2042.0,1457.0,1470.0,1168.0,2033.0
6,Refugee Coalition - Karen organization of San ...,,,,,30.0,155.0,100.0,90.0,100.0,124.0,,
7,Refugee Coalition - Somali Bantu Community of ...,,,,,25.0,59.0,,85.0,,,,
8,Refugee Coalition - South Sudanese Community o...,,,,,,,,,,93.0,,
9,Refugee Coalition - Union of Pan Asian Communi...,,,,751.0,,50.0,556.0,,,,,


Possible Questions:
- How can we see whether legal representation can determine evictions?
- Are the workshops working? How well?

Models Desired Pertaining to Research Questions!
- (Legal) Combining Attendance of Workshops by Month to Dataframe with Monthly 311 Counts?
- Possible Time-Series Logistic Regression With Grouped Demographics/Location? (Change Over Time)
- Supervised Classification Of Housing_Need using High Dimensional Variables (Demographics)?
-

Random Thoughts
- Could we control for population and income by using Monthly Tenants Reached?
    - Possibly the income for the area for each workshop?
    - Do we have more Monthly Count Data (more years)? Otherwise, we may need to justify spreading aggregate
        - Or, scale based on GPS data from Google/Apple Maps? Or Other count-based data

Data Wrangling Tasks Found
1. Expand SDSC Court Cases (Unmasked)/Wrangle More Data
2. Acquire Sheriff Evictions
3. Workshop data needed (registrants, post-survey/overview)

Data Cleaning Tasks Found
1. Dummy/One-Hot All Categoricals
2. NaN check, if categorical, drop the row. If it's numerical, take the average or median
3. Normalize Tenant Counts (Log Transform)
4. Determine IVs to model with

---

Data reporting
Be super precise about what the data reflects

-Court data
    - eviction filings
        - can get some additional information
- Sheriff data
    - landlord files eviction report
        - court rules in favor
            - Judge sets a lockout date
                - Landlord has to pay a fee
                -

-Try finding

-Look at Census Tracts for Eviction Data
    -Useful for GIS


-Workshop Registration
    -Person Interfaced with a website/etc
       -They received some amount of info about eviction prevention
        -Not everyone is at risk of eviction
      - Would need to control for time

Do we see any difference between zip codes where people registered for eviction prevention and zip codes where they did not?


In [6]:
# Load the masked court-case csv (CL_masked.csv in the public_records_request_19Jan2023 folder) into a DataFrame.
court_data_CL_Masked = load_csv_within_directory("/data/raw_data/public_records_request_19Jan2023/CL_masked.csv")

In [7]:
# Display the court data for a quick visual check.
# NOTE(review): in the output below, CITY casing is inconsistent ("San Diego" vs "SAN DIEGO"),
# DISPO TYPE is empty for the 2022 rows, and DEFAULT reads "N, N" there instead of a single
# Y/N flag -- these need cleaning before modeling.
court_data_CL_Masked

Unnamed: 0,CASE NUMBER,CITY,ZIP,MONTH,YEAR,DISPO TYPE,ANSWER,MONEY JUDGMENT,DEFAULT
0,37-2018-00000071-CL-UD-CTL,San Diego,92139,1,2018,Court-ordered dismissal,Y,N,N
1,37-2018-00000103-CL-UD-CTL,San Diego,92110,1,2018,Court-ordered dismissal,Y,N,N
2,37-2018-00000117-CL-UD-CTL,Vista,92084,1,2018,Request for Dismissal,N,N,Y
3,37-2018-00000125-CL-UD-CTL,SAN DIEGO,92117,1,2018,Court-ordered dismissal,N,N,Y
4,37-2018-00000137-CL-UD-CTL,Lakeside,92040,1,2018,Default judgment by clerk,N,N,Y
...,...,...,...,...,...,...,...,...,...
23119,37-2022-00052500-CL-UD-CTL,San Diego,92101,12,2022,,Y,N,"N, N"
23120,37-2022-00052501-CL-UD-CTL,San Diego,92101,12,2022,,N,N,"N, N"
23121,37-2022-00052502-CL-UD-CTL,San Marcos,92069,12,2022,,N,N,"N, N"
23122,37-2022-00052503-CL-UD-CTL,Oceanside,92054,12,2022,,Y,N,"N, N"
