In [2]:
import pandas as pd

In [3]:
df = pd.read_csv(r'C:\Users\porte\OneDrive\INTEX\Simplified_PIT_data.csv')

In [4]:
# See unsheltered column names
columnList = df.columns
columnList[20:]

Index(['Unsheltered Homeless', 'Unsheltered Homeless - Under 18',
       'Unsheltered Homeless - Age 18 to 24', 'Unsheltered Homeless - Over 24',
       'Unsheltered Homeless - Female', 'Unsheltered Homeless - Male',
       'Unsheltered Homeless - Transgender',
       'Unsheltered Homeless - Gender that is not Singularly Female or Male',
       'Unsheltered Homeless - Non-Hispanic/Non-Latin(o)(a)(x)',
       'Unsheltered Homeless - Hispanic/Latin(o)(a)(x)',
       'Unsheltered Homeless - White',
       'Unsheltered Homeless - Black, African American, or African',
       'Unsheltered Homeless - Asian or Asian American',
       'Unsheltered Homeless - American Indian, Alaska Native, or Indigenous',
       'Unsheltered Homeless - Native Hawaiian or Other Pacific Islander',
       'Unsheltered Homeless - Multiple Races',
       'Sheltered Total Homeless - Gender Questioning',
       'Unsheltered Homeless - Gender Questioning'],
      dtype='object')

In [5]:
# Convert all columns to numeric data for statistical analysis
numeric_df = df.apply(pd.to_numeric, errors='coerce')


In [6]:
numeric_df.describe()

Unnamed: 0,CoC Number,CoC Name,Count Types,Year,Sheltered Total Homeless,Sheltered Total Homeless - Under 18,Sheltered Total Homeless - Age 18 to 24,Sheltered Total Homeless - Over 24,Sheltered Total Homeless - Female,Sheltered Total Homeless - Male,...,Unsheltered Homeless - Non-Hispanic/Non-Latin(o)(a)(x),Unsheltered Homeless - Hispanic/Latin(o)(a)(x),Unsheltered Homeless - White,"Unsheltered Homeless - Black, African American, or African",Unsheltered Homeless - Asian or Asian American,"Unsheltered Homeless - American Indian, Alaska Native, or Indigenous",Unsheltered Homeless - Native Hawaiian or Other Pacific Islander,Unsheltered Homeless - Multiple Races,Sheltered Total Homeless - Gender Questioning,Unsheltered Homeless - Gender Questioning
count,0.0,0.0,0.0,2311.0,2311.0,2311.0,2311.0,2311.0,2311.0,2311.0,...,2275.0,2276.0,2276.0,2276.0,2276.0,2276.0,2276.0,2277.0,386.0,386.0
mean,,,,2019.501947,910.77196,244.853743,74.235396,591.682821,401.865859,504.316313,...,370.373187,113.228032,276.792619,132.042619,7.940246,21.739895,10.456063,35.241985,0.341969,1.235751
std,,,,1.708869,3736.543472,1214.650357,355.573728,2181.75097,1773.01077,1948.181359,...,1366.502614,803.829135,1032.02607,892.57964,48.524116,93.067604,69.598796,155.719978,1.377802,8.609199
min,,,,2017.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,,2018.0,192.5,44.0,13.0,129.0,85.0,103.0,...,23.0,1.0,15.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,,,2020.0,372.0,91.0,28.0,245.0,162.0,206.0,...,84.0,6.0,53.0,17.0,0.0,1.0,0.0,3.0,0.0,0.0
75%,,,,2021.0,800.0,182.0,62.0,560.0,331.0,468.5,...,306.0,33.0,221.0,65.0,2.0,8.0,2.0,13.0,0.0,0.0
max,,,,2022.0,74982.0,25825.0,7553.0,43193.0,36754.0,38550.0,...,29364.0,21270.0,21854.0,20469.0,975.0,1855.0,1278.0,2601.0,21.0,125.0


In [7]:
numeric_df['Unsheltered Homeless - Female'].isna().sum()

34

In [8]:
# Drop rows with null values in female and male columns
numeric_df.dropna(subset=['Unsheltered Homeless - Female'], inplace=True)
numeric_df.dropna(subset=['Unsheltered Homeless - Male'], inplace=True)


In [9]:
# Pearson r test for gender
from scipy import stats
r, p = stats.pearsonr(numeric_df['Unsheltered Homeless - Female'], numeric_df['Unsheltered Homeless - Male'])
print(f'r: {r}')
print(f'p: {p}')


r: 0.9831776076616021
p: 0.0


The Pearson correlation coefficient r of 0.983 indicates a very strong positive linear relationship between the male and female homelessness. This suggests that as the number of unsheltered homeless females increases, the number of unsheltered homeless males also tends to increase in a very consistent manner.

The p-value of 0.0 means that the correlation is statistically significant, indicating strong evidence against the null hypothesis of no correlation. In other words, there's a very high likelihood that this correlation is not due to random chance.

In [10]:
# Comparing male to female homelessness
male_homeless = numeric_df['Unsheltered Homeless - Male'].sum()
female_homeless = numeric_df['Unsheltered Homeless - Female'].sum()
ratio = male_homeless / female_homeless
print(f'Ratio of male homelessness to female homelessness: {round(ratio,2)}')

Ratio of male homelessness to female homelessness: 2.34


The strong correlation between male and female homelessness does not mean that there is the same rate of male and female homelessness. The ratio of male to female homelessness is 2.34, meaning that for every 100 homeless women, there are 234 homeless men.


In [11]:
# Comparing sheltered to unsheltered homelessness
total_sheltered = numeric_df['Sheltered Total Homeless'].sum()
total_unsheltered = numeric_df['Unsheltered Homeless'].sum()
print(f"Total Sheltered: {total_sheltered}, Total Unsheltered: {total_unsheltered}")
print(f'Ratio of sheltered to unsheltered: {round(total_sheltered/total_unsheltered, 2)}')

Total Sheltered: 2089284, Total Unsheltered: 1101802
Ratio of sheltered to unsheltered: 1.9


In [12]:
# Age and Homelessness anova test
from scipy import stats
f_stat, p_value = stats.f_oneway(
    numeric_df['Sheltered Total Homeless - Under 18'],
    numeric_df['Sheltered Total Homeless - Age 18 to 24'],
    numeric_df['Sheltered Total Homeless - Over 24']
)

print("F-statistic:", f_stat)
print("P-value:", p_value)

F-statistic: 74.58541140655979
P-value: 9.051699738832837e-33


This test shows a statistically significant difference in the means of the three age groups of sheltered homeless individuals. The very small p-value indicates that the number of sheltered homeless individuals significantly differs across the groups of "Under 18," "Age 18 to 24," and "Over 24."

In [13]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Tukey's HSD test
tukey_result = pairwise_tukeyhsd(
    endog=numeric_df[['Sheltered Total Homeless - Under 18', 
                      'Sheltered Total Homeless - Age 18 to 24', 
                      'Sheltered Total Homeless - Over 24']].melt()['value'], 
    groups=numeric_df[['Sheltered Total Homeless - Under 18', 
                       'Sheltered Total Homeless - Age 18 to 24', 
                       'Sheltered Total Homeless - Over 24']].melt()['variable'],
    alpha=0.05
)

print(tukey_result)


                                 Multiple Comparison of Means - Tukey HSD, FWER=0.05                                 
                 group1                                group2               meandiff p-adj    lower    upper   reject
---------------------------------------------------------------------------------------------------------------------
Sheltered Total Homeless - Age 18 to 24  Sheltered Total Homeless - Over 24 521.0316    0.0  419.1307 622.9325   True
Sheltered Total Homeless - Age 18 to 24 Sheltered Total Homeless - Under 18 172.2227 0.0002   70.3217 274.1236   True
     Sheltered Total Homeless - Over 24 Sheltered Total Homeless - Under 18 -348.809    0.0 -450.7099 -246.908   True
---------------------------------------------------------------------------------------------------------------------


A p-value of less than 0.05 in Tukey's HSD (Honestly Significant Difference) test indicates that there is a statistically significant difference between the two groups being compared. Specifically, it suggests that the means of the two groups are different enough that this difference is unlikely to have occurred by random chance.
Since the p-values are all less than 0.05, it means that for each pair of groups tested, there is a statistically significant difference in the number of sheltered homeless individuals between those groups.

In [14]:
# Ethnicity and homelessness anova
ethnicity_groups = numeric_df[['Sheltered Total Homeless - Hispanic/Latin(o)(a)(x)', 'Sheltered Total Homeless - Black, African American, or African', 'Sheltered Total Homeless - White', 'Sheltered Total Homeless - Asian or Asian American',
       'Sheltered Total Homeless - American Indian, Alaska Native, or Indigenous',
       'Sheltered Total Homeless - Native Hawaiian or Other Pacific Islander',
       'Sheltered Total Homeless - Multiple Races', 'Unsheltered Homeless',]].values
f_stat, p_value = stats.f_oneway(ethnicity_groups[0], ethnicity_groups[1], ethnicity_groups[2], ethnicity_groups[3], ethnicity_groups[4], ethnicity_groups[5], ethnicity_groups[6])
print(f"F-statistic: {f_stat}, P-value: {p_value}")


F-statistic: 1.3511326701610447, P-value: 0.2530993315372138


Since the p-value is greater than 0.05, we fail to reject the null hypothesis. This means that there is no statistically significant difference in the number of homeless individuals across different ethnicities. In other words, ethnicity does not appear to have a significant effect on homelessness in your dataset.

In [16]:
# Pearson r test for sheltered vs unsheltered
from scipy import stats
r, p = stats.pearsonr(numeric_df['Unsheltered Homeless'], numeric_df['Sheltered Total Homeless'])
print(f'r: {r}')
print(f'p: {p}')

r: 0.28256175310333964
p: 4.613853341261202e-43


Since the p-value is far below 0.05, we reject the null hypothesis, indicating that there is a significant relationship between being sheltered or unsheltered. This suggests that the number of people who are sheltered versus unsheltered is not random and may depend on factors like resources, access to shelters, policies, or other external factors.