In [34]:
import numpy as np
import pandas as pd
from scipy import stats
from ydata_profiling import ProfileReport
import prince

In [4]:
ahs = pd.read_csv("ahs_cleaned-1.csv", na_values = [-6, "'-9'"])

In [6]:
ahs.head(3).T

Unnamed: 0,0,1,2
Unnamed: 0,0,1,2
DIVISION,South Atlantic,New England,West South Central
TENURE,Owned or being bought by someone in your house...,Owned or being bought by someone in your house...,
YRBUILT,2000,1970,1970
UNITSIZE,"2,000 to 2,499 square feet","3,000 to 3,999 square feet",750 to 999 square feet
HSHLDTYPE,Married-couple family household,Nonfamily household,
HHRACE,White only,White only,
HHSEX,Male,Female,
HINCP,257000.0,201000.0,
TOTHCAMT,1642.0,1049.0,


In [8]:
profile = ProfileReport(ahs,
                        title = "American Housing Survey EDA",
                        html = {"style": {"full_width": True}},
                        minimal = False)
profile.to_notebook_iframe()

Summarize dataset: 100%|█████████████| 80/80 [00:29<00:00,  2.67it/s, Completed]
Generate report structure: 100%|██████████████████| 1/1 [00:13<00:00, 13.45s/it]
Render HTML: 100%|████████████████████████████████| 1/1 [00:02<00:00,  2.24s/it]


EDA is for taking a first look at distributions and relationships

Distributions:
* Frequency tables for categorical features
* Descriptive stats (mean, median, st dev) for continuous features
* Wordclouds for pure text

Relationships:
* Two continuous features -- correlations
* One continuous, one categorical -- categorical means table via groupby
* Two categorical -- cross-tabs
* Any relationship table can be shown as a hypothesis test if you want

In [9]:
ahs["TENURE"].value_counts()

TENURE
Owned or being bought by someone in your household    32972
Rented                                                20743
Occupied without payment of rent                        740
Name: count, dtype: int64

In [14]:
import sidetable
ahs.stb.freq(["TENURE"])

Unnamed: 0,TENURE,count,percent,cumulative_count,cumulative_percent
0,Owned or being bought by someone in your house...,32972,60.549077,32972,60.549077
1,Rented,20743,38.092003,53715,98.64108
2,Occupied without payment of rent,740,1.35892,54455,100.0


In [11]:
ahs['HINCP'].describe()

count    5.445500e+04
mean     8.706612e+04
std      1.000649e+05
min     -5.000000e+03
25%      2.750000e+04
50%      6.000000e+04
75%      1.110000e+05
max      3.876000e+06
Name: HINCP, dtype: float64

In [15]:
ahs[["MARKETVAL", "YRBUILT"]].corr()

Unnamed: 0,MARKETVAL,YRBUILT
MARKETVAL,1.0,-0.00403
YRBUILT,-0.00403,1.0


In [16]:
ahs.stb.freq(["YRBUILT"])

Unnamed: 0,YRBUILT,count,percent,cumulative_count,cumulative_percent
0,1970,9313,14.739258,9313,14.739258
1,1980,9072,14.357838,18385,29.097096
2,2000,8883,14.058716,27268,43.155812
3,1990,7863,12.444409,35131,55.600222
4,1960,6860,10.857007,41991,66.457229
5,1950,6330,10.018201,48321,76.475429
6,1919,3594,5.688059,51915,82.163488
7,1940,3001,4.749545,54916,86.913033
8,1920,2494,3.947139,57410,90.860173
9,1930,1699,2.688929,59109,93.549102


In [17]:
replace_map = {2011:2010,
               2012:2010,
               2013:2010,
               2014:2010,
               2015:2010,
               2016:2010,
               2017:2010,
               2018:2010,
               2019:2010}
ahs["YRBUILT"] = ahs["YRBUILT"].replace(replace_map)

In [18]:
ahs.stb.freq(["YRBUILT"])

Unnamed: 0,YRBUILT,count,percent,cumulative_count,cumulative_percent
0,1970,9313,14.739258,9313,14.739258
1,1980,9072,14.357838,18385,29.097096
2,2000,8883,14.058716,27268,43.155812
3,1990,7863,12.444409,35131,55.600222
4,1960,6860,10.857007,41991,66.457229
5,1950,6330,10.018201,48321,76.475429
6,2010,4076,6.450898,52397,82.926327
7,1919,3594,5.688059,55991,88.614386
8,1940,3001,4.749545,58992,93.363931
9,1920,2494,3.947139,61486,97.311071


In [19]:
ahs.groupby("YRBUILT").agg({"MARKETVAL": ["mean", "median"]})

Unnamed: 0_level_0,MARKETVAL,MARKETVAL
Unnamed: 0_level_1,mean,median
YRBUILT,Unnamed: 1_level_2,Unnamed: 2_level_2
1919,441787.054923,224025.0
1920,460242.827079,241479.0
1930,465825.451481,224702.0
1940,383461.994423,210377.0
1950,357163.403623,218444.5
1960,355834.658272,238382.0
1970,317178.414418,217358.0
1980,347564.607041,248374.0
1990,363917.258788,275944.0
2000,391545.044761,297737.0


In [21]:
pd.crosstab(ahs["RODENT"], ahs["TENURE"])

TENURE,Occupied without payment of rent,Owned or being bought by someone in your household,Rented
RODENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No signs in the last 12 months,618,29454,18749
Seen a few times in the last 12 months,80,2849,1283
Seen daily in the last 12 months,15,154,305
Seen monthly in the last 12 months,17,317,188
Seen weekly in the last 12 months,10,198,218


In [22]:
round(100*pd.crosstab(ahs["RODENT"], ahs["TENURE"], normalize='columns'), 2)

TENURE,Occupied without payment of rent,Owned or being bought by someone in your household,Rented
RODENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No signs in the last 12 months,83.51,89.33,90.39
Seen a few times in the last 12 months,10.81,8.64,6.19
Seen daily in the last 12 months,2.03,0.47,1.47
Seen monthly in the last 12 months,2.3,0.96,0.91
Seen weekly in the last 12 months,1.35,0.6,1.05


In [23]:
table = pd.crosstab(ahs["RODENT"], ahs["TENURE"])
stats.chi2_contingency(table)

Chi2ContingencyResult(statistic=np.float64(322.5045626859438), pvalue=np.float64(6.629715545876815e-65), dof=8, expected_freq=array([[6.63438435e+02, 2.95606650e+04, 1.85968966e+04],
       [5.72377192e+01, 2.55032713e+03, 1.60443515e+03],
       [6.44128179e+00, 2.87002626e+02, 1.80556092e+02],
       [7.09356349e+00, 3.16066183e+02, 1.98840253e+02],
       [5.78900009e+00, 2.57939069e+02, 1.62271931e+02]]))

In [24]:
nosquat = ahs.query("TENURE != 'Occupied without payment of rent'")

In [25]:
table = pd.crosstab(nosquat["RODENT"], nosquat["TENURE"])
stats.chi2_contingency(table)

Chi2ContingencyResult(statistic=np.float64(285.14643048024914), pvalue=np.float64(1.7310626272296368e-60), dof=4, expected_freq=array([[29588.55656707, 18614.44343293],
       [ 2536.35491017,  1595.64508983],
       [  281.74900866,   177.25099134],
       [  309.98529275,   195.01470725],
       [  255.35422135,   160.64577865]]))

In [26]:
table

TENURE,Owned or being bought by someone in your household,Rented
RODENT,Unnamed: 1_level_1,Unnamed: 2_level_1
No signs in the last 12 months,29454,18749
Seen a few times in the last 12 months,2849,1283
Seen daily in the last 12 months,154,305
Seen monthly in the last 12 months,317,188
Seen weekly in the last 12 months,198,218


In [30]:
#MCA: Multiple Correspondence Analysis
broken = ahs[['FUSEBLOW', 'SEWBREAK', 'ROACH', 'RODENT', 'NOWIRE', 'PLUGS', 'COLD',
       'NOTOIL', 'NOWAT', 'FLOORHOLE', 'FNDCRUMB', 'PAINTPEEL', 'ROOFHOLE',
       'ROOFSAG', 'ROOFSHIN', 'WALLCRACK', 'WALLSIDE', 'WALLSLOPE', 'WINBOARD',
       'WINBROKE', 'LEAKI', 'MOLDBATH']].dropna()

In [35]:
MCA = prince.MCA(n_components = 2)
MCAbroken = MCA.fit(broken)

In [38]:
MCAbroken.column_coordinates(broken).sort_values(1)

Unnamed: 0,0,1
WALLSLOPE_Broken,4.859669,-2.822837
ROOFSAG_Broken,3.902618,-2.367831
ROOFHOLE_Broken,4.146936,-2.197749
ROOFSHIN_Broken,2.383511,-1.611424
WALLSIDE_Broken,3.081932,-1.171049
WINBOARD_Broken,3.453391,-0.819302
FNDCRUMB_Broken,1.821288,-0.456563
FLOORHOLE_Broken,4.283991,-0.342381
WINBROKE_Broken,2.177878,-0.330458
LEAKI_Not broken,-0.080363,-0.094458


In [39]:
MCAbroken.row_coordinates(broken).rename({0: "brokenness", 1:"structure_vs_systems"}, axis = 1)

Unnamed: 0,brokenness,structure_vs_systems
0,-0.159316,-0.050666
1,-0.159316,-0.050666
3,-0.073487,0.032472
6,-0.072647,-0.046531
8,-0.159316,-0.050666
...,...,...
63180,-0.159316,-0.050666
63181,-0.107034,-0.030377
63182,-0.159316,-0.050666
63183,-0.159316,-0.050666
