In [1]:
import pandas as pd
import numpy as np

import altair as alt
import matplotlib.pyplot as plt

## Reading Data & Preprocessing

In [2]:
alt.data_transformers.enable(max_rows=250000)

data = pd.read_csv(filepath_or_buffer='./communities.data', header=None, index_col=None).values
data.shape

columns = [
    "state numeric","county numeric","community numeric","communityname string","fold numeric","population numeric",
"householdsize numeric","racepctblack numeric","racePctWhite numeric","racePctAsian numeric","racePctHisp numeric",
"agePct12t21 numeric","agePct12t29 numeric","agePct16t24 numeric","agePct65up numeric","numbUrban numeric",
"pctUrban numeric","medIncome numeric","pctWWage numeric","pctWFarmSelf numeric","pctWInvInc numeric",
"pctWSocSec numeric","pctWPubAsst numeric","pctWRetire numeric","medFamInc numeric","perCapInc numeric",
"whitePerCap numeric","blackPerCap numeric","indianPerCap numeric","AsianPerCap numeric","OtherPerCap numeric",
"HispPerCap numeric","NumUnderPov numeric","PctPopUnderPov numeric","PctLess9thGrade numeric","PctNotHSGrad numeric",
"PctBSorMore numeric","PctUnemployed numeric","PctEmploy numeric","PctEmplManu numeric","PctEmplProfServ numeric",
"PctOccupManu numeric","PctOccupMgmtProf numeric","MalePctDivorce numeric","MalePctNevMarr numeric","FemalePctDiv numeric",
"TotalPctDiv numeric","PersPerFam numeric","PctFam2Par numeric","PctKids2Par numeric","PctYoungKids2Par numeric",
"PctTeen2Par numeric","PctWorkMomYoungKids numeric","PctWorkMom numeric","NumIlleg numeric","PctIlleg numeric",
"NumImmig numeric","PctImmigRecent numeric","PctImmigRec5 numeric","PctImmigRec8 numeric","PctImmigRec10 numeric",
"PctRecentImmig numeric","PctRecImmig5 numeric","PctRecImmig8 numeric","PctRecImmig10 numeric",
"PctSpeakEnglOnly numeric","PctNotSpeakEnglWell numeric","PctLargHouseFam numeric","PctLargHouseOccup numeric",
"PersPerOccupHous numeric","PersPerOwnOccHous numeric","PersPerRentOccHous numeric","PctPersOwnOccup numeric","PctPersDenseHous numeric",
"PctHousLess3BR numeric","MedNumBR numeric","HousVacant numeric","PctHousOccup numeric","PctHousOwnOcc numeric",
"PctVacantBoarded numeric","PctVacMore6Mos numeric","MedYrHousBuilt numeric","PctHousNoPhone numeric",
"PctWOFullPlumb numeric","OwnOccLowQuart numeric","OwnOccMedVal numeric","OwnOccHiQuart numeric",
"RentLowQ numeric","RentMedian numeric","RentHighQ numeric","MedRent numeric","MedRentPctHousInc numeric",
"MedOwnCostPctInc numeric","MedOwnCostPctIncNoMtg numeric","NumInShelters numeric","NumStreet numeric",
"PctForeignBorn numeric","PctBornSameState numeric","PctSameHouse85 numeric","PctSameCity85 numeric",
"PctSameState85 numeric","LemasSwornFT numeric","LemasSwFTPerPop numeric","LemasSwFTFieldOps numeric",
"LemasSwFTFieldPerPop numeric","LemasTotalReq numeric","LemasTotReqPerPop numeric",
"PolicReqPerOffic numeric","PolicPerPop numeric","RacialMatchCommPol numeric","PctPolicWhite numeric",
"PctPolicBlack numeric","PctPolicHisp numeric","PctPolicAsian numeric","PctPolicMinor numeric",
"OfficAssgnDrugUnits numeric","NumKindsDrugsSeiz numeric","PolicAveOTWorked numeric","LandArea numeric",
"PopDens numeric","PctUsePubTrans numeric","PolicCars numeric","PolicOperBudg numeric","LemasPctPolicOnPatr numeric",
"LemasGangUnitDeploy numeric","LemasPctOfficDrugUn numeric","PolicBudgPerPop numeric","ViolentCrimesPerPop numeric",
]

df = pd.DataFrame(data=data, columns=columns)

num_features = columns[5:]

''' Here we simply convert non-numeric values into 0 '''
# this step change strings into NaN
processed = df[num_features].apply(pd.to_numeric, errors="coerce")

# then we change NaN to 0
processed = processed.replace(np.nan,0)

## Correlationship

In [3]:
corr = processed[num_features[:-1]].corr()

corr

Unnamed: 0,population numeric,householdsize numeric,racepctblack numeric,racePctWhite numeric,racePctAsian numeric,racePctHisp numeric,agePct12t21 numeric,agePct12t29 numeric,agePct16t24 numeric,agePct65up numeric,...,PolicAveOTWorked numeric,LandArea numeric,PopDens numeric,PctUsePubTrans numeric,PolicCars numeric,PolicOperBudg numeric,LemasPctPolicOnPatr numeric,LemasGangUnitDeploy numeric,LemasPctOfficDrugUn numeric,PolicBudgPerPop numeric
population numeric,1.000000,-0.046148,0.231178,-0.300845,0.181603,0.156218,0.006368,0.130344,0.075596,-0.102006,...,0.532969,0.713652,0.231897,0.270356,0.837546,0.797330,0.540904,0.471167,0.466352,0.410710
householdsize numeric,-0.046148,1.000000,-0.067109,-0.235907,0.201996,0.468659,0.520461,0.367338,0.295225,-0.612666,...,-0.058980,-0.015078,-0.004072,-0.051506,-0.086127,-0.053341,-0.100452,-0.073426,-0.094368,-0.113298
racepctblack numeric,0.231178,-0.067109,1.000000,-0.794389,-0.106738,-0.066581,0.122338,0.153475,0.134068,0.052934,...,0.178369,0.149758,0.095053,0.147023,0.255182,0.196044,0.231094,0.199190,0.260793,0.214650
racePctWhite numeric,-0.300845,-0.235907,-0.794389,1.000000,-0.270266,-0.444166,-0.194015,-0.266852,-0.183804,0.136483,...,-0.244530,-0.131389,-0.337458,-0.215636,-0.256943,-0.233994,-0.260874,-0.230091,-0.276234,-0.219685
racePctAsian numeric,0.181603,0.201996,-0.106738,-0.270266,1.000000,0.266743,-0.025020,0.100727,0.052761,-0.272020,...,0.174322,-0.001084,0.389944,0.296921,0.051910,0.112861,0.120840,0.128631,0.101888,0.077938
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PolicOperBudg numeric,0.797330,-0.053341,0.196044,-0.233994,0.112861,0.117189,-0.020449,0.064460,0.033996,-0.032976,...,0.414180,0.529265,0.233281,0.272627,0.844864,1.000000,0.400359,0.331546,0.346562,0.487214
LemasPctPolicOnPatr numeric,0.540904,-0.100452,0.231094,-0.260874,0.120840,0.141579,-0.009625,0.116574,0.078563,-0.017710,...,0.741849,0.341831,0.263131,0.259561,0.512529,0.400359,1.000000,0.671129,0.857575,0.647339
LemasGangUnitDeploy numeric,0.471167,-0.073426,0.199190,-0.230091,0.128631,0.117990,0.000670,0.087242,0.054521,-0.044785,...,0.598143,0.348076,0.128368,0.128839,0.449205,0.331546,0.671129,1.000000,0.621957,0.509343
LemasPctOfficDrugUn numeric,0.466352,-0.094368,0.260793,-0.276234,0.101888,0.125353,0.001301,0.117290,0.083180,-0.010168,...,0.680129,0.299632,0.228084,0.206644,0.469950,0.346562,0.857575,0.621957,1.000000,0.672801


In [4]:
alt.data_transformers.enable(max_rows=20000)

DataTransformerRegistry.enable('default')

In [5]:
wide_form = corr.reset_index().rename(columns={'index':'attr1'})
to_plot = wide_form.melt('attr1', var_name='attr2', value_name='corr')

In [6]:
alt.Chart(to_plot).mark_rect().encode(
    x='attr1:N',
    y='attr2:N',
    color='corr:Q'
).properties(
    width = 800,
    height = 800
)

In [7]:
''' 
Customization:
    1) add mouseover effect, 
    2) sort by the sum of col/row,
    3) customize color using a diverging color scheme.
'''
alt.Chart(to_plot).mark_rect().encode(
    x=alt.X('attr1:N', 
            sort=alt.EncodingSortField(field='corr', op='sum', order='descending'), 
            title="attribute"),
    y=alt.Y('attr2:N', 
            sort=alt.EncodingSortField(field='corr', op='sum', order='descending'), 
            title="attribute"),
    color=alt.Color('corr', scale=alt.Scale(domain=[-1, 0, 1], range=['#2f78b3', '#f2f0eb', '#c5690d'])),
    tooltip = ['attr1', 'attr2', 'corr']
).properties(
    width = 800,
    height = 800
)

## Relevance to outcome (ViolentCrimesPerPop)
- Regression Coefficent Estimate

The general idea here is trying to build regression models on a single attribute and the outcome attribute. The higher the coefficent (absolute value) is, the more relevant an attribute to the outcome. 

In this example, we use the function `theilslopes()` from `scipy.stats` library. It gives you information of the median slope (coefficient) and intercept, as well as confidence interval of the slope.

Check the related information here: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.theilslopes.html. It also includes the comparison of the lines generated from `theilslopes()` and `linregress()` which uses the least-squares regression.


In [8]:
from scipy import stats 

target = processed['ViolentCrimesPerPop numeric']
res = stats.theilslopes(target, processed[num_features[0]], 0.95)

In [9]:
# the returned tuple contains: medslope, medintercept, lo_slope, up_slope
res

(1.0000000000000002, 0.13, 0.96, 1.1428571428571426)

In [10]:
coefficient = []

for col in num_features[:-1]:
    res = stats.theilslopes(target, processed[col], 0.95)
    coefficient.append([col, res[0], res[2], res[3]])
    
reg_plot = pd.DataFrame(data=coefficient, columns=['attribute', 'med_slope', 'low_slope', 'up_slope'])
    

In [11]:
dot = alt.Chart(reg_plot).mark_circle().encode(
    y=alt.Y('med_slope:Q'),
    x=alt.X('attribute:N', sort=alt.EncodingSortField('med_slope', order='descending'))
)

error = alt.Chart(reg_plot).mark_rule().encode(
    y=alt.Y('low_slope:Q'),
    y2='up_slope:Q',
    x=alt.X('attribute:N', sort=alt.EncodingSortField('med_slope', order='descending'))
)

dot + error

In [12]:
col_to_analysis = reg_plot.sort_values(by=['med_slope'], ascending=False)['attribute'].values
col_to_analysis

array(['NumIlleg numeric', 'LemasSwornFT numeric',
       'PolicOperBudg numeric', 'OfficAssgnDrugUnits numeric',
       'NumStreet numeric', 'LemasTotalReq numeric',
       'NumUnderPov numeric', 'NumInShelters numeric',
       'PolicCars numeric', 'NumImmig numeric', 'PctPolicHisp numeric',
       'HousVacant numeric', 'PolicBudgPerPop numeric',
       'PctPolicBlack numeric', 'LemasSwFTPerPop numeric',
       'PolicPerPop numeric', 'LemasTotReqPerPop numeric',
       'population numeric', 'PctPolicAsian numeric',
       'PctPolicMinor numeric', 'LemasSwFTFieldPerPop numeric',
       'PctPersDenseHous numeric', 'PctIlleg numeric',
       'PolicAveOTWorked numeric', 'numbUrban numeric',
       'PolicReqPerOffic numeric', 'racepctblack numeric',
       'FemalePctDiv numeric', 'pctWPubAsst numeric',
       'PctPopUnderPov numeric', 'TotalPctDiv numeric',
       'PctLargHouseFam numeric', 'MalePctDivorce numeric',
       'PctHousLess3BR numeric', 'PctUnemployed numeric',
       'PctVacan

Based only on this dataset, we found that a few attributes such as `NumIlleg` and `LemasSwornFT` etc. are highly significantly more important to the predition target `ViolentCrimesPerPop` than other attributes.

Some other attributes that are also important to the prediction of `ViolentCrimesPerPop` include ...