In [244]:
import numpy as np

import sklearn.cluster
import sklearn.preprocessing
import sklearn.manifold

import scipy.stats as stats

from bokeh.charts import Scatter
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.layouts import gridplot
from bokeh.models import Label
from bokeh.palettes import plasma

## Correlation Analysis

#### Correlation between doctorate degrees and crime

In [103]:
# Places that report both an `ed_level.4` share (doctorate) and a crime rate.
# The projection limits each returned document to the fields read below.
ed_key = 'ed_level.4'
cursor = coll2.find(
    {ed_key: {'$exists': 1}, 'crime_rate': {'$exists': 1}},
    {ed_key: 1, 'name': 1, 'crime_rate': 1},
)

# Parallel lists: place name, crime rate, education share.
index, cr, ed = [], [], []
for place in tqdm(cursor):
    index.append(place['name'])
    cr.append(place['crime_rate'])
    ed.append(place['ed_level']['4'])

0it [00:00, ?it/s]102it [00:00, 386.73it/s]24984it [00:00, 552.11it/s]30707it [00:00, 79171.33it/s]


In [107]:
# Pearson correlation between the education share (ed) and the crime rate (cr)
# collected by the preceding query.
# Returns (r-value, p-value)
stats.pearsonr(ed, cr)

(-0.0046758052013247823, 0.41259714781487766)

#### Correlation between Masters/Professional Degrees and Crime 

In [109]:
# Places that report both an `ed_level.3` share (masters / professional
# degree) and a crime rate; only the fields read below are projected.
ed_key = 'ed_level.3'
cursor = coll2.find(
    {ed_key: {'$exists': 1}, 'crime_rate': {'$exists': 1}},
    {ed_key: 1, 'name': 1, 'crime_rate': 1},
)

# Parallel lists: place name, crime rate, education share.
index, cr, ed = [], [], []
for place in tqdm(cursor):
    index.append(place['name'])
    cr.append(place['crime_rate'])
    ed.append(place['ed_level']['3'])

0it [00:00, ?it/s]102it [00:00, 370.72it/s]26231it [00:00, 529.28it/s]30707it [00:00, 78486.76it/s]


In [110]:
# Pearson correlation between the `ed_level.3` share (ed) and the crime rate (cr).
# The result is displayed as (r-value, p-value).
stats.pearsonr(ed, cr)

(-0.0056297837013149412, 0.32388884484233282)

#### Correlation between College Education (Degree or 1+ Years of Attendance) and Crime

In [111]:
# Places that report both an `ed_level.2` share (college degree or 1+ years
# of attendance) and a crime rate; only the fields read below are projected.
ed_key = 'ed_level.2'
cursor = coll2.find(
    {ed_key: {'$exists': 1}, 'crime_rate': {'$exists': 1}},
    {ed_key: 1, 'name': 1, 'crime_rate': 1},
)

# Parallel lists: place name, crime rate, education share.
index, cr, ed = [], [], []
for place in tqdm(cursor):
    index.append(place['name'])
    cr.append(place['crime_rate'])
    ed.append(place['ed_level']['2'])

0it [00:00, ?it/s]102it [00:00, 400.41it/s]27129it [00:00, 571.65it/s]30707it [00:00, 83476.89it/s]


In [112]:
# Pearson correlation between the `ed_level.2` share (ed) and the crime rate (cr).
# The result is displayed as (r-value, p-value).
stats.pearsonr(ed, cr)

(0.0091422773553193441, 0.10915400193132264)

####  Correlation between High School Education Rate and Crime

In [113]:
# Places that report both an `ed_level.1` share (high school) and a crime
# rate; only the fields read below are projected.
ed_key = 'ed_level.1'
cursor = coll2.find(
    {ed_key: {'$exists': 1}, 'crime_rate': {'$exists': 1}},
    {ed_key: 1, 'name': 1, 'crime_rate': 1},
)

# Parallel lists: place name, crime rate, education share.
index, cr, ed = [], [], []
for place in tqdm(cursor):
    index.append(place['name'])
    cr.append(place['crime_rate'])
    ed.append(place['ed_level']['1'])

0it [00:00, ?it/s]102it [00:00, 400.41it/s]22968it [00:00, 571.58it/s]30707it [00:00, 78968.13it/s]


In [116]:
# Compare the linear (Pearson) and rank-based (Spearman) correlation between the
# `ed_level.1` share (ed) and the crime rate (cr); both results are displayed
# together as a tuple.
stats.pearsonr(ed, cr), stats.spearmanr(ed, cr)

((0.00052436402301413154, 0.92679120596422149),
 SpearmanrResult(correlation=-0.0092007042270162622, pvalue=0.10690844487168247))

In [293]:
# Direct Bokeh output to the notebook so later show(...) calls render inline.
output_notebook()

In [223]:
# Plot titles per education level. The integer key n corresponds to the
# string key str(n) stored under `ed_level` in each document.
titles = {
    0: 'Up to 12 Grade',
    1: 'High School or Equivalent',
    2: 'College Educated (Degree or 1+ Years of Attendance)',
    3: 'Professional School or Masters',
    4: 'Doctorate',
}

# Column-oriented accumulator: one list per future DataFrame column
# (place name, crime rate, and the share for each education level '0'..'4').
ec_dict = {'name': [], 'crime': [], '0': [], '1': [], '2': [], '3': [], '4': []}

cursor = coll2.find({'ed_level': {'$exists': 1}, 'crime_rate': {'$exists': 1}},
                    {'ed_level': 1, 'name': 1, 'crime_rate': 1})

for pl in cursor:
    # Read every value first so that a document missing a field raises before
    # anything is appended; this keeps all columns the same length.
    name = pl['name']
    crime = pl['crime_rate']
    levels = [pl['ed_level'][str(n)] for n in range(5)]

    ec_dict['name'].append(name)
    ec_dict['crime'].append(crime)
    for n, share in enumerate(levels):
        ec_dict[str(n)].append(share)

In [177]:
# Turn the column lists gathered in ec_dict into a DataFrame and preview the
# first five rows.
ec = pd.DataFrame(data=ec_dict)
ec.head(n=5)

Unnamed: 0,0,1,2,3,4,crime,name
0,0.72093,0.27907,0.0,0.0,0.0,0.064534,abanda cdp
1,0.331793,0.318952,0.285054,0.062147,0.002054,0.033433,abbeville city
2,0.187172,0.471477,0.291088,0.048104,0.002158,0.051562,adamsville city
3,0.310395,0.41142,0.199122,0.079063,0.0,0.024654,addison town
4,0.318408,0.383085,0.218905,0.079602,0.0,0.030532,akron town


In [None]:
# Pearson R
# One scatter chart per education level (crime vs. education share), each
# annotated with the Pearson correlation result for that level.
plts = []
for n in range(5):
    column = str(n)
    chart = Scatter(ec, y='crime', x=column, title=titles[n], xlabel='Education Rate')
    pearson = stats.pearsonr(ec['crime'], ec[column])
    annotation = Label(x=60, y=50, x_units='screen', y_units='screen',
                       text='Pearson R:' + str(pearson))
    chart.add_layout(annotation)
    plts.append(chart)

# Arrange the charts three per row and render the grid.
grid = gridplot(plts, ncols=3, plot_width=500)
show(grid)

In [None]:
# Spearman R
# One scatter chart per education level (crime vs. education share), each
# annotated with the Spearman rank correlation result for that level.
plts = [Scatter(ec, y='crime', x=str(n), title=titles[n], xlabel='Education Rate') for n in range(5)]
for n, chart in enumerate(plts):
    spearman = stats.spearmanr(ec['crime'], ec[str(n)])
    # The label must name the statistic actually shown (Spearman, not Pearson).
    chart.add_layout(Label(x=60, y=50, x_units='screen', y_units='screen',
                           text='Spearman R:' + str(spearman)))
grid = gridplot(plts, ncols=3, plot_width=500)

show(grid)

## Cluster Analysis 

In [235]:
# Creates the K-Means estimator that partitions the data into 5 clusters.
# random_state is pinned so cluster labels (and therefore the color assigned to
# each cluster downstream) are reproducible across kernel restarts.
# NOTE(review): `n_jobs` was removed from KMeans in newer scikit-learn
# releases; drop the argument if the environment is upgraded.
km = sklearn.cluster.KMeans(n_clusters=5, n_jobs=-1, random_state=0)

In [238]:
# Selects all of the places with median_housing_costs and democrat fields.
# The projection restricts each returned document to the two fields that are
# read below, matching the other queries in this notebook and avoiding the
# transfer of whole documents.
cursor = coll2.find(
    {'median_housing_costs': {'$exists': 1}, 'democrat': {'$exists': 1}},
    {'median_housing_costs': 1, 'democrat': 1},
)
housing_costs, democrat = [], []
for pl in cursor:
    housing_costs.append(pl['median_housing_costs'])
    democrat.append(pl['democrat'])

In [253]:
# Sanity check: both lists should have the same number of entries.
print(len(democrat), len(housing_costs))

# Stack into a 2-D array with one row per place and the columns ordered as
# (democrat, housing cost), then display its shape.
dem_costs = np.transpose(np.array([democrat, housing_costs]))
dem_costs.shape

26028 26028


(26028, 2)

In [254]:
# Standardize along axis 0 (column-wise) with sklearn's scale().
dem_costs = sklearn.preprocessing.scale(dem_costs, axis=0)

# Standard deviation over all entries of the scaled array.
np.std(dem_costs)



0.99999999999999989

In [256]:
# Map colors
# Cluster the scaled data; `ind` holds one cluster label per row.
ind = km.fit_predict(dem_costs)

# Build a label -> color lookup with one plasma color per distinct cluster.
# `plasma` is the name imported from bokeh.palettes at the top of the
# notebook; the bare `bokeh` module name is not bound by those imports.
cluster_labels = np.unique(ind)
colors = plasma(len(cluster_labels))
color_dict = dict(zip(cluster_labels, colors))

# Per-row color series used to color the scatter plot (no in-place mutation).
ind_s = pd.Series(ind).replace(color_dict)

In [None]:
# Scatter of the clustered places, colored by cluster.
# dem_costs was built as np.array([democrat, housing_costs]).T, so column 0 is
# the democrat share and column 1 is the housing cost. Plot column 1 on x and
# column 0 on y so the data match the axis labels.
f = figure(y_axis_label='Normalized Democrat %', x_axis_label='Normalized Median Housing Cost')
f.scatter(x=dem_costs[:, 1], y=dem_costs[:, 0], color=ind_s)
show(f)

## Dimensionality Reduction and Clustering

In [344]:
# Creates an instance of the dimensionality reduction algorithm with 2 components.
# random_state is pinned so the embedding is reproducible across kernel restarts.
ts = sklearn.manifold.TSNE(n_components=2, learning_rate=20, early_exaggeration=16,
                           n_iter=2500, random_state=0)

In [302]:
# Inspect one sample document from `coll` to see which fields are available.
# NOTE(review): this sample exposes `hpi`, while the multi-feature query on
# `coll2` filters on `hpi_rate` - confirm the field name in that collection.
coll.find_one()

{'_id': 'AL',
 'census_division': 'East South Central',
 'crime_rate': 0.0158577070827861,
 'democrat': 34.6,
 'ed_level': {'0': 0.14876174515627894,
  '1': 0.34437936885000575,
  '2': 0.4058431684083642,
  '3': 0.08860488358678324,
  '4': 0.012410833998567832},
 'fips': 1,
 'hpi': 0.0310071987368759,
 'median_housing_costs': 745,
 'name': 'alabama',
 'price_index': 89.85555555555555,
 'republican': 62.9}

In [316]:
# One list per feature; each list ends up with one entry per matching place.
crime_rate, democrat, ed0, ed2, ed3, hpi, median_housing_costs, price_index = [], [], [], [], [], [], [], []

# Every field that must be present on a place for it to be included.
# NOTE(review): the sample document from `coll` shows the key `hpi`, not
# `hpi_rate` - confirm that `coll2` documents really use `hpi_rate`.
required_fields = ['democrat', 'ed_level.0', 'ed_level.2', 'ed_level.3',
                   'hpi_rate', 'price_index', 'crime_rate', 'median_housing_costs']

# Filter on existence of every required field and project only those fields,
# so whole documents are not fetched (consistent with the earlier queries).
cursor = coll2.find({field: {'$exists': 1} for field in required_fields},
                    {field: 1 for field in required_fields})
for pl in cursor:
    crime_rate.append(pl['crime_rate'])
    democrat.append(pl['democrat'])
    ed0.append(pl['ed_level']['0'])
    ed2.append(pl['ed_level']['2'])
    ed3.append(pl['ed_level']['3'])
    hpi.append(pl['hpi_rate'])
    median_housing_costs.append(pl['median_housing_costs'])
    price_index.append(pl['price_index'])

# Shape is (8 features, n places) at this point; rows follow the order below.
multi = np.array([crime_rate, democrat, ed0, ed2, ed3, hpi, median_housing_costs, price_index])

In [317]:
# Transpose so each row is a place and each column a feature, then
# standardize along axis 0 (column-wise).
# NOTE: this rebinds `multi`, so re-running this cell without re-running the
# cell that builds `multi` would transpose the array a second time.
multi = sklearn.preprocessing.scale(multi.T, axis=0)



In [345]:
# Fit the t-SNE model (constructed with n_components=2) on the standardized
# feature matrix and keep the resulting embedding in `t`.
t = ts.fit_transform(multi)

In [346]:
# Scatter plot of the 2-D t-SNE embedding. The title and axis labels let the
# figure stand on its own when the notebook is skimmed.
p = figure(title='t-SNE embedding of standardized place features',
           x_axis_label='t-SNE component 1',
           y_axis_label='t-SNE component 2')
p.scatter(x=t[:, 0], y=t[:, 1])
show(p)