In [668]:
import pickle

import pandas as pd
import sklearn
from scipy.stats import pearsonr
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [669]:
# Build per-neighborhood tree counts from the cleaned tree inventory

# common_name values for stump and vacant-site entries, which are excluded from the counts
non_tree_labels = ['Stump', 'Vacant Site Large', 'Vacant Site Medium', 'Vacant Site Not Suitable', 'Vacant Site Small']

# load the cleaned tree data, then discard the excluded rows and the columns not used below
tree_data = pd.read_csv("cleaned_data/cleaned_tree_data_5.csv", low_memory=False)
is_non_tree = tree_data['common_name'].isin(non_tree_labels)
tree_data = tree_data.loc[~is_non_tree].drop(columns=['address_number', 'street', 'common_name'])

# number of (non-null) ids recorded in each neighborhood
tree_info = tree_data.groupby('neighborhood')['id'].count().to_frame('tree_count')
tree_info.head()

Unnamed: 0_level_0,tree_count
neighborhood,Unnamed: 1_level_1
Allegheny Center,861
Allegheny West,368
Allentown,60
Arlington,82
Arlington Heights,4


In [670]:
# 2015 education data
# LINK: https://data.wprdc.org/dataset/pittsburgh-american-community-survey-2015-miscellaneous-data/resource/12535b2e-6180-4cdf-b7d8-ec5294259e49

# source columns used below
total_col = 'Estimate; Total:'
diploma_col = 'Estimate; Total: - Regular high school diploma'
bachelor_col = 'Estimate; Total: - Bachelor\'s degree'
master_col = 'Estimate; Total: - Master\'s degree'

# load data
education_data = pd.read_csv("raw_data/educational-attainment-for-the-population-25-years-and-over.csv")

# Shares are accumulated from the highest degree downwards: a master's holder is also
# counted as having a bachelor's degree, and a bachelor's holder as having a diploma.
education_data = (
    education_data[['Neighborhood', total_col, diploma_col, bachelor_col, master_col]]
    .assign(
        per_master=lambda df: df[master_col] / df[total_col],
        per_bachelor=lambda df: df["per_master"] + df[bachelor_col] / df[total_col],
        per_diploma=lambda df: df["per_bachelor"] + df[diploma_col] / df[total_col],
    )
    # only the derived shares are kept; the raw estimates are no longer needed
    .drop(columns=[total_col, diploma_col, bachelor_col, master_col])
    .rename(columns={"Neighborhood": "neighborhood"})
    .groupby('neighborhood')
    .sum()
)

education_data.head(3)

Unnamed: 0_level_0,per_master,per_bachelor,per_diploma
neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Allegheny Center,0.07984,0.353293,0.469062
Allegheny West,0.112628,0.412969,0.535836
Allentown,0.022194,0.066582,0.433735


In [671]:
# 2010 area data
# LINK: https://data.wprdc.org/dataset/neighborhoods-with-snap-data/resource/bce22c26-9d3e-4e3f-8405-a35c4b7765b6

# raw SNAP column -> name used in the rest of the notebook
# (the key order is also the column order of the loaded frame)
snap_column_names = {
    'Street_Density__st__mi_area_sq_': "street_density",
    'Park_Space__acres_1000_pers__': "park_space_density",
    'SNAP_All_csv__Part_1__Major_Cri': "major_crime",  # divided by population below
    'SNAP_All_csv_2009_Median_Income': 'median_income',
    'Total___Jobs_Located_in_N_hood_': "total_neighborhood_jobs",  # divided by area below
    'Neighborhood_2010_HOOD': "neighborhood",
    'Neighborhood_2010_SQMILES': "neighborhood_area",
    'Pop__2010': "population_2010",
    'Est__Pop__Under_Poverty__2010_': "population_under_poverty_2010",
    'SNAP_All_csv_Residential': "per_residential_area",
    'SNAP_All_csv_Mixed_Use___Indust': "per_industrial_area",
    'SNAP_All_csv_Mixed_Use___Commer': "per_commercial_area",
    'SNAP_All_csv_Median_Home__Value': "median_home_value",
}

# load only the columns of interest and give them readable names
area_data = pd.read_csv("raw_data/Neighborhoods_with_SNAP_Data.csv")[list(snap_column_names)]
area_data = area_data.rename(columns=snap_column_names)

# these columns arrive as strings: strip everything except digits and dots, then cast to float
numeric_data = ['per_residential_area', 'per_commercial_area', 'per_industrial_area', 'median_home_value']
for col in numeric_data:
    stripped = area_data[col].str.replace(r'[^\d\.]+', '', regex = True)
    area_data[col] = stripped.astype(float)

# ratios relative to population or to neighborhood area
area_data = area_data.assign(
    per_population_under_poverty_2010=lambda df: df['population_under_poverty_2010'] / df['population_2010'],
    population_density=lambda df: df['population_2010'] / df['neighborhood_area'],
    per_major_crime=lambda df: df['major_crime'] / df['population_2010'],
    per_total_neighborhood_jobs=lambda df: df['total_neighborhood_jobs'] / df['neighborhood_area'],
)

# the raw totals are replaced by their ratios; then index the table by neighborhood
area_data = area_data.drop(columns=['total_neighborhood_jobs', 'major_crime']).groupby('neighborhood').sum()

area_data.head(3)

Unnamed: 0_level_0,street_density,park_space_density,median_income,neighborhood_area,population_2010,population_under_poverty_2010,per_residential_area,per_industrial_area,per_commercial_area,median_home_value,per_population_under_poverty_2010,population_density,per_major_crime,per_total_neighborhood_jobs
neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Allegheny Center,19.7,59.2,20911,0.21,933,324,0.0,0.0,49.4,136300.0,0.347267,4442.857143,0.076099,25761.904762
Allegheny West,34.5,15.5,41761,0.141,462,12,42.1,6.3,12.5,123600.0,0.025974,3276.595745,0.082251,9177.304965
Allentown,41.8,15.8,29274,0.295,2500,630,54.6,0.0,9.3,42200.0,0.252,8474.576271,0.0636,1203.389831


In [672]:
# join the three per-neighborhood tables on their shared neighborhood index
complete_data = tree_info.join(education_data).join(area_data)

# replace each raw tree measure with its value divided by neighborhood_area
tree_measures = ['tree_count']
for measure in tree_measures:
    per_area = complete_data[measure] / complete_data['neighborhood_area']
    complete_data = complete_data.assign(**{'area_norm_' + measure: per_area}).drop(columns=[measure])

In [673]:
# summary statistics for every column of the combined table
complete_data.describe()

Unnamed: 0,per_master,per_bachelor,per_diploma,street_density,park_space_density,median_income,neighborhood_area,population_2010,population_under_poverty_2010,per_residential_area,per_industrial_area,per_commercial_area,median_home_value,per_population_under_poverty_2010,population_density,per_major_crime,per_total_neighborhood_jobs,area_norm_tree_count
count,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0
mean,0.096769,0.259648,0.523788,28.963333,13.273333,36853.933333,0.586522,3396.711111,704.144444,48.216667,11.544444,5.928889,84674.722222,0.338096,6238.813926,0.084832,6649.236365,806.922695
std,0.085053,0.158884,0.113107,10.767246,28.463344,21000.256457,0.460925,3233.349525,653.354293,24.391826,17.683752,8.006602,58970.506301,0.895953,4920.893845,0.251376,17588.111011,771.465849
min,0.0,0.0,0.041322,5.6,0.0,0.0,0.103,11.0,0.0,0.0,0.0,0.0,0.0,0.0,28.720627,0.002994,15.11335,0.571429
25%,0.043323,0.139113,0.465203,21.85,1.225,25349.75,0.27925,1008.25,186.75,32.35,0.0,0.925,51362.5,0.114464,3382.916229,0.022844,776.138146,258.201604
50%,0.0736,0.25055,0.535775,28.7,3.75,33436.5,0.431,2462.5,511.5,49.3,3.55,3.3,72800.0,0.207,5216.912351,0.03993,1448.046265,573.373573
75%,0.118116,0.363463,0.587193,35.275,10.525,41693.25,0.76825,4396.5,1046.25,65.725,14.15,8.025,98275.0,0.322414,7463.65731,0.058931,6358.665148,1243.862628
max,0.5,0.619048,1.0,71.8,198.1,150250.0,2.671,15110.0,2897.0,94.0,80.4,49.4,344900.0,8.363636,26160.17316,1.947368,149289.0625,4100.0


In [674]:
# save processed file to csv; index = True also writes the row index (neighborhood) as the first column
complete_data.to_csv("cleaned_data/ml_data.csv", index = True)

In [675]:
from sklearn import tree
import numpy as np

In [676]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import svm

In [677]:
# second name for the same DataFrame (plain assignment, no copy is made)
c_data = complete_data

In [678]:
# column the model is trained to predict; every other column is used as a feature
predictor_col = 'area_norm_tree_count'

feature_frame = c_data.drop(columns=predictor_col)
X_labels = feature_frame.columns  # kept so array columns can be mapped back to names
X = np.array(feature_frame)
y = np.array(c_data[predictor_col])

In [679]:
# keep only the features whose absolute Pearson correlation with y exceeds the cutoff
min_abs_corr = 0.35
X_final = []
for label, column in zip(X_labels, X.T):
    corr, _p_value = pearsonr(column, y)
    if np.abs(corr) > min_abs_corr:
        # report the feature name, its correlation and the first five values
        print(label, " ", corr, column[:5])
        X_final.append(column)
    

per_master   0.3728903692867985 [0.07984032 0.11262799 0.02219404 0.01929012 0.        ]
per_bachelor   0.48067450528900596 [0.35329341 0.41296928 0.06658212 0.09182099 0.        ]
per_commercial_area   0.4961887386378545 [49.4 12.5  9.3  3.8  0. ]
median_home_value   0.4763690106131646 [136300. 123600.  42200.  44200.  64400.]
population_density   0.4561757212498415 [4442.85714286 3276.59574468 8474.57627119 3976.59574468 1848.48484848]


In [680]:
# X_final holds one array per selected feature; stack them and transpose
# so that rows are samples and columns are features again
selected_columns = np.array(X_final)
X = selected_columns.T

In [681]:
# (rows, selected features) of the reduced feature matrix
X.shape

(90, 5)

In [682]:
# Estimate how well a scaled ridge regression generalizes, using k-fold cross-validation.
# NOTE(review): the columns of X were selected using correlations with y computed on all
# rows, so the held-out folds already influenced feature selection; the test scores
# below are therefore likely optimistic.
num_folds = 6
kf = KFold(n_splits=num_folds)

# Despite the names, these hold R^2 values: Pipeline.score delegates to the final
# estimator's score, which for Ridge is the coefficient of determination.
test_acc = list()
train_acc = list()

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # A fresh pipeline per fold, so the scaler statistics and the ridge coefficients
    # are learned from this fold's training rows only.
    # StandardScaler and Ridge are imported by name: attribute access through the bare
    # `sklearn` package only works when some other import has already loaded the
    # `preprocessing` / `linear_model` submodules.
    pipe = Pipeline([('scaler', StandardScaler()), ('ridge', Ridge())])

    pipe.fit(X_train, y_train)
    train_acc.append(pipe.score(X_train, y_train))
    test_acc.append(pipe.score(X_test, y_test))

# per-fold scores followed by their mean, first for training and then for held-out data
print(train_acc)
print(np.mean(train_acc))
print(test_acc)
print(np.mean(test_acc))

[0.5668639989978141, 0.5671625822468158, 0.5646928256704514, 0.6665512415326598, 0.575315506734001, 0.607013386613063]
0.5912665902991342
[0.48498861945370986, 0.6122299685726893, 0.6142280275363197, -0.33221712429086336, 0.5917811327035983, 0.001142345473798323]
0.3286921615748753


In [683]:
# Fit the final scaler + ridge pipeline on all rows and persist it to disk.
# StandardScaler and Ridge are imported by name rather than reached through the bare
# `sklearn` package, which only exposes submodules that something else already loaded.
pipe = Pipeline([('scaler', StandardScaler()), ('ridge', Ridge())])
pipe.fit(X, y)

model_file = "model.pkl"
# The context manager closes (and flushes) the file even if pickling raises;
# the previous bare open() left the handle open.
with open(model_file, "wb") as model_fh:
    pickle.dump(pipe, model_fh)