In [1]:
from glob import glob
import pandas as pd
import numpy as np
import requests

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, recall_score, precision_score, accuracy_score, f1_score,roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.utils.class_weight import compute_sample_weight

import matplotlib.pyplot as plt
import geopandas as gpd

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

In [3]:
full = pd.read_csv('full.csv')
full.head()

Unnamed: 0.1,Unnamed: 0,bin,roofarea,svf,numbldgs,assesstot,bldgarea,comarea,yearbuilt_1900_1940,yearbuilt_1940_1980,...,pctLatino,pctUnder18,pctOver62,solar,approved1_block,approved2_block,approved5_block,since_yearalter1,since_yearalter2,popdensity
0,0,3000002.0,17683.475106,0.769877,1.0,17318250.0,170493.0,163894.0,0,0,...,6.4,20.1,3.2,0,0,0,0,27.0,6.0,0.000835
1,180,3413929.0,9431.126565,0.548905,1.0,12383114.0,102534.0,4686.0,0,0,...,6.4,20.1,3.2,0,0,0,0,2021.0,2021.0,0.000835
2,1,3000007.0,8192.77271,0.730845,1.0,371250.0,9585.0,9585.0,0,0,...,6.4,20.1,3.2,0,0,0,0,2021.0,2021.0,0.000835
3,2,3000010.0,43459.105586,0.729904,1.0,30760650.0,500000.0,500000.0,1,0,...,6.4,20.1,3.2,0,0,0,0,22.0,2021.0,0.000835
4,3,3000011.0,19246.260335,0.827548,1.0,1474200.0,21735.0,21735.0,0,1,...,6.4,20.1,3.2,0,0,0,0,9.0,2021.0,0.000835


In [4]:
full.dtypes

Unnamed: 0               int64
bin                    float64
roofarea               float64
svf                    float64
numbldgs               float64
assesstot              float64
bldgarea               float64
comarea                float64
yearbuilt_1900_1940      int64
yearbuilt_1940_1980      int64
yearbuilt_1980_2000      int64
yearbuilt_2000_2010      int64
yearbuilt_2010_2015      int64
yearbuilt_2015_2020      int64
landuse_2.0              int64
landuse_3.0              int64
landuse_4.0              int64
landuse_5.0              int64
landuse_6.0              int64
landuse_7.0              int64
landuse_8.0              int64
GEOID                    int64
tractarea              float64
pctBachelors           float64
medianIncome             int64
pctWhite               float64
pctAfricanAmerican     float64
pctAsian               float64
pctLatino              float64
pctUnder18             float64
pctOver62              float64
solar                    int64
approved

In [5]:
for i in range(2,8):
    print('------------------')
    print(len(full[full[f'landuse_{i}.0'] == 1]))
    print(len(full[(full[f'landuse_{i}.0'] == 1) & (full.solar == 1)]))

------------------
63002
780
------------------
3050
62
------------------
22645
75
------------------
4864
21
------------------
3094
45
------------------
1056
3


In [6]:
features = full.columns.difference(['solar','GEOID','Unnamed: 0','bin','since_yearalter1','since_yearalter2'])
features

Index(['approved1_block', 'approved2_block', 'approved5_block', 'assesstot',
       'bldgarea', 'comarea', 'landuse_2.0', 'landuse_3.0', 'landuse_4.0',
       'landuse_5.0', 'landuse_6.0', 'landuse_7.0', 'landuse_8.0',
       'medianIncome', 'numbldgs', 'pctAfricanAmerican', 'pctAsian',
       'pctBachelors', 'pctLatino', 'pctOver62', 'pctUnder18', 'pctWhite',
       'popdensity', 'roofarea', 'svf', 'tractarea', 'yearbuilt_1900_1940',
       'yearbuilt_1940_1980', 'yearbuilt_1980_2000', 'yearbuilt_2000_2010',
       'yearbuilt_2010_2015', 'yearbuilt_2015_2020'],
      dtype='object')

In [7]:
X = full[features]
y = full['solar']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=0)


In [8]:
svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("linear_svc", LinearSVC(C=1, loss="hinge", random_state=42)),
])

In [9]:
weights = compute_sample_weight('balanced',y_train)
weights

array([0.50999797, 0.50999797, 0.50999797, ..., 0.50999797, 0.50999797,
       0.50999797])

In [10]:
svm_clf.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('linear_svc', LinearSVC(C=1, loss='hinge', random_state=42))])

In [11]:
y_pred = svm_clf.predict(X_test)

In [12]:
count = 0
for solar in y_test:
    if solar == 1:
        count = count+1
        
count

1269

In [7]:
svm_p = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", svm.SVC(kernel="linC=1,random_state=42, class_weight='balanced',kernel='linear')),
])

In [None]:
svm_p.fit(X_train, y_train)

In [None]:
y_pred = svm_p.predict(X_test)

In [57]:
from matplotlib import pyplot as plt


In [14]:
def binary_classification_performance(y_test, y_pred, weights=None):
    tp, fp, fn, tn = confusion_matrix(y_test, y_pred, sample_weight=weights).ravel()
    accuracy = round(accuracy_score(y_pred = y_pred, y_true = y_test),2)
    precision = round(precision_score(y_pred = y_pred, y_true = y_test),2)
    recall = round(recall_score(y_pred = y_pred, y_true = y_test),2)
    f1_score = round(2*precision*recall/(precision + recall),2)
    specificity = round(tn/(tn+fp),2)
    npv = round(tn/(tn+fn),2)
    auc_roc = round(roc_auc_score(y_score = y_pred, y_true = y_test),2)


    result = pd.DataFrame({'Accuracy' : [accuracy],
                         'Precision (or PPV)' : [precision],
                         'Recall (senitivity or TPR)' : [recall],
                         'f1 score' : [f1_score],
                         'AUC_ROC' : [auc_roc],
                         'Specificty (or TNR)': [specificity],
                         'NPV' : [npv],
                         'True Positive' : [tp],
                         'True Negative' : [tn],
                         'False Positive':[fp],
                         'False Negative':[fn]})
    return result


In [105]:
test_weights = compute_sample_weight('balanced',y_test)


In [15]:
res = binary_classification_performance(y_test, y_pred)

In [84]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [107]:
res_weighted

Unnamed: 0,Accuracy,Precision (or PPV),Recall (senitivity or TPR),f1 score,AUC_ROC,Specificty (or TNR),NPV,True Positive,True Negative,False Positive,False Negative
0,0.91,0.15,0.78,0.25,0.85,0.9,0.78,30659.489671,26099.255319,2795.010329,7355.244681


In [16]:
res

Unnamed: 0,Accuracy,Precision (or PPV),Recall (senitivity or TPR),f1 score,AUC_ROC,Specificty (or TNR),NPV,True Positive,True Negative,False Positive,False Negative
0,0.99,1.0,0.63,0.77,0.81,1.0,0.63,65640,797,0,472
