In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
## for correlation matrices
import seaborn as sns
%matplotlib inline
## for linear models
import statsmodels.api as sm
from pandas.plotting import scatter_matrix
from sklearn.metrics import roc_curve, auc

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from yellowbrick.classifier import ConfusionMatrix


from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn import preprocessing

from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier



%run ../pyfiles/data_cleaning.py
%run ../pyfiles/unmetDemand.py



In [2]:
df = pd.read_csv("../data/deepsolar_tract.csv", encoding = "utf-8")


In [3]:
df = drop_redundant_columns(df)

In [4]:
df = create_has_tiles_target_column(df)

In [5]:
df.columns

Index(['tile_count', 'fips', 'average_household_income', 'education_bachelor',
       'education_college', 'education_doctoral',
       'education_high_school_graduate', 'education_less_than_high_school',
       'education_master', 'education_population',
       ...
       'incentive_residential_state_level',
       'incentive_nonresidential_state_level', 'net_metering', 'feedin_tariff',
       'cooperate_tax', 'property_tax', 'sales_tax', 'rebate',
       'avg_electricity_retail_rate', 'has_tiles'],
      dtype='object', length=132)

In [6]:
df = df.dropna()
df = df.drop('tile_count', axis = 1)

In [7]:
# Define X and y
y = df['has_tiles']
X = df.drop('has_tiles', axis = 1)
# Split the data into training and test sets
# from sklearn.model_selection import train_test_split

#ADD STRATIFIED HERE
# Also Try Oversampling and Undersampling

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)
#print(y_train.value_counts(),'\n\n', y_test.value_counts())

# SMOTE
smt = SMOTE()
X_train, y_train = smt.fit_sample(X_train, y_train)
print(y_train.value_counts(),'\n\n', y_test.value_counts())

1    31209
0    31209
Name: has_tiles, dtype: int64 

 1    10392
0     3133
Name: has_tiles, dtype: int64


In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [34]:
pipe = Pipeline([('classifier', RandomForestClassifier())])

search_space = [
#                 {'classifier': [DecisionTreeClassifier()],
#                  'classifier__max_depth': [2, 3, 4, 5, 8],
#                  'classifier__min_samples_split': [2, 5, 10, 15, 100],
#                 'classifier__min_samples_leaf':[1, 2, 5, 10]}
              {'classifier': [RandomForestClassifier()],
                 'classifier__n_estimators': [10, 100, 1000],
                 'classifier__max_depth': [5, 8, 15, 25, 30], 
                 'classifier__min_samples_split':[1, 2, 5, 10, 15, 100],
                 'classifier__min_samples_leaf': [1, 2, 5, 10],
                 'classifier__max_features': ['log2', 'sqrt']}
    
    #Change from None to 20 for max features
    
#                {'classifier': [SVC()],
#                'classifier__C': [.001, .01, 0.1, 1, 10, 100, 1000],
#                    'classifier__gamma': ['auto','scale'],
#                    'classifier__class_weight':['balanced', None]},
#                {'classifier':[KNeighborsClassifier()],
#                    'classifier__n_neighbors':[2,4,8,16],
#                    'classifier__p':[2,3]}
               ]

clf = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs = -1)
#Added n_jobs above to use all processors (to speed up grid search)

best_model = clf.fit(X_train, y_train)

best_model.best_estimator_.get_params()['classifier']

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=2,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [9]:
parameters = {'penalty':['l1', 'l2'], 'C':[.001, .005, .0005, .0009]}
#clf = classifier
clf = GridSearchCV(LogisticRegression(solver='saga'), parameters, cv = 5)
clf.fit(X_train, y_train)



GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='saga',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.001, 0.005, 0.0005, 0.0009],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [10]:
clf.best_params_

{'C': 0.005, 'penalty': 'l2'}

ValueError: Expected array-like (array or non-string sequence), got None

In [11]:
best_clf=clf.best_estimator_

In [12]:
best_clf

LogisticRegression(C=0.005, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
falsePositives = isFalsePositive(df, X_test, y_test, best_clf)

In [14]:
buy = isOpportunityZone(falsePositives, scaler = scaler)

In [15]:
buy

Unnamed: 0,key_0,fips,average_household_income,education_bachelor,education_college,education_doctoral,education_high_school_graduate,education_less_than_high_school,education_master,education_population,...,cooperate_tax,property_tax,sales_tax,rebate,avg_electricity_retail_rate,State,County,Census_Tract_Number,Tract_Type,ACS_Data_Source
0,27047180600,2.704718e+10,44522.559294,196.0,1037.0,8.000000e+00,946.0,541.0,48.0,2833.0,...,0.0,25.0,12.0,0.0,9.46,Minnesota,Freeborn,27047180600,Low-Income Community,2011-2015
1,33017080203,3.301708e+10,66078.894472,243.0,95.0,7.500000e+01,104.0,13.0,237.0,789.0,...,0.0,41.0,0.0,8.0,15.08,New Hampshire,Strafford,33017080203,Low-Income Community,2011-2015
2,51670820600,5.167082e+10,46826.293996,238.0,1074.0,3.552714e-15,1122.0,799.0,83.0,3325.0,...,0.0,40.0,0.0,0.0,9.12,Virginia,Hopewell,51670820600,Low-Income Community,2011-2015
3,36063020500,3.606302e+10,25052.004111,110.0,326.0,2.200000e+01,605.0,438.0,24.0,1536.0,...,0.0,40.0,12.0,7.0,15.32,New York,Niagara,36063020500,Low-Income Community,2011-2015
4,48439106201,4.843911e+10,37826.919901,42.0,715.0,3.552714e-15,972.0,826.0,24.0,2586.0,...,0.0,36.0,0.0,0.0,8.66,Texas,Tarrant,48439106201,Low-Income Community,2011-2015
5,48061012102,4.806101e+10,33427.402597,71.0,635.0,6.000000e+00,822.0,1498.0,15.0,3058.0,...,0.0,36.0,0.0,0.0,8.66,Texas,Cameron,48061012102,Low-Income Community,2011-2015
6,44007015200,4.400702e+10,24558.085382,166.0,340.0,3.200000e+01,434.0,855.0,70.0,1907.0,...,16.0,37.0,12.0,0.0,15.02,Rhode Island,Providence,44007015200,Low-Income Community,2011-2015
7,6029005507,6.029006e+09,63188.167939,445.0,1766.0,3.552714e-15,1474.0,558.0,221.0,4479.0,...,0.0,38.0,0.0,10.0,14.72,California,Kern,6029005507,Low-Income Community,2011-2015
8,26125175000,2.612518e+10,49245.068394,209.0,446.0,3.552714e-15,976.0,346.0,98.0,2075.0,...,0.0,0.0,0.0,0.0,11.00,Michigan,Oakland,26125175000,Low-Income Community,2011-2015
9,37179020602,3.717902e+10,53136.660389,326.0,715.0,2.900000e+01,885.0,1041.0,95.0,3115.0,...,0.0,9.0,0.0,0.0,9.26,North Carolina,Union,37179020602,Low-Income Community,2011-2015


In [None]:
falsePositives.columns