In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
## for correlation matrices
import seaborn as sns
%matplotlib inline
## for linear models
import statsmodels.api as sm
from pandas.plotting import scatter_matrix
from sklearn.metrics import roc_curve, auc

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from yellowbrick.classifier import ConfusionMatrix


from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn import preprocessing

from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report





In [4]:
# Must download from http://web.stanford.edu/group/deepsolar/deepsolar_tract.csv and delete the first ","
df = pd.read_csv("../data/deepsolar_tract.csv", encoding = "utf-8")


In [5]:
# Remove all deepsolar inputs

df = df.drop(columns=['solar_system_count'], axis = 1)
df = df.drop(columns=['total_panel_area'], axis = 1)
df = df.drop(columns=['solar_panel_area_per_capita'], axis =1)

df = df.drop(columns=['solar_panel_area_divided_by_area'], axis = 1)
df = df.drop(columns=['tile_count_residential'], axis = 1)
df = df.drop(columns=['tile_count_nonresidential'], axis = 1)
df = df.drop(columns=['solar_system_count_residential'], axis =1)

df = df.drop(columns=['solar_system_count_nonresidential'], axis = 1)
df = df.drop(columns=['total_panel_area_residential'], axis = 1)
df = df.drop(columns=['total_panel_area_nonresidential'], axis = 1)
df = df.drop(columns=['number_of_solar_system_per_household'], axis =1)

# Remove all unique identifiers, objects, and booleans
df = df.drop(columns=['county',
                      'state',
                     'electricity_price_transportation',
                     'voting_2016_dem_win',
                     'voting_2012_dem_win',
                     'fips'])



In [6]:
# tilesLargerThanZero = range(1,4469)
df['has_tiles'] = (df.tile_count > 0).mul(1)

In [7]:
df = df.drop('tile_count', axis = 1)

In [8]:
df = df.dropna()

# Balance

In [9]:
# Define X and y
y = df['has_tiles']
X = df.drop('has_tiles', axis = 1)
# Split the data into training and test sets
# from sklearn.model_selection import train_test_split

#ADD STRATIFIED HERE

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)
#print(y_train.value_counts(),'\n\n', y_test.value_counts())

# SMOTE
smt = SMOTE()
X_train, y_train = smt.fit_sample(X_train, y_train)
print(y_train.value_counts(),'\n\n', y_test.value_counts())

1    31209
0    31209
Name: has_tiles, dtype: int64 

 1    10392
0     3133
Name: has_tiles, dtype: int64


# Scale

In [10]:
# col_names = list(df.columns)
# scaler = preprocessing.MinMaxScaler()
# Xtrain = scaler.fit_transform(X_train)
# Xtest = scaler.transform(X_test)

scaler = StandardScaler()
scaler.fit(X_train.fillna(0))


StandardScaler(copy=True, with_mean=True, with_std=True)

In [14]:
type(X_train)

pandas.core.frame.DataFrame

# Feature Selection

In [None]:
sel_ = SelectFromModel(LogisticRegression(C=.0002, solver = 'saga', penalty='l1'))
sel_.fit(X_train, y_train)

In [None]:
selected_feat= X.columns[(sel_.get_support())]

In [None]:
selected_feat

# OR

In [11]:
sel_ = SelectFromModel(LogisticRegression(C=.001, solver = 'saga', penalty='l1'))
sel_.fit(scaler.transform(X_train.fillna(0)), y_train)

SelectFromModel(estimator=LogisticRegression(C=0.001, class_weight=None,
                                             dual=False, fit_intercept=True,
                                             intercept_scaling=1, l1_ratio=None,
                                             max_iter=100, multi_class='auto',
                                             n_jobs=None, penalty='l1',
                                             random_state=None, solver='saga',
                                             tol=0.0001, verbose=0,
                                             warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [12]:
remaining_feats = X_train.columns[(sel_.estimator_.coef_ != 0).ravel().tolist()]
remaining_feats

Index(['average_household_income', 'education_bachelor',
       'education_population', 'population', 'population_density',
       'race_two_more', 'education_high_school_graduate_rate',
       'education_bachelor_rate', 'education_master_rate',
       'education_professional_school_rate', 'heating_fuel_coal_coke_rate',
       'heating_fuel_other_rate', 'electricity_price_industrial',
       'electricity_consume_commercial', 'electricity_consume_total',
       'housing_unit_median_gross_rent', 'lon', 'heating_design_temperature',
       'relative_humidity', 'age_more_than_85_rate',
       'occupation_construction_rate', 'occupation_administrative_rate',
       'occupation_arts_rate', 'occupation_agriculture_rate',
       'occupancy_vacant_rate', 'mortgage_with_rate',
       'transportation_bicycle_rate', 'travel_time_less_than_10_rate',
       'travel_time_10_19_rate', 'travel_time_40_59_rate',
       'travel_time_average', 'incentive_count_nonresidential',
       'incentive_residentia