In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
## for correlation matrices
import seaborn as sns
%matplotlib inline
## for linear models
import statsmodels.api as sm
from pandas.plotting import scatter_matrix
from sklearn.metrics import roc_curve, auc

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from yellowbrick.classifier import ConfusionMatrix

from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn import preprocessing

# from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report

%run ../pyfiles/data_cleaning.py

In [5]:
from imblearn.over_sampling import SMOTE

In [6]:
# Set global random seed
np.random.seed(123)

### Import data and drop redundant data (rates)

In [7]:
# import data
orig = pd.read_csv('../../data/deepsolar_tract.csv', encoding = 'latin-1')
orig.drop('Unnamed: 0', axis = 1, inplace = True)

# fips is a unique identifier
# Set fips as index 

orig.set_index(keys = 'fips', inplace = True)

### Add more data on opportunity zones

In [8]:
ozdf = pd.read_csv("../data/ListOfOppurtunityZonesWithoutAKorHI.csv", encoding = "utf-8")
ozdf = ozdf.rename(columns={"Census Tract Number": "Census_Tract_Number", "Tract Type": "Tract_Type", "ACS Data Source": "ACS_Data_Source"})

In [9]:
df = orig.merge(ozdf, left_on = orig.index, right_on = ozdf.Census_Tract_Number)

In [10]:
df = drop_redundant_columns(df)

In [11]:
# Create our target column 'has_tiles', and drop additional redundant columns

df = create_has_tiles_target_column(df)

In [12]:
# # Figure out which variables are highly correlated, remove the most correlated ones one by one

# corr = pd.DataFrame((df.corr() > 0.8).sum())
# corr.sort_values(by = 0, ascending = False)[0:5]

In [13]:
# # Add highly correlated variables to list 'to_drop'
# to_drop = ['poverty_family_count','education_population','population', 'household_count','housing_unit_occupied_count', 'electricity_price_overall']

In [14]:
# # Drop highly colinear variables
# df = df.drop(to_drop, axis = 1)

### Checking for missing values

In [80]:
nulls = pd.DataFrame(df.isna().sum())
nulls.columns = ["missing"]
nulls[nulls['missing']>0].head()

Unnamed: 0,missing
average_household_income,886
gini_index,847
land_area,24
per_capita_income,643
population_density,316


In [15]:
# drop all missing values
df = df.dropna(axis = 0)

### Train test split

In [16]:
X = df.drop('has_tiles', axis = 1)
y = df['has_tiles']

In [19]:
y.value_counts()

1    4505
0    1381
Name: has_tiles, dtype: int64

In [17]:
X_train, y_train, x_test, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

### SMOTE!

In [18]:
smt = SMOTE()
X_train, y_train = smt.fit_sample(X_train, y_train)
print(y_train.value_counts(),'\n\n', y_test.value_counts())

TypeError: '<' not supported between instances of 'str' and 'int'

### Scale Data 

### Steps for Modeling


Gridsearch:
Try -  All our variables in logisitic regression
 - params: C and penalty
 
Gridsearch:
Try - All variables in decision trees
 - params: max_depth (5, 8, 15, 25, 30), min_samples_split (1, 2, 5, 10, 15, 100), min_samples_leaf (1, 2, 5, 10)
 - max_depth : how deep the tree is
 - min_samples_split: minimum number of samples required to split an internal node
 - min_samples_leaf: he minimum number of samples that we want a leaf node to contain


Try - Random forest
 - params: n_estimators (120, 300, 500, 800), max_depth(5, 8, 15, 25, 30), min_samples_split(1, 2, 5, 10, 15, 100), min_samples_leaf(1, 2, 5, 10), max_features (log2, sqrt, None)

Try - SVM
 - params: C(.001, .01, 0.1, 1, 10, 100, 1000), gamma ('auto','scale'), class_weight ('balanced', None)

Try - KNN
 - params: n_neighbors (2, 4, 8, 16), p (2,3)

Then try all again with PCA


In [12]:
from sklearn.pipeline import Pipeline

In [None]:
# Decision Tree Classifier
parameters = {'max_depth':[5, 8, 15, 25, 30], 'min_samples_split':[1, 2, 5, 10, 15, 100], 'min_samples_leaf':[1, 2, 5, 10]}
#clf = classifier
dt = GridSearchCV(DecisionTreeClassifier(), parameters, cv = 5)
dt.fit(X_train, y_train)