In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_regression, chi2, mutual_info_regression, RFECV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor

In [99]:
# Read the training features, the training target and the test features
# from their CSV files, in that order.
x_train, y_train, x_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'x_train_high_correlation_removed.csv',
        'y_train.csv',
        'x_test.csv',
    )
)

In [100]:
# Flatten the target frame into a 1-D float array for use as `y` in the
# estimator `fit` calls below.
# `ravel()` walks the values in the same row-major order as the nested Python
# loop it replaces, but without iterating element by element in Python and
# without building an intermediate list.
y_train_array = y_train.astype(float).to_numpy()
y_train_array_flat = y_train_array.ravel()

In [101]:
def get_scores(x_best, algo_name):
    """Plot the `k` highest feature scores of a fitted selector.

    Parameters
    ----------
    x_best : object exposing a `scores_` attribute with one score per column
        of the notebook-level `x_train` frame.
    algo_name : str
        Name of the scoring method, used in the printed heading.

    Notes
    -----
    Reads the notebook-level names `x_train` (column labels) and `k` (number
    of bars). `k` is looked up when the function is called, not when the
    selector was fitted.
    """
    scores_by_feature = pd.Series(x_best.scores_, index = x_train.columns)
    top_scores = scores_by_feature.nlargest(k)
    # Re-sort ascending before drawing the horizontal bars.
    top_scores = top_scores.sort_values(ascending = True)
    plt.figure(figsize=(10,10))
    top_scores.plot.barh()
    print('\n Top',k,'features selected by',algo_name,'algorithm:')
    plt.show()

In [102]:
# Fit one univariate selector per scoring function, each keeping the `k`
# best-scoring features.
# k is set to 25 for the initial analysis.
k = 25
x_best_f_regression = SelectKBest(f_regression, k = k).fit(x_train, y_train_array_flat)
# NOTE(review): chi2 is a classification score that treats every distinct
# target value as its own class. With a float target this may be what raised
# the MemoryError recorded for this cell -- TODO confirm.
x_best_chi2 = SelectKBest(chi2, k = k).fit(x_train, y_train_array_flat)
# mutual_info_regression is randomised, so the seed is pinned to keep the
# scores (and therefore the selected features) identical between runs.
# 13 is the seed already used for the random forest in this notebook.
x_best_mutual = SelectKBest(
    lambda X, y: mutual_info_regression(X, y, random_state = 13), k = k
).fit(x_train, y_train_array_flat)

MemoryError: 

In [None]:
# Bar chart of the `k` highest f_regression scores from the fitted selector.
get_scores(x_best_f_regression, 'f-regression')

In [None]:
# Bar chart of the `k` highest chi2 scores from the fitted selector.
get_scores(x_best_chi2, 'chi2')

In [None]:
# Bar chart of the `k` highest mutual-information scores from the fitted selector.
get_scores(x_best_mutual, 'mutual info regression')

In [None]:
# Calculate feature importances using an extra-trees classifier.
# The trees are randomised, so the seed is pinned: without it the importance
# ranking, and any feature subset derived from it, can change between runs.
# NOTE(review): this is a classifier fitted on a float-cast target -- confirm
# the target values are meant to be treated as class labels.
tree_model = ExtraTreesClassifier(random_state=13)
tree_model.fit(x_train, y_train_array_flat)

In [None]:
def plot_extra_trees():
    """Plot the `k` largest feature importances of the fitted `tree_model`.

    Takes no arguments; reads the notebook-level names `tree_model`,
    `x_train` (column labels) and `k` (number of bars) at call time.
    """
    importance_by_feature = pd.Series(tree_model.feature_importances_, index = x_train.columns)
    top_importances = importance_by_feature.nlargest(k)
    # Re-sort ascending before drawing the horizontal bars.
    top_importances = top_importances.sort_values(ascending = True)
    plt.figure(figsize=(10,10))
    top_importances.plot.barh()
    print('\n Top',k,'features selected by extra trees algorithm:')
    plt.show()

In [None]:
# Display the `k` largest extra-trees feature importances.
plot_extra_trees()

In [None]:
# Random-forest regressor used as a further source of feature importances.
# `criterion` is left at its default, which is mean squared error in every
# scikit-learn release. The explicit 'mse' spelling was deprecated in 1.0 and
# removed in 1.2, where passing it makes this cell fail.
forest = RandomForestRegressor(n_estimators=500, random_state=13, n_jobs=1)
forest.fit(x_train, y_train_array_flat)

In [None]:
def plot_random_forest():
    """Plot the `k` largest feature importances of the fitted `forest`.

    Takes no arguments; reads the notebook-level names `forest`, `x_train`
    (column labels) and `k` (number of bars) at call time.
    """
    importances = pd.Series(forest.feature_importances_, index = x_train.columns)
    importances = importances.nlargest(k).sort_values(ascending = True )
    plt.figure(figsize=(10,10))
    importances.plot(kind = 'barh')
    # The heading names the random forest; it previously said "extra trees",
    # copied over from plot_extra_trees.
    print('\n Top',k,'features selected by random forest algorithm:')
    plt.show()

In [None]:
# From these four methods of feature selection it is clear that some features should definitely be used,
# while the ranking of others varies between methods.  A CSV of the top 10 features from each feature
# selection algorithm will be created for use with the prediction algorithms going forward.

In [None]:
# Refit one univariate selector per scoring function, this time keeping only
# the 10 best-scoring features of each method.
k = 10
x_best_f_regression = SelectKBest(f_regression, k = k).fit(x_train, y_train_array_flat)
# NOTE(review): chi2 is a classification score that treats every distinct
# target value as its own class -- confirm it is appropriate for this target.
x_best_chi2 = SelectKBest(chi2, k = k).fit(x_train, y_train_array_flat)
# mutual_info_regression is randomised, so the seed is pinned to keep the
# scores (and therefore the exported feature subset) identical between runs.
x_best_mutual = SelectKBest(
    lambda X, y: mutual_info_regression(X, y, random_state = 13), k = k
).fit(x_train, y_train_array_flat)

In [None]:
# Bar chart of the `k` highest f_regression scores from the refitted selector.
get_scores(x_best_f_regression, 'f-regression')

In [None]:
# Bar chart of the `k` highest chi2 scores from the refitted selector.
get_scores(x_best_chi2, 'chi2')

In [None]:
# Bar chart of the `k` highest mutual-information scores from the refitted selector.
get_scores(x_best_mutual, 'mutual info')

In [None]:
# Re-plot the extra-trees importances; `k` is read at call time, so this uses
# whatever value `k` currently holds.
plot_extra_trees()

In [None]:
def create_csv(x_best, algo_name, split, data):
    """Write the selected feature columns of `data` to a CSV file.

    Parameters
    ----------
    x_best : array-like of int
        Positional indices into the columns of the notebook-level `x_train`,
        e.g. the result of `selector.get_support(indices=True)`.
    algo_name : str
        Selection method name, used in the output filename.
    split : str
        Dataset split name (e.g. 'train' or 'test'), used in the output filename.
    data : pandas.DataFrame
        Frame to take the selected columns from.

    Raises
    ------
    KeyError
        If `data` lacks one of the selected column names.

    The file is written as 'x_<split>_<algo_name>.csv' without the index.
    """
    # The indices were computed against x_train, so translate them to column
    # names first and select by name. Selecting by position is only correct
    # when `data` has exactly the same column layout as x_train, which is not
    # guaranteed for a frame loaded from a different file.
    selected_columns = x_train.columns[x_best]
    x_selected = data.loc[:, selected_columns]
    x_selected.to_csv('x_'+split+'_' + algo_name + '.csv', index = False)

In [None]:
# Export the columns chosen by each univariate selector, first for the
# training split and then for the test split.
fitted_selectors = (
    (x_best_f_regression, 'f_regression'),
    (x_best_chi2, 'chi2'),
    (x_best_mutual, 'mutual_info'),
)
for split_name, frame in (('train', x_train), ('test', x_test)):
    for selector, label in fitted_selectors:
        create_csv(selector.get_support(indices=True), label, split_name, frame)

In [None]:
# Rank the features by extra-trees importance and keep the names of the `k`
# most important ones; the same names are used for both splits.
importance_by_feature = pd.Series(tree_model.feature_importances_, index = x_train.columns)
top_columns = importance_by_feature.nlargest(k).index
x_train_selected = x_train[top_columns]
x_train_selected.to_csv('x_train_extra_trees.csv', index = False)
x_test_selected = x_test[top_columns]
x_test_selected.to_csv('x_test_extra_trees.csv', index = False)

In [None]:
# It is noted that not many business-related variables are selected by these algorithms.
# This suggests there may be little potential in using them to make predictions; however,
# this cannot be known for certain without attempting it directly.  As such, two additional
# feature-selected datasets will be created.  Of the ten features in each, one will contain 50%
# business features and the other 100% business features.  Since the extra trees classifier ranked
# a business variable top, the results of this algorithm will be used to inform the manual
# construction of these datasets.

In [None]:
# Hand-picked feature set: five business features followed by five
# crime-report features, exported for both splits.
top_5_business_features = [
    'Number of businesses',
    'Last 28 days closures',
    'Last 14 days closures',
    'Last 7 days openings',
    'Last 7 days closures',
]
top_5_crime_features = [
    'Reports 1 day ago',
    'Reports 14 days ago',
    'Reports 3 days ago',
    'Reports 4 days ago',
    'Reports 2 days ago',
]
top_10 = np.concatenate([top_5_business_features, top_5_crime_features])
x_train_selected = x_train[top_10]
x_train_selected.to_csv('x_train_equal_crime_and_business.csv', index = False)
x_test_selected = x_test[top_10]
x_test_selected.to_csv('x_test_equal_crime_and_business.csv', index = False)

In [None]:
# Hand-picked feature set made up of business features only: the five listed
# in `top_5_business_features` plus five more, exported for both splits.
additional_5_business_features = [
    'Number of openings',
    'Openings 14 days ago',
    'Openings 365 days ago',
    'Closures 365 days ago',
    'Openings 30 days ago',
]
top_10 = np.concatenate([top_5_business_features, additional_5_business_features])
x_train_selected = x_train[top_10]
x_train_selected.to_csv('x_train_all_business.csv', index = False)
x_test_selected = x_test[top_10]
x_test_selected.to_csv('x_test_all_business.csv', index = False)

In [None]:
# Array of selection-method suffixes. The non-empty entries match the suffixes
# of the CSV files written by this notebook; the empty string presumably
# stands for the dataset without feature selection -- TODO confirm.
sel_methods = np.array([
    '',
    '_f_regression',
    '_chi2',
    '_mutual_info',
    '_extra_trees',
    '_equal_crime_and_business',
    '_all_business',
])

In [None]:
# Show the selection-method suffixes. The previous `arr` is not defined
# anywhere in this notebook and raised a NameError on a fresh kernel.
print(sel_methods)

In [None]:
# Open "Selection Methods" for binary writing; the handle is written to and
# closed in the following cells.
file = open("Selection Methods","wb")

In [None]:
# Serialise `sel_methods` to the open file handle with NumPy's `save`.
np.save(file,sel_methods)

In [None]:
# Actually call close(): the bare attribute reference `file.close` never ran
# the method, so the handle was never explicitly closed.
file.close()