In [110]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_regression, chi2, mutual_info_regression, RFECV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split

<h3> Open tuning test data and split into train and test sets </h3>

In [111]:
# Load the full tuning-test dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='tuning_test_data.csv')

In [112]:
# Features: every column except the target.
x = df.drop(columns=['Todays Reports'])
# Target: selected with a list so it stays a one-column DataFrame (not a Series),
# which keeps the column header when it is written to CSV later on.
y = df.loc[:, ['Todays Reports']]

In [113]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)
# Carve a smaller sample out of the training rows: 75% goes to the "remainder",
# so the sample keeps the other 25% of the training set.
x_sample, x_remainder, y_sample, y_remainder = train_test_split(
    x_train,
    y_train,
    test_size=0.75,
    random_state=0,
)

In [114]:
# Persist the target column of each split; index=False keeps the row index out of the files.
for target_frame, csv_name in (
    (y_train, 'y_train_tuning_tests.csv'),
    (y_test, 'y_test_tuning_tests.csv'),
    (y_sample, 'y_train_tuning_tests_sample.csv'),
):
    target_frame.to_csv(csv_name, index=False)

<h3> Create feature selected datasets from tuning test data </h3>

In [115]:
# Columns of the f_regression feature set (presumably the ten best-scoring features
# from an earlier selection run -- the ranking itself is not computed in this notebook).
top_10_f_regression = [
    'Reports 1 day ago',
    'Reports 2 days ago',
    'Reports 3 days ago',
    'Reports 4 days ago',
    'Reports 5 days ago',
    'Reports 6 days ago',
    'Reports 7 days ago',
    'Reports 14 days ago',
    'Reports 30 days ago',
    'Reports 365 days ago',
]

# Take the same column subset from the train, test and sample splits ...
x_train_selected, x_test_selected, x_sample_selected = [
    split[top_10_f_regression] for split in (x_train, x_test, x_sample)
]

# ... and write each subset to its own CSV without the row index.
for selected, csv_name in (
    (x_train_selected, 'x_train_tuning_tests_f_regression.csv'),
    (x_test_selected, 'x_test_tuning_tests_f_regression.csv'),
    (x_sample_selected, 'x_train_tuning_tests_f_regression_sample.csv'),
):
    selected.to_csv(csv_name, index=False)

In [116]:
# Columns of the chi2 feature set (presumably the ten best-scoring features from an
# earlier chi2 selection run -- the ranking itself is not computed in this notebook).
top_10_chi2 = [
    'South of Market',
    'Mission',
    'Tenderloin',
    'Number of businesses',
    'Downtown / Union Square',
    'Civic Center',
    'Reports 365 days ago',
    'Reports 1 day ago',
    'Reports 2 days ago',
    'Reports 14 days ago',
]

# Take the same column subset from the train, test and sample splits ...
x_train_selected, x_test_selected, x_sample_selected = [
    split[top_10_chi2] for split in (x_train, x_test, x_sample)
]

# ... and write each subset to its own CSV without the row index.
for selected, csv_name in (
    (x_train_selected, 'x_train_tuning_tests_chi2.csv'),
    (x_test_selected, 'x_test_tuning_tests_chi2.csv'),
    (x_sample_selected, 'x_train_tuning_tests_chi2_sample.csv'),
):
    selected.to_csv(csv_name, index=False)

In [117]:
# Columns of the AdaBoost feature set (presumably the ten most important features from
# an earlier AdaBoost run -- the ranking itself is not computed in this notebook).
top_10_ada = [
    'Reports 365 days ago',
    'Reports 1 day ago',
    'Reports 14 days ago',
    'Reports 3 days ago',
    'Reports 2 days ago',
    'Reports 7 days ago',
    'Number of businesses',
    'Reports 4 days ago',
    'Reports 5 days ago',
    'Closures 365 days ago',
]

# Take the same column subset from the train, test and sample splits ...
x_train_selected, x_test_selected, x_sample_selected = [
    split[top_10_ada] for split in (x_train, x_test, x_sample)
]

# ... and write each subset to its own CSV without the row index.
for selected, csv_name in (
    (x_train_selected, 'x_train_tuning_tests_adaboost.csv'),
    (x_test_selected, 'x_test_tuning_tests_adaboost.csv'),
    (x_sample_selected, 'x_train_tuning_tests_adaboost_sample.csv'),
):
    selected.to_csv(csv_name, index=False)

In [118]:
# Five business-related columns; this list is reused by the all-business feature set.
top_5_business_features = [
    'Number of businesses',
    'Last 28 days closures',
    'Last 7 days openings',
    'Last 14 days closures',
    'Last 7 days closures',
]
# Five report-history columns.
top_5_crime_features = [
    'Reports 1 day ago',
    'Reports 2 days ago',
    'Reports 4 days ago',
    'Reports 30 days ago',
    'Reports 7 days ago',
]
# Business columns first, then crime columns; the result is a NumPy array of names.
top_10 = np.concatenate([top_5_business_features, top_5_crime_features])

# Take the same column subset from the train, test and sample splits ...
x_train_selected, x_test_selected, x_sample_selected = [
    split[top_10] for split in (x_train, x_test, x_sample)
]

# ... and write each subset to its own CSV without the row index.
for selected, csv_name in (
    (x_train_selected, 'x_train_tuning_tests_equal_crime_and_business.csv'),
    (x_test_selected, 'x_test_tuning_tests_equal_crime_and_business.csv'),
    (x_sample_selected, 'x_train_tuning_tests_equal_crime_and_business_sample.csv'),
):
    selected.to_csv(csv_name, index=False)

In [119]:
# Five further business columns, appended to top_5_business_features (defined in an
# earlier cell) to form a business-only feature set.
additional_5_business_features = [
    'Number of openings',
    'Openings 4 days ago',
    'Openings 1 day ago',
    'Openings 7 days ago',
    'Openings 2 days ago',
]
# Rebinds top_10: it now holds the ten business columns as a NumPy array of names.
top_10 = np.concatenate([top_5_business_features, additional_5_business_features])

# Take the same column subset from the train, test and sample splits ...
x_train_selected, x_test_selected, x_sample_selected = [
    split[top_10] for split in (x_train, x_test, x_sample)
]

# ... and write each subset to its own CSV without the row index.
for selected, csv_name in (
    (x_train_selected, 'x_train_tuning_tests_all_business.csv'),
    (x_test_selected, 'x_test_tuning_tests_all_business.csv'),
    (x_sample_selected, 'x_train_tuning_tests_all_business_sample.csv'),
):
    selected.to_csv(csv_name, index=False)

<h3> Tests to confirm correct dataset generation </h3>

In [120]:
# Row counts each persisted split is expected to have. They are hard-coded for the
# current input file and must be updated if the input data or the split sizes change.
(
    expected_num_rows_test,
    expected_num_rows_train,
    expected_num_rows_sample,
) = (205, 614, 153)

In [121]:
def test_dataset(dataset_name,expected_num_records,expected_columns,zeros_possible):
    alt_expected = 1
    if zeros_possible:
        alt_expected = 0
    df = pd.read_csv(dataset_name)
    assert expected_num_records == len(df), "Expected " + str(expected_num_records) + " records but got " + str(len(df))
    print("Number of records in dataset is as expected.")
    actual_columns = df.columns
    for i in range(len(expected_columns)):
        assert expected_columns[i] == actual_columns[i], "Expected column " + expected_columns[i] + " but got " + actual_columns[i]
    print("Column names of dataset are as expected.")
    for record_num in range(0,len(df.index)):
        j = 0
        while j < len(expected_columns) - 1:
            actual = df[expected_columns[j]].iloc[record_num]
            expected = 1
            assert actual == expected or actual == alt_expected, "Value on row " + str(record_num) + " for " + str(expected_columns[j]) + " column is " + str(actual) + " but " + str(expected) + " expected"
            j += 1
    print("All values in dataset are as expected.")
    print("\nAll tests completed successfully.")

<h4> Test f regression datasets </h4>

In [122]:
# Expected header of the f_regression files: one report-count column per lag, in this
# order. Only the 1-day lag uses the singular "day".
f_reg_expected_columns = [
    'Reports %d %s ago' % (lag, 'day' if lag == 1 else 'days')
    for lag in (1, 2, 3, 4, 5, 6, 7, 14, 30, 365)
]

In [123]:
# Check the persisted f_regression training features against the expected row count and columns.
test_dataset(
    dataset_name='x_train_tuning_tests_f_regression.csv',
    expected_num_records=expected_num_rows_train,
    expected_columns=f_reg_expected_columns,
    zeros_possible=False,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


In [124]:
# Check the persisted f_regression test features against the expected row count and columns.
test_dataset(
    dataset_name='x_test_tuning_tests_f_regression.csv',
    expected_num_records=expected_num_rows_test,
    expected_columns=f_reg_expected_columns,
    zeros_possible=False,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


In [125]:
# Check the persisted f_regression sample features against the expected row count and columns.
test_dataset(
    dataset_name='x_train_tuning_tests_f_regression_sample.csv',
    expected_num_records=expected_num_rows_sample,
    expected_columns=f_reg_expected_columns,
    zeros_possible=False,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


<h4> Test chi2 datasets </h4>

In [126]:
# Expected header of the chi2 files, in the order the columns were written.
chi_2_expected_columns = (
    ['South of Market', 'Mission', 'Tenderloin']
    + ['Number of businesses']
    + ['Downtown / Union Square', 'Civic Center']
    + ['Reports 365 days ago', 'Reports 1 day ago', 'Reports 2 days ago', 'Reports 14 days ago']
)

In [127]:
# Check the persisted chi2 training features; zeros are accepted as values for this feature set.
test_dataset(
    dataset_name='x_train_tuning_tests_chi2.csv',
    expected_num_records=expected_num_rows_train,
    expected_columns=chi_2_expected_columns,
    zeros_possible=True,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


In [128]:
# Check the persisted chi2 test features; zeros are accepted as values for this feature set.
test_dataset(
    dataset_name='x_test_tuning_tests_chi2.csv',
    expected_num_records=expected_num_rows_test,
    expected_columns=chi_2_expected_columns,
    zeros_possible=True,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


In [129]:
# Check the persisted chi2 sample features; zeros are accepted as values for this feature set.
test_dataset(
    dataset_name='x_train_tuning_tests_chi2_sample.csv',
    expected_num_records=expected_num_rows_sample,
    expected_columns=chi_2_expected_columns,
    zeros_possible=True,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


<h4> Test adaboost datasets </h4>

In [130]:
# Expected header of the AdaBoost files, in the order the columns were written.
adaboost_expected_columns = list((
    'Reports 365 days ago',
    'Reports 1 day ago',
    'Reports 14 days ago',
    'Reports 3 days ago',
    'Reports 2 days ago',
    'Reports 7 days ago',
    'Number of businesses',
    'Reports 4 days ago',
    'Reports 5 days ago',
    'Closures 365 days ago',
))

In [131]:
# Check the persisted AdaBoost training features against the expected row count and columns.
test_dataset(
    dataset_name='x_train_tuning_tests_adaboost.csv',
    expected_num_records=expected_num_rows_train,
    expected_columns=adaboost_expected_columns,
    zeros_possible=False,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


In [132]:
# Check the persisted AdaBoost test features against the expected row count and columns.
test_dataset(
    dataset_name='x_test_tuning_tests_adaboost.csv',
    expected_num_records=expected_num_rows_test,
    expected_columns=adaboost_expected_columns,
    zeros_possible=False,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


In [133]:
# Check the persisted AdaBoost sample features against the expected row count and columns.
test_dataset(
    dataset_name='x_train_tuning_tests_adaboost_sample.csv',
    expected_num_records=expected_num_rows_sample,
    expected_columns=adaboost_expected_columns,
    zeros_possible=False,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


<h4> Test equal crime and business datasets </h4>

In [134]:
# Expected header of the equal crime/business files: five business columns followed by
# five report-history columns.
equal_expected_columns = [
    'Number of businesses',
    'Last 28 days closures',
    'Last 7 days openings',
    'Last 14 days closures',
    'Last 7 days closures',
] + [
    'Reports 1 day ago',
    'Reports 2 days ago',
    'Reports 4 days ago',
    'Reports 30 days ago',
    'Reports 7 days ago',
]

In [135]:
# Check the persisted equal crime/business training features against the expected row count and columns.
test_dataset(
    dataset_name='x_train_tuning_tests_equal_crime_and_business.csv',
    expected_num_records=expected_num_rows_train,
    expected_columns=equal_expected_columns,
    zeros_possible=False,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


In [136]:
# Check the persisted equal crime/business test features against the expected row count and columns.
test_dataset(
    dataset_name='x_test_tuning_tests_equal_crime_and_business.csv',
    expected_num_records=expected_num_rows_test,
    expected_columns=equal_expected_columns,
    zeros_possible=False,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


In [137]:
# Check the persisted equal crime/business sample features against the expected row count and columns.
test_dataset(
    dataset_name='x_train_tuning_tests_equal_crime_and_business_sample.csv',
    expected_num_records=expected_num_rows_sample,
    expected_columns=equal_expected_columns,
    zeros_possible=False,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


<h4> Test all business datasets </h4>

In [138]:
# Expected header of the all-business files: the five shared business columns followed
# by the five additional openings columns.
business_expected_columns = [
    'Number of businesses',
    'Last 28 days closures',
    'Last 7 days openings',
    'Last 14 days closures',
    'Last 7 days closures',
] + [
    'Number of openings',
    'Openings 4 days ago',
    'Openings 1 day ago',
    'Openings 7 days ago',
    'Openings 2 days ago',
]

In [139]:
# Check the persisted all-business training features against the expected row count and columns.
test_dataset(
    dataset_name='x_train_tuning_tests_all_business.csv',
    expected_num_records=expected_num_rows_train,
    expected_columns=business_expected_columns,
    zeros_possible=False,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


In [140]:
# Check the persisted all-business test features against the expected row count and columns.
test_dataset(
    dataset_name='x_test_tuning_tests_all_business.csv',
    expected_num_records=expected_num_rows_test,
    expected_columns=business_expected_columns,
    zeros_possible=False,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


In [141]:
# Check the persisted all-business sample features against the expected row count and columns.
test_dataset(
    dataset_name='x_train_tuning_tests_all_business_sample.csv',
    expected_num_records=expected_num_rows_sample,
    expected_columns=business_expected_columns,
    zeros_possible=False,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


<h4> Test y datasets </h4>

In [142]:
# Expected header of the y files: the single target column that was split off from
# the features before the train/test split.
y_expected_columns = ['Todays Reports']

In [143]:
# Check the persisted training target file against the expected row count and column name.
test_dataset(
    dataset_name='y_train_tuning_tests.csv',
    expected_num_records=expected_num_rows_train,
    expected_columns=y_expected_columns,
    zeros_possible=False,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


In [144]:
# Check the persisted test target file against the expected row count and column name.
test_dataset(
    dataset_name='y_test_tuning_tests.csv',
    expected_num_records=expected_num_rows_test,
    expected_columns=y_expected_columns,
    zeros_possible=False,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.


In [145]:
# Check the persisted sample target file against the expected row count and column name.
test_dataset(
    dataset_name='y_train_tuning_tests_sample.csv',
    expected_num_records=expected_num_rows_sample,
    expected_columns=y_expected_columns,
    zeros_possible=False,
)

Number of records in dataset is as expected.
Column names of dataset are as expected.
All values in dataset are as expected.

All tests completed successfully.
