In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_regression, chi2, mutual_info_regression, RFECV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor

In [2]:
# Read the four revised CSV files from the working directory into DataFrames,
# in the same order as the names on the left-hand side.
x_train, y_train, x_test, x_sample = (
    pd.read_csv(csv_name)
    for csv_name in (
        'x_train_revised.csv',
        'y_train_revised.csv',
        'x_test_revised.csv',
        'x_train_sample_revised.csv',
    )
)

In [3]:
# Cast y_train to float and expose it as a NumPy array.
y_train_array = y_train.astype(float).to_numpy()
# Flatten to a plain Python list. ravel() visits the values in row-major
# order, the same order a nested row/value loop would, and it does not leave
# loop variables behind in the notebook namespace.
y_train_array_flat = list(y_train_array.ravel())

In [4]:
# Feature set 1: five business columns followed by five crime-report columns.
top_5_business_features = [
    'Number of businesses',
    'Last 28 days closures',
    'Last 7 days openings',
    'Last 14 days closures',
    'Last 7 days closures',
]
top_5_crime_features = [
    'Reports 1 day ago',
    'Reports 2 days ago',
    'Reports 4 days ago',
    'Reports 30 days ago',
    'Reports 7 days ago',
]
top_10 = np.concatenate([top_5_business_features, top_5_crime_features])

# Keep only those ten columns in each of the three frames.
x_train_selected = x_train[top_10]
x_test_selected = x_test[top_10]
x_sample_selected = x_sample[top_10]

# Write each reduced frame to its own CSV, without the index column.
x_train_selected.to_csv('x_train_equal_crime_and_business_revised.csv', index=False)
x_test_selected.to_csv('x_test_equal_crime_and_business_revised.csv', index=False)
x_sample_selected.to_csv('x_train_equal_crime_and_business_sample_revised.csv', index=False)

In [5]:
# Feature set 2: the five business columns chosen earlier plus five more
# opening-related business columns.
additional_5_business_features = [
    'Number of openings',
    'Openings 4 days ago',
    'Openings 1 day ago',
    'Openings 7 days ago',
    'Openings 2 days ago',
]
top_10 = np.concatenate([top_5_business_features, additional_5_business_features])

# Keep only those ten columns in each of the three frames.
x_train_selected = x_train[top_10]
x_test_selected = x_test[top_10]
x_sample_selected = x_sample[top_10]

# Write each reduced frame to its own CSV, without the index column.
x_train_selected.to_csv('x_train_all_business_revised.csv', index=False)
x_test_selected.to_csv('x_test_all_business_revised.csv', index=False)
x_sample_selected.to_csv('x_train_all_business_sample_revised.csv', index=False)

<h3>Testing starts here</h3>

In [6]:
# Read the three source feature CSVs from disk again; the row counts used by
# the checks that follow are taken from these frames.
x_train = pd.read_csv(filepath_or_buffer='x_train_revised.csv')
x_test = pd.read_csv(filepath_or_buffer='x_test_revised.csv')
x_sample = pd.read_csv(filepath_or_buffer='x_train_sample_revised.csv')

In [7]:
# Row counts of the source frames; used as the expected row counts when the
# exported CSV files are validated.
expected_train_length, expected_test_length, expected_sample_length = (
    x_train.shape[0],
    x_test.shape[0],
    x_sample.shape[0],
)

In [8]:
# Columns expected in the "equal crime and business" exports.
expected_equal_features = [
    'Number of businesses',
    'Last 28 days closures',
    'Last 7 days openings',
    'Last 14 days closures',
    'Last 7 days closures',
    'Reports 1 day ago',
    'Reports 2 days ago',
    'Reports 4 days ago',
    'Reports 30 days ago',
    'Reports 7 days ago',
]

# Columns expected in the "all business" exports.
expected_business_features = [
    'Number of businesses',
    'Last 28 days closures',
    'Last 7 days openings',
    'Last 14 days closures',
    'Last 7 days closures',
    'Number of openings',
    'Openings 4 days ago',
    'Openings 1 day ago',
    'Openings 7 days ago',
    'Openings 2 days ago',
]

In [9]:
def test_file(filename, expected_length, expected_features):
    df = pd.read_csv(filename)
    original_columns = df.columns
    assert len(df) == expected_length, "Length of dataset is different to expected"
    assert len(original_columns) == len(expected_features), "Length of features is different to expected"
    for col in original_columns:
        assert col in expected_features, "Features not as expected."
    for col in expected_features:
        assert col in original_columns, "Features not as expected!"
    print("All tests completed successfully")

In [10]:
# Validate the train export of the combined crime/business feature set.
filename = 'x_train_equal_crime_and_business_revised.csv'
test_file(
    filename=filename,
    expected_length=expected_train_length,
    expected_features=expected_equal_features,
)

All tests completed successfully


In [11]:
# Validate the test export of the combined crime/business feature set.
filename = 'x_test_equal_crime_and_business_revised.csv'
test_file(
    filename=filename,
    expected_length=expected_test_length,
    expected_features=expected_equal_features,
)

All tests completed successfully


In [12]:
# Validate the sample export of the combined crime/business feature set.
filename = 'x_train_equal_crime_and_business_sample_revised.csv'
test_file(
    filename=filename,
    expected_length=expected_sample_length,
    expected_features=expected_equal_features,
)

All tests completed successfully


In [13]:
# Validate the train export of the business-only feature set.
filename = 'x_train_all_business_revised.csv'
test_file(
    filename=filename,
    expected_length=expected_train_length,
    expected_features=expected_business_features,
)

All tests completed successfully


In [14]:
# Validate the test export of the business-only feature set.
filename = 'x_test_all_business_revised.csv'
test_file(
    filename=filename,
    expected_length=expected_test_length,
    expected_features=expected_business_features,
)

All tests completed successfully


In [15]:
# Validate the sample export of the business-only feature set.
filename = 'x_train_all_business_sample_revised.csv'
test_file(
    filename=filename,
    expected_length=expected_sample_length,
    expected_features=expected_business_features,
)

All tests completed successfully
