In [527]:
import pandas as pd
from datetime import date, datetime, timedelta
import pickle
import numpy as np
from math import sqrt
import copy
import matplotlib.pyplot as plt
%matplotlib inline
import math

<h3> Hotspot calculation (also used in GUI) </h3>

In [528]:
def open_file(file_name):
    """Read ``file_name`` + ``CSV_FILE_EXTENSION`` into a DataFrame.

    Args:
        file_name: Path of the CSV file without its extension.

    Returns:
        The loaded DataFrame, or an empty DataFrame when the file cannot be
        opened (missing file, directory, permission problem).
    """
    try:
        data_file = pd.read_csv(file_name + CSV_FILE_EXTENSION)
    except OSError:
        # Only file-access failures are reported as "File not found".
        # Anything else (malformed CSV, KeyboardInterrupt, a typo in a
        # constant name) now propagates instead of being silently turned
        # into an empty frame.
        print("File not found")
        return pd.DataFrame()
    return data_file

In [529]:
# Define file variables
# Path (without extension) of the data file; open_file appends
# CSV_FILE_EXTENSION before reading.
GUI_FILE_NAME = "CrimeGUI/data"
CSV_FILE_EXTENSION = ".csv"
# Empty placeholders; df is reassigned from open_file(GUI_FILE_NAME) further down.
df = pd.DataFrame()
FILTERED_DATA = pd.DataFrame()
# Pickled models are located at MODEL_PATH + <model tag> + MODEL_FEATURE_SEPARATOR + <feature tag>
# (see load_model).
MODEL_FEATURE_SEPARATOR = "_"
MODEL_PATH = "CrimeGUI/Models/"
COMMA_SPACE = ", "
# Column names of the data file that the hotspot code reads.
INCIDENT_COL_KEY = "Todays Reports"
NEIGHBOURHOOD_COL_KEY = "Neighborhood"
DATE_COL_KEY = "Date"

In [530]:
# Define feature selection variables
# Load the feature-selection method identifiers used by get_labels.
# The context manager closes the handle once the array has been read;
# previously the file object was left open for the life of the kernel.
with open("Selection Methods","rb") as file:
    sel_methods = np.load(file)
# Each feature-selection method has a display name (*_NAME) and a tag used in
# model file names (*_FILE_TAG).
F_REGRESSION_NAME = "F-Regression"
F_REGRESSION_FILE_TAG = "f_regression"
CHI2_NAME = "Chi-Squared"
CHI2_FILE_TAG = "chi2"
ADABOOST_NAME = "AdaBoost"
ADABOOST_FILE_TAG = "adaboost"
EQUAL_DATA_NAME = "Equal Selection"
EQUAL_DATA_FILE_TAG = "equal_crime_and_business"
ALL_BUS_NAME = "All Business"
ALL_BUS_FILE_TAG = "all_business"
FEATURE_SELECTION = [F_REGRESSION_NAME, CHI2_NAME, ADABOOST_NAME, EQUAL_DATA_NAME, ALL_BUS_NAME]
# Display name -> the ten data-file columns fed to models trained with that
# selection method (get_hotspots selects exactly these columns).
FEATURES = {
    F_REGRESSION_NAME : ['Reports 1 day ago', 'Reports 2 days ago', 'Reports 3 days ago',
                       'Reports 4 days ago', 'Reports 5 days ago', 'Reports 6 days ago',
                      'Reports 7 days ago','Reports 14 days ago','Reports 30 days ago','Reports 365 days ago'],
    CHI2_NAME : ['South of Market', 'Mission', 'Tenderloin', 'Number of businesses', 
               'Downtown / Union Square', 'Civic Center', 'Reports 365 days ago',
               'Reports 1 day ago','Reports 2 days ago','Reports 14 days ago'],
    ADABOOST_NAME : ['Reports 365 days ago', 'Reports 1 day ago', 'Reports 14 days ago', 'Reports 3 days ago', 
               'Reports 2 days ago', 'Reports 7 days ago', 'Number of businesses',
               'Reports 4 days ago','Reports 5 days ago','Closures 365 days ago'],
    EQUAL_DATA_NAME : ['Number of businesses', 'Last 28 days closures', 'Last 7 days openings',
                          'Last 14 days closures', 'Last 7 days closures','Reports 1 day ago',
                      'Reports 2 days ago', 'Reports 4 days ago', 'Reports 30 days ago', 'Reports 7 days ago'],
    ALL_BUS_NAME : ['Number of businesses', 'Last 28 days closures', 'Last 7 days openings',
                          'Last 14 days closures', 'Last 7 days closures','Number of openings',
                   'Openings 4 days ago','Openings 1 day ago', 'Openings 7 days ago', 'Openings 2 days ago']
    }
# Display name -> file tag (used by load_model to build the model path).
FEATURE_FILE_TAGS = {
    F_REGRESSION_NAME : F_REGRESSION_FILE_TAG,
    CHI2_NAME : CHI2_FILE_TAG,
    ADABOOST_NAME : ADABOOST_FILE_TAG,
    EQUAL_DATA_NAME : EQUAL_DATA_FILE_TAG,
    ALL_BUS_NAME : ALL_BUS_FILE_TAG
    }
# Reverse lookup: file tag -> display name.
FEATURE_NAMES_BY_FILE_TAG = {
    F_REGRESSION_FILE_TAG : F_REGRESSION_NAME,
    CHI2_FILE_TAG : CHI2_NAME,
    ADABOOST_FILE_TAG : ADABOOST_NAME,
    EQUAL_DATA_FILE_TAG : EQUAL_DATA_NAME,
    ALL_BUS_FILE_TAG : ALL_BUS_NAME
}

In [531]:
# Define model variables
# Each model has a display name (*_NAME, may contain "\n" line breaks) and a
# tag used in model file names (*_FILE_TAG).
ANN_NAME = "Multi-Layer\nPerceptron"
ANN_FILE_TAG = "multi_layer_perceptron"
DECISION_TREE_NAME = "Decision Tree"
DECISION_TREE_FILE_TAG = "decision_tree"
ELASTIC_NET_NAME = "Elastic Net"
ELASTIC_NET_FILE_TAG = "elastic_net"
LASSO_NAME = "Lasso"
LASSO_FILE_TAG = "lasso"
# NOTE(review): "LINERAR" is a misspelling of "LINEAR"; the identifiers are kept
# because they are referenced below (and possibly outside this notebook).
LINERAR_REGRESSION_NAME = "Linear \nRegression"
LINERAR_REGRESSION_FILE_TAG = "linear_regression"
RANDOM_FOREST_NAME = "Random \nForest"
RANDOM_FOREST_FILE_TAG = "random_forest"
RIDGE_REGRESSION_NAME = "Ridge \nRegression"
RIDGE_REGRESSION_FILE_TAG = "ridge_regression"
SVM_NAME = "SVM"
SVM_FILE_TAG = "svm"
# Model display names in the order get_results iterates over them.
MODELS = [ANN_NAME,
          DECISION_TREE_NAME,
          ELASTIC_NET_NAME,
          LASSO_NAME,
          LINERAR_REGRESSION_NAME,
          RANDOM_FOREST_NAME,
          RIDGE_REGRESSION_NAME,
          SVM_NAME]
# Display name -> file tag (used by load_model to build the model path).
MODEL_FILE_TAGS = {
    ANN_NAME : ANN_FILE_TAG,
    DECISION_TREE_NAME : DECISION_TREE_FILE_TAG,
    ELASTIC_NET_NAME : ELASTIC_NET_FILE_TAG,
    LASSO_NAME : LASSO_FILE_TAG,
    LINERAR_REGRESSION_NAME: LINERAR_REGRESSION_FILE_TAG,
    RANDOM_FOREST_NAME: RANDOM_FOREST_FILE_TAG,
    RIDGE_REGRESSION_NAME: RIDGE_REGRESSION_FILE_TAG,
    SVM_NAME : SVM_FILE_TAG
    }
# Reverse lookup: file tag -> display name.
MODEL_NAMES_BY_FILE_TAG = {
    ANN_FILE_TAG : ANN_NAME,
    DECISION_TREE_FILE_TAG : DECISION_TREE_NAME,
    ELASTIC_NET_FILE_TAG : ELASTIC_NET_NAME,
    LASSO_FILE_TAG : LASSO_NAME,
    LINERAR_REGRESSION_FILE_TAG : LINERAR_REGRESSION_NAME,
    RANDOM_FOREST_FILE_TAG: RANDOM_FOREST_NAME,
    RIDGE_REGRESSION_FILE_TAG : RIDGE_REGRESSION_NAME,
    SVM_FILE_TAG : SVM_NAME
}
# Chart-label inputs for get_labels: it iterates `algorithms` and picks the
# entry of `algorithm_display_names` at the same position, so the two lists
# must stay parallel. The values repeat the *_FILE_TAG strings in MODELS order.
algorithms = ['multi_layer_perceptron',
                  'decision_tree',
                  'elastic_net',
                  'lasso',
                  'linear_regression',
                  'random_forest',
                  'ridge_regression',
                  'svm']
algorithm_display_names = ['Multi\nLayer\nPerceptron',
                               'Decision\nTree',
                               'Elastic Net',
                               'Lasso',
                               'Linear\nRegression',
                               'Random\nForest',
                               'Ridge\nRegression',
                               'SVM']
# get_labels indexes this list by the position of each entry within sel_methods.
# NOTE(review): presumably sel_methods is stored in this same order - confirm.
feature_select_display_names = ['F Regression',
                               'Chi2',
                               'Adaboost',
                               'Equal crime\nand business',
                               'All Business']

In [532]:
# define neighbourhoods which contain other neighbourhoods
# Parent neighbourhood -> list of sub-neighbourhoods it contains.
# merge_sub_neighbourhoods adds each sub-neighbourhood's actual and predicted
# incident values to its parent and then removes the sub-neighbourhood rows.
# Every name listed here (keys and values) must be present in the
# neighbourhood column for the date being scored, otherwise that lookup
# raises IndexError.
PARENT_NEIGHBOURHOODS = {
    "Central Waterfront" : ["Dogpatch"],
    "Eureka Valley" : ["Dolores Heights","Castro"],
    "Buena Vista" : ["Ashbury Heights"],
    "Cole Valley" : ["Parnassus Heights"],
    "Bayview" : ["Apparel City", "Produce Market"],
    "Russian Hill" : ["Aquatic Park / Ft. Mason"],
    "North Beach" : ["Bret Harte"],
    "Western Addition" : ["Cathedral Hill", "Japantown"],
    "Downtown / Union Square" : ["Fairmount", "Chinatown", "Lower Nob Hill", "Polk Gulch"],
    "Mission Terrace" : ["Cayuga"],
    "Northern Waterfront" : ["Fishermans Wharf"],
    "Bernal Heights" : ["Holly Park", "Peralta Heights", "St. Marys Park"],
    "Hunters Point" : ["India Basin"],
    "Forest Hill" : ["Laguna Honda"],
    "Hayes Valley" : ["Lower Haight"],
    "Portola" : ["McLaren Park", "University Mound"],
    "South of Market" : ["Mint Hill"],
    "Stonestown" : ["Parkmerced"],
    "Presidio Heights" : ["Presidio Terrace"],
    "South Beach" : ["Rincon Hill"],
    "Potrero Hill" : ["Showplace Square"],
    "Visitacion Valley" : ["Sunnydale"],
    "Lincoln Park / Ft. Miley" : ["Sutro Heights"],
    "Cow Hollow" : ["Union Street"]
    }

In [533]:
# obtain hotspots based on date and model and feature key
def get_hotspots(data, model_key, features_key,calendar_date):
    """Rank neighbourhoods for one date using a stored model.

    Rows whose date string contains ``calendar_date`` are selected, split into
    neighbourhood names, actual incident counts and the feature columns for
    ``features_key``, and handed to ``load_model``.

    Returns:
        Whatever ``load_model`` returns for the selected rows.
    """
    date_matches = data[DATE_COL_KEY].str.contains(calendar_date)
    day_rows = data.loc[date_matches]
    # Fresh 0..n-1 indexes so later positional lookups line up with the
    # prediction array.
    neighbourhoods_data = pd.DataFrame(day_rows[NEIGHBOURHOOD_COL_KEY]).reset_index(drop=True)
    y_data = pd.DataFrame(day_rows[INCIDENT_COL_KEY]).reset_index(drop=True)
    x_data = day_rows[FEATURES[features_key]]
    return load_model(x_data, y_data, neighbourhoods_data, model_key, features_key)

In [534]:
# load machine learning model
def load_model(x_data, y_data, neighbourhoods_data, model_key, features_key):
    """Unpickle the model for ``model_key``/``features_key`` and predict with it.

    The file is ``MODEL_PATH`` followed by the model tag and the feature tag
    joined with ``MODEL_FEATURE_SEPARATOR``.

    Returns:
        The result of ``make_prediction`` for the loaded model.
    """
    tags = (MODEL_FILE_TAGS[model_key], FEATURE_FILE_TAGS[features_key])
    file_path = MODEL_PATH + MODEL_FEATURE_SEPARATOR.join(tags)
    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # model files that come from a trusted source.
    with open(file_path, 'rb') as model_file:
        model = pickle.load(model_file)
    return make_prediction(model, x_data, y_data, neighbourhoods_data)

In [535]:
# predict incident values
def make_prediction(model, x_data, y_data, neighbourhoods_data):
    """Predict incidents and return actual and predicted rankings.

    Sub-neighbourhoods are merged into their parents first; the actual and the
    predicted values are then each sorted from highest to lowest together with
    their neighbourhood names.

    Returns:
        ``(y_actual, actual_neighbourhoods, y_predict, prediction_neighbourhoods)``,
        each a numpy array in descending order of its own values.
    """
    raw_predictions = model.predict(x_data)
    y_actual, y_predict, neighbourhoods_data = merge_sub_neighbourhoods(
        y_data, raw_predictions, neighbourhoods_data)
    neighbourhoods_data = neighbourhoods_data.reset_index(drop=True)
    names = neighbourhoods_data[NEIGHBOURHOOD_COL_KEY].to_numpy()

    # argsort is ascending; reversing the sorted selection gives descending order.
    actual_order = y_actual.argsort()
    actual_neighbourhoods = names[actual_order][::-1]
    y_actual = y_actual[actual_order][::-1]

    predicted_order = y_predict.argsort()
    prediction_neighbourhoods = names[predicted_order][::-1]
    y_predict = y_predict[predicted_order][::-1]

    return y_actual, actual_neighbourhoods, y_predict, prediction_neighbourhoods

In [536]:
# convert negative value to zero if necessary
def get_non_negative_value(value):
    """Return ``value`` unchanged unless it is below zero, in which case return 0."""
    return 0 if value < 0 else value

In [537]:
# sum incident values for neighbourhoods containing other neighbourhoods
def merge_sub_neighbourhoods(y_data,y_predict,neighbourhoods_data):
    """Fold sub-neighbourhood values into their parent neighbourhoods.

    For every parent in ``PARENT_NEIGHBOURHOODS`` the actual and predicted
    values of the parent and its sub-neighbourhoods are clamped at zero and
    summed into the parent's entry; the sub-neighbourhood entries are then
    removed from all three outputs.

    Note: ``y_predict`` is written to in place before the removals, and index
    labels of ``neighbourhoods_data`` are used as positions into the value
    arrays, so the frame is expected to carry a 0..n-1 index.

    Returns:
        ``(y_data, y_predict, neighbourhoods_data)`` with the
        sub-neighbourhood rows removed.
    """
    y_data = pd.DataFrame(y_data).to_numpy().flatten()
    row_labels = neighbourhoods_data.index
    name_column = neighbourhoods_data[NEIGHBOURHOOD_COL_KEY]

    def first_label_of(name):
        # Index label of the first row whose neighbourhood equals ``name``.
        return row_labels[name_column == name].tolist()[0]

    absorbed = []
    for parent_key, children in PARENT_NEIGHBOURHOODS.items():
        parent_index = first_label_of(parent_key)
        predicted_total = get_non_negative_value(y_predict[parent_index])
        actual_total = get_non_negative_value(y_data[parent_index])
        for child in children:
            child_index = first_label_of(child)
            absorbed.append(child_index)
            predicted_total = predicted_total + get_non_negative_value(y_predict[child_index])
            actual_total = actual_total + get_non_negative_value(y_data[child_index])
        y_predict[parent_index] = predicted_total
        y_data[parent_index] = actual_total

    neighbourhoods_data = neighbourhoods_data.drop(row_labels[absorbed])
    # Delete from the highest position down so earlier deletions do not shift
    # the positions still to be removed.
    for position in sorted(absorbed, reverse=True):
        y_predict = np.delete(y_predict,position)
        y_data = np.delete(y_data,position)
    return y_data,y_predict,neighbourhoods_data

<h3>Evaluation </h3>

In [538]:
# parent method for calculating all hotspot performance metrics
def calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, prediction_neighbourhoods,num_hotspots):
    """Compute every hotspot performance metric for one ranked prediction.

    Returns:
        The list from ``calculate_standard_scores`` followed by the list from
        ``get_missed_incidents`` (the misclassification severity).
    """
    total_predictions = len(y_predict)
    hotspots = determine_hotspots(y_actual, actual_neighbourhoods,
                                  prediction_neighbourhoods, num_hotspots)
    predicted_hotspots, actual_hotspots = hotspots
    num_predictions = len(predicted_hotspots)

    standard = calculate_standard_scores(
        num_predictions, predicted_hotspots, actual_hotspots, total_predictions)
    severity = get_missed_incidents(
        predicted_hotspots, actual_hotspots, y_actual, y_predict,
        num_predictions, actual_neighbourhoods)
    return standard + severity

In [539]:
# calculate misclassification severity based on the number of incidents missed
def get_missed_incidents(predicted_hotspots, actual_hotspots, y_actual, y_predict,num_predictions,actual_neighbourhoods):
    """Compute the misclassification severity of a set of predicted hotspots.

    Severity is (incidents missed - additional incidents caught) divided by
    (incidents correct + incidents missed), where all quantities are actual
    incident counts taken from ``y_actual``.

    Args:
        predicted_hotspots: Neighbourhood names predicted as hotspots.
        actual_hotspots: Neighbourhood names that really were hotspots.
        y_actual: Actual incident counts, positionally aligned with
            ``actual_neighbourhoods``.
            # assumes sorted in descending order - TODO confirm against caller
        y_predict: Not used by this function.
        num_predictions: Number of predicted hotspots.
        actual_neighbourhoods: numpy array of neighbourhood names aligned
            with ``y_actual``.

    Returns:
        A one-element list containing the severity (``0.0`` when the net
        number of missed incidents is zero).
    """
    incidents_correct = 0
    incidents_missed = 0
    additional_incidents_caught = 0
    # Credit the actual incident count of every predicted hotspot either as
    # "correct" (it is a real hotspot) or as "additional" (it is not).
    for neighbourhood in predicted_hotspots:
        # np.where returns an index array, so ``value`` is an array and the
        # running totals become arrays once something is added to them.
        index = np.where(actual_neighbourhoods == neighbourhood)[0]
        value = y_actual[index]
        if neighbourhood in actual_hotspots:
            incidents_correct += value
        else:
            additional_incidents_caught += value
    i = 0
    i_limit = 0
    # Actual count at the last hotspot position; hotspots sharing this value
    # are treated as interchangeable below.
    lowest_actual = y_actual[num_predictions-1]
    # Phase 1: walk the actual hotspots until the first one whose count equals
    # lowest_actual, adding the count of each one that was not predicted.
    while i < num_predictions:
        if y_actual[i] != lowest_actual:
            neighbourhood = actual_hotspots[i]
            if neighbourhood not in predicted_hotspots:
                index = np.where(actual_neighbourhoods == neighbourhood)[0]
                value = y_actual[index]
                incidents_missed += value
            i += 1
        else:
            # Remember where the lowest-valued group starts and leave the loop.
            i_limit = i
            i = num_predictions
    i = i_limit
    # Phase 2: of the hotspot slots left for the lowest-valued group, count how
    # many entries of actual_hotspots from i_limit onwards were predicted.
    remaining_hotspots_to_find = num_predictions - i
    hotspots_found = 0
    while i < len(actual_hotspots):
        neighbourhood = actual_hotspots[i]
        if neighbourhood in predicted_hotspots:
            hotspots_found += 1
        i += 1
    # Each unfilled slot counts as lowest_actual missed incidents.
    num_lowest_value_hotspots_missing = remaining_hotspots_to_find - hotspots_found
    lowest_values_missed = lowest_actual * num_lowest_value_hotspots_missing
    incidents_missed += lowest_values_missed
    total_actual_hotspots = incidents_correct + incidents_missed
    net_missed = incidents_missed - additional_incidents_caught
    misclassification_severity = 0.0
    if net_missed != 0:
        # The quotient is an array (see above); take its single element.
        misclassification_severity = (net_missed / total_actual_hotspots)[0]
    return [misclassification_severity]

In [540]:
# determine hotspots based on predicted values
def determine_hotspots(y_actual, actual_neighbourhoods, prediction_neighbourhoods,num_hotspots):
    """Select the predicted and the actual hotspot neighbourhoods.

    The predicted hotspots are the first ``num_hotspots`` entries of
    ``prediction_neighbourhoods``. The actual hotspots are the first
    ``num_hotspots`` entries of ``actual_neighbourhoods`` plus every following
    neighbourhood whose actual value ties with the last one selected.

    The previous implementation discarded the result of ``np.append`` (which
    returns a new array instead of modifying its argument), so tied
    neighbourhoods were never included.

    Args:
        y_actual: Actual incident counts sorted in descending order.
        actual_neighbourhoods: Names aligned with ``y_actual``.
        prediction_neighbourhoods: Names sorted by descending predicted value.
        num_hotspots: Number of hotspots to predict. Values larger than the
            number of neighbourhoods select all of them; values <= 0 leave the
            plain slices untouched.

    Returns:
        ``(predicted_hotspots, actual_hotspots)``.
    """
    #sorted numpy arrays expected
    predicted_hotspots = prediction_neighbourhoods[:num_hotspots]
    actual_hotspots = actual_neighbourhoods[:num_hotspots]
    cutoff = len(actual_hotspots)
    if num_hotspots <= 0 or cutoff == 0:
        return predicted_hotspots, actual_hotspots
    lowest_hotspot_value = y_actual[cutoff - 1]
    # Extend the cut-off over the run of values tied with the last hotspot.
    end = cutoff
    while end < len(actual_neighbourhoods) and y_actual[end] == lowest_hotspot_value:
        end += 1
    actual_hotspots = actual_neighbourhoods[:end]
    return predicted_hotspots, actual_hotspots

In [541]:
# calculate standard accuracy scores for classification
def calculate_standard_scores(num_predictions,predicted_hotspots,actual_hotspots,total_predictions):
    """Compute sensitivity, specificity, precision, F1 and MCC for hotspot predictions.

    A predicted hotspot counts as a true positive when it appears in
    ``actual_hotspots`` and as a false positive otherwise. False negatives are
    taken to equal false positives, and true negatives are
    ``total_predictions - num_predictions - fp``.

    Returns:
        ``[sensitivity, specificity, precision, f1, mcc]``; F1 and MCC are
        rounded to five decimals. Metrics with a zero denominator fall back to
        0 (sensitivity, precision, F1) or 1 (specificity, MCC).
    """
    hits = [predicted_hotspots[i] in actual_hotspots for i in range(num_predictions)]
    tp = sum(hits)                                  # true positives
    fp = len(hits) - tp                             # false positives
    tn = total_predictions-num_predictions-fp       # true negatives
    fn = fp                                         # false negatives

    # sensitivity/recall
    sensitivity = tp / (tp + fn) if tp + fn != 0 else 0
    specificity = tn / (tn + fp) if tn + fp != 0 else 1
    precision = tp / (tp + fp) if tp + fp != 0 else 0

    # f1 - incorporates both sensitivity/recall and precision
    if precision + sensitivity != 0:
        f1 = round(2 * (precision * sensitivity) / (precision + sensitivity),5)
    else:
        f1 = 0

    # matthews correlation coefficient
    mcc_denominator = sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn+fn))
    if mcc_denominator != 0:
        mcc = round(((tp * tn) - (fp * fn)) / mcc_denominator,5)
    else:
        mcc = 1
    return [sensitivity, specificity, precision, f1, mcc]

In [542]:
# get scores for all models
def get_results(num_hotspots, first_date, last_date, df):
    """Score every model / feature-selection combination over a date range.

    Returns:
        ``(all_scores, algorithm_names)`` where ``all_scores`` maps each score
        key to a list of averaged scores (one per combination, models outer,
        feature sets inner) and ``algorithm_names`` holds the matching
        ``"<model>\\n<feature set>"`` names.
    """
    all_scores = {score_key: [] for score_key in SCORE_KEYS}
    algorithm_names = []
    for model_key in MODELS:
        for feature_key in FEATURES:
            daily_scores = score_algorithm(
                model_key, feature_key, num_hotspots, first_date, last_date, df)
            average_scores = calculate_averages(daily_scores)
            for score_key in SCORE_KEYS:
                all_scores[score_key].append(average_scores[score_key])
            algorithm_names.append("\n".join((model_key, feature_key)))
    return all_scores, algorithm_names

In [543]:
# calculate average scores
def calculate_averages(scores):
    """Return the mean of each score list in ``scores``, keyed by score key.

    A score key with no recorded values averages to 0.
    """
    averages = {}
    for score_key in SCORE_KEYS:
        values = scores[score_key]
        count = len(values)
        averages[score_key] = sum(values) / count if count != 0 else 0
    return averages

In [544]:
# score specific algorithm for all dates in ranges
def score_algorithm(model_key, feature_key,num_hotspots, first_date, last_date, df):
    """Score one model / feature-set pair for every day from first to last date.

    Returns:
        Dict mapping each score key to the list of daily scores.
    """
    all_scores = {score_key: [] for score_key in SCORE_KEYS}
    one_day = timedelta(days=1)
    current_date = first_date
    while current_date <= last_date:
        calendar_date = str(current_date.strftime("%d/%m/%Y"))
        y_actual, actual_neighbourhoods, y_predict, prediction_neighbourhoods = get_hotspots(
            df, model_key, feature_key, calendar_date)
        results = calculate_accuracy(
            y_actual, actual_neighbourhoods, y_predict, prediction_neighbourhoods, num_hotspots)
        # results is ordered like SCORE_KEYS.
        for position, score_key in enumerate(SCORE_KEYS):
            all_scores[score_key].append(results[position])
        current_date += one_day
    return all_scores

In [545]:
# get best x scores from all scores of all models
def best_x_scores(score_metric,x,algorithms,results,num_hotspots):
    """Draw a bar chart of the best ``x`` scores for one metric.

    Args:
        score_metric: Key into ``results`` and ``HIGHEST_IS_BEST``.
        x: Number of scores to show; adjusted by ``validate_x``.
        algorithms: Not used in the body; labels come from ``get_labels()``.
        results: Dict mapping metric name to a list of scores.
            # NOTE(review): presumably ordered like the labels from get_labels() - confirm
        num_hotspots: Number of hotspots predicted; only used in the title.

    Returns:
        ``(bar, ticks, title, x_label, y_label)`` matplotlib objects.

    Raises:
        ValueError: from ``max``/``min`` if no score is selected (``x`` validated to 0).
    """
    all_labels = get_labels()
    x_labels = []
    best_scores = []
    # Work on a copy so the caller's score list is not shortened.
    scores = results[score_metric].copy()
    num_scores = len(scores)
    x = validate_x(num_scores,x)
    i = 0
    # Repeatedly pick the current best score, then delete it (and its label)
    # so the next iteration finds the runner-up.
    while i < x:
        if HIGHEST_IS_BEST[score_metric]:
            index = np.argmax(scores)
        else:
            index = np.argmin(scores)
        x_labels.append(all_labels[index])
        best_scores.append(scores[index])
        # np.delete returns new arrays; after the first pass both are ndarrays.
        all_labels = np.delete(all_labels,index)
        scores = np.delete(scores,index)
        i+=1
    y_select = np.arange(len(best_scores))
    plt.figure(figsize = (15,6))
    ticks = plt.xticks(y_select,x_labels)
    #plt.xticks(rotation=45)
    title = plt.title("Top " + str(x) + " " + score_metric + " scores\nwhen predicting the top " 
              + str(num_hotspots) + " Crime Hotspots")
    x_label = plt.xlabel("Algorithm")
    y_label = plt.ylabel(score_metric + " score")
    high_score = max(best_scores)
    low_score = min(best_scores)
    # Pad the y-axis by 20% of the score spread on each side.
    min_y = low_score - 0.2 * (high_score - low_score) #0.5
    max_y = high_score + 0.2 * (high_score - low_score) #0.5
    plt.ylim(bottom = min_y, top = max_y)
    bar = plt.bar(y_select, best_scores)
    return bar, ticks, title, x_label, y_label

In [546]:
# get chart labels
def get_labels():
    """Build one chart label per algorithm / selection-method combination.

    Labels are produced algorithm by algorithm, and within an algorithm in the
    order of ``sel_methods``.
    """
    def algorithm_label(algorithm):
        return algorithm_display_names[algorithms.index(algorithm)]

    def dataset_label(sel_method):
        # Position of the first entry of sel_methods equal to sel_method.
        return feature_select_display_names[np.nonzero(sel_methods == sel_method)[0][0]]

    return [
        algorithm_label(algorithm) + "\nusing\n" + dataset_label(sel_method) + "\ndataset"
        for algorithm in algorithms
        for sel_method in sel_methods
    ]

In [547]:
# return non negative value, no larger than number of results, where necessary
def validate_x(num_scores,x):
    """Clamp the requested number of top scores to a usable value.

    Returns ``x`` when it lies between 1 and ``num_scores``; otherwise the
    number of scores, capped at 15. A negative ``num_scores`` yields 0.
    """
    soft_cap = 15
    if num_scores < 0:
        return 0
    out_of_range = x < 1 or x > num_scores
    if not out_of_range:
        return x
    return soft_cap if num_scores > soft_cap else num_scores

In [548]:
# main method to commence evaluation of all models
def evaluate_models(num_hotspots_range, top_x, df):
    """Evaluate all models and chart the top scores for each hotspot count.

    The date range runs from the date in the first row of ``df`` to the date
    in its last row.

    Returns:
        List with one ``best_x_scores`` result per hotspot count and metric.
    """
    date_format = '%d/%m/%Y'
    dates = df['Date']
    first_date = datetime.strptime(dates.iloc[0], date_format)
    last_date = datetime.strptime(dates.iloc[-1], date_format)
    bars = []
    for num_hotspots in num_hotspots_range:
        results, algorithm_names = get_results(num_hotspots, first_date, last_date, df)
        bars.extend(
            best_x_scores(metric, top_x, algorithm_names, results, num_hotspots)
            for metric in SCORE_KEYS
        )
    return bars

In [549]:
# Names of the evaluation metrics; also used in chart titles and axis labels.
SENSITIVITY_KEY = "Sensitivity"
SPECIFICITY_KEY = "Specificity"
PRECISION_KEY = "Precision"
F1_KEY = "F1"
MCC_KEY = "MCC"
SEVERITY_KEY = "Lowest Misclassification Severity"
# Order matters: score_algorithm maps the list returned by calculate_accuracy
# onto these keys by position.
SCORE_KEYS = [
    SENSITIVITY_KEY,
    SPECIFICITY_KEY,
    PRECISION_KEY,
    F1_KEY,
    MCC_KEY,
    SEVERITY_KEY
]
# Whether a larger value is better for each metric (severity is minimised).
HIGHEST_IS_BEST = {
    SENSITIVITY_KEY: True,
    SPECIFICITY_KEY: True,
    PRECISION_KEY: True,
    F1_KEY:True,
    MCC_KEY: True,
    SEVERITY_KEY: False
}

# Run configuration: hotspot counts to evaluate and how many top scores to chart.
num_hotspots_range = [5,10,15,20]
top_x = 10
df = open_file(GUI_FILE_NAME)

In [None]:
# Score every model / feature-set combination and draw one chart per hotspot count and metric.
bars = evaluate_models(num_hotspots_range,top_x, df)

<h3> In-depth analysis required as there is no clear best algorithm </h3>

In [None]:
# determine scores for all given hotspot ranges
def evaluate_models_in_depth(num_hotspots_range, top_x, df):
    """Collect the top scores and their labels for every hotspot count.

    Returns:
        ``(all_results, result_names)``: two dicts keyed first by
        ``str(num_hotspots)`` and then by metric name, holding the best scores
        and the matching labels from ``best_x_scores_no_graph``.
    """
    date_format = '%d/%m/%Y'
    dates = df['Date']
    first_date = datetime.strptime(dates.iloc[0], date_format)
    last_date = datetime.strptime(dates.iloc[-1], date_format)
    all_results = {}
    result_names = {}
    for num_hotspots in num_hotspots_range:
        results, algorithm_names = get_results(num_hotspots, first_date, last_date, df)
        range_key = str(num_hotspots)
        scores_by_metric = {}
        names_by_metric = {}
        all_results[range_key] = scores_by_metric
        result_names[range_key] = names_by_metric
        for metric in SCORE_KEYS:
            best_scores, best_names = best_x_scores_no_graph(
                metric, top_x, algorithm_names, results, num_hotspots)
            scores_by_metric[str(metric)] = best_scores
            names_by_metric[str(metric)] = best_names
    return all_results, result_names

In [None]:
# determine top x algorithms without generating a graph
def best_x_scores_no_graph(score_metric,x,algorithms,results,num_hotspots):
    """Pick the best ``x`` scores for a metric without drawing a chart.

    Labels come from ``get_labels()`` with line breaks replaced by spaces; the
    ``algorithms`` and ``num_hotspots`` arguments are not used.

    Returns:
        ``(best_scores, best_names)`` ordered from best to worst.
    """
    all_labels = [label.replace('\n',' ') for label in get_labels()]
    scores = results[score_metric].copy()
    x = validate_x(len(scores),x)
    best_scores = []
    best_names = []
    picked = 0
    while picked < x:
        pick_best = np.argmax if HIGHEST_IS_BEST[score_metric] else np.argmin
        index = pick_best(scores)
        best_scores.append(scores[index])
        best_names.append(all_labels[index])
        # Remove the winner so the next pass selects the runner-up.
        all_labels = np.delete(all_labels,index)
        scores = np.delete(scores,index)
        picked += 1
    return best_scores, best_names

In [None]:
# Configuration for the in-depth run: every hotspot count from 1 to 40,
# keeping the ten best algorithms per metric. The data file is re-read.
num_hotspots_range = list(range(1,41))
top_x = 10
df = open_file(GUI_FILE_NAME)

In [None]:
# Best scores and matching algorithm labels, keyed by str(num_hotspots) and then by metric name.
results, names = evaluate_models_in_depth(num_hotspots_range,top_x, df)

In [None]:
# get an analysis of the top 5 algorithms for every range
def get_full_analysis(all_results, all_names, num_hotspots_range):
    """Tabulate the five best algorithms by misclassification severity.

    Args:
        all_results: ``{str(num_hotspots): {metric: [scores...]}}``.
        all_names: Same shape as ``all_results`` but holding algorithm names.
        num_hotspots_range: Hotspot counts to include, one table row each.

    Returns:
        DataFrame with the hotspot count, the best score (rounded to two
        decimals) and the names ranked 1 to 5.
    """
    metric = 'Lowest Misclassification Severity'
    best_scores = []
    placings = [[] for _ in range(5)]
    for num_hotspots in num_hotspots_range:
        range_key = str(num_hotspots)
        ranked_scores = all_results[range_key][metric]
        ranked_names = all_names[range_key][metric]
        best_scores.append(ranked_scores[0])
        for position, column in enumerate(placings):
            column.append(ranked_names[position])

    #define table headings
    table = pd.DataFrame()
    table['Number of Hotspots Predicted'] = num_hotspots_range
    table['Best Score'] = [round(score,2) for score in best_scores]
    for position, column in enumerate(placings):
        table['Rank ' + str(position + 1)] = column
    return table

In [None]:
# get the score of the given rank
def get_score_by_rank(scores, target_rank):
    """Return the score holding ``target_rank`` in an ordered score list.

    Equal consecutive scores share a rank; the rank increases by one each time
    the score changes while walking through ``scores``.

    Args:
        scores: Scores ordered from best to worst.
        target_rank: 1-based rank to look up.

    Returns:
        The score at that rank, or 0 when the rank does not exist. An empty
        ``scores`` now also yields 0 instead of raising IndexError.
    """
    if len(scores) == 0:
        return 0
    rank = 1
    previous_score = scores[0]
    for score in scores:
        if score != previous_score:
            rank += 1
            previous_score = score
        if rank == target_rank:
            return score
    return 0

In [None]:
# average scores for every hotspot range
def get_average_score_per_num_hotspots(algorithms, results, names, num_hotspots_range):
    """Average each algorithm's misclassification severity across hotspot counts.

    Only hotspot counts in which the algorithm appears in ``names`` contribute
    to its average.

    Args:
        algorithms: Algorithm labels to average.
        results: ``{str(num_hotspots): {metric: [scores...]}}``.
        names: Same shape as ``results`` but holding the algorithm labels.
        num_hotspots_range: Hotspot counts to include.

    Returns:
        List of averages, one per distinct algorithm in first-seen order. An
        algorithm that never appears averages to 0 (matching
        ``calculate_averages``) instead of raising ZeroDivisionError.
    """
    metric = 'Lowest Misclassification Severity'
    all_scores = {algorithm: [] for algorithm in algorithms}
    for i in num_hotspots_range:
        range_key = str(i)
        ranked_names = names[range_key][metric]
        for algorithm in algorithms:
            if algorithm in ranked_names:
                index = ranked_names.index(algorithm)
                all_scores[algorithm].append(results[range_key][metric][index])
    return [
        sum(scores) / len(scores) if len(scores) != 0 else 0
        for scores in all_scores.values()
    ]

In [None]:
# determine the number of times an algorithm ranked first, and which hotspot ranges that was for
def get_best_counts(all_results, all_names, num_hotspots_range, rank):
    """Count how often each algorithm achieved ``rank`` for misclassification severity.

    Args:
        all_results: ``{str(num_hotspots): {metric: [scores...]}}``.
        all_names: Same shape as ``all_results`` but holding algorithm labels.
        num_hotspots_range: Hotspot counts to inspect.
        rank: 1-based rank to count (ties share a rank).

    Returns:
        DataFrame with one row per algorithm that reached ``rank`` at least
        once: its average score, the number of times it held the rank and the
        hotspot counts where it did, sorted by that number (descending).
    """
    metric = 'Lowest Misclassification Severity'
    count_column = 'Times Ranked ' + str(rank)
    algorithms = []
    times_ranked = []
    hotspot_xs = []
    for i in num_hotspots_range:
        num_hotspots_key = str(i)
        results = all_results[num_hotspots_key][metric]
        names = all_names[num_hotspots_key][metric]
        # NOTE(review): get_score_by_rank returns 0 when the rank does not
        # exist, which is indistinguishable from a genuine score of 0.
        target_score = get_score_by_rank(results, rank)
        for j in range(len(results)):
            if results[j] != target_score:
                continue
            algorithm = names[j]
            if algorithm not in algorithms:
                algorithms.append(algorithm)
                times_ranked.append(0)
                hotspot_xs.append("")
            index = algorithms.index(algorithm)
            times_ranked[index] = times_ranked[index] + 1
            hotspot_xs[index] = hotspot_xs[index] + str(i) + " "
    # Computed once; the earlier version ran this twice and discarded the
    # first result.
    average_scores = get_average_score_per_num_hotspots(algorithms, all_results, all_names, num_hotspots_range)
    table = pd.DataFrame()
    table['Algorithm'] = algorithms
    table['Average Score'] = average_scores
    table[count_column] = [round (num,0) for num in times_ranked]
    table['Hotspot Values'] = hotspot_xs
    table = table.sort_values(by=[count_column], ascending=False)
    return table

In [None]:
# Top-five algorithms by misclassification severity for every hotspot count; the bare name displays the table.
full_analysis = get_full_analysis(results, names, num_hotspots_range)
full_analysis

In [None]:
# How often each algorithm ranked first, and for which hotspot counts; the bare name displays the table.
best_counts = get_best_counts(results, names, num_hotspots_range, 1)
best_counts

<h3> Testing starts here </h3>

<h3> Test open_file </h3>

In [None]:
def test_open_file(file_path,expected_length,expected_columns):
    """Check that ``open_file`` loads the expected number of rows and columns.

    Raises:
        AssertionError: if the row count, the column count or any column name
            (compared in order) differs from what is expected.
    """
    loaded = open_file(file_path)
    assert len(loaded) == expected_length, "Data frame length not as expected."
    actual_columns = loaded.columns
    assert len(actual_columns) == len(expected_columns), "Actual columns not as expected."
    for expected, actual in zip(expected_columns, actual_columns):
        assert expected == actual, "Expected column " + expected + " but got " + actual
    print("All tests completed successfully")

In [None]:
# get all columns of parent datafile
def get_all_columns():
    """Return the hard-coded, ordered list of column names of the parent data file.

    NOTE(review): presumably intended as the ``expected_columns`` argument of
    ``test_open_file`` - confirm against the caller.
    NOTE(review): 'Ashbury Heights' is listed in ``PARENT_NEIGHBOURHOODS`` but
    has no column here, and there is no 'Monday' column; confirm both are
    intentional.
    """
    return ['Reports 1 day ago','Reports 2 days ago','Reports 3 days ago','Reports 4 days ago','Reports 5 days ago',
                    'Reports 6 days ago','Reports 7 days ago','Reports 14 days ago','Reports 30 days ago',
                    'Reports 365 days ago','Last 7 days reports','Last 14 days reports','Last 28 days reports',
                    'Number of businesses','Businesses 1 day ago','Businesses 2 days ago','Businesses 3 days ago',
                    'Businesses 4 days ago','Businesses 5 days ago','Businesses 6 days ago','Businesses 7 days ago',
                    'Businesses 14 days ago','Businesses 30 days ago','Businesses 365 days ago','Number of closures',
                    'Closures 1 day ago','Closures 2 days ago','Closures 3 days ago','Closures 4 days ago',
                    'Closures 5 days ago','Closures 6 days ago','Closures 7 days ago','Closures 14 days ago',
                    'Closures 30 days ago','Closures 365 days ago','Last 7 days closures','Last 14 days closures',
                    'Last 28 days closures','Number of openings','Openings 1 day ago','Openings 2 days ago',
                    'Openings 3 days ago','Openings 4 days ago','Openings 5 days ago','Openings 6 days ago',
                    'Openings 7 days ago','Openings 14 days ago','Openings 30 days ago','Openings 365 days ago',
                    'Last 7 days openings','Last 14 days openings','Last 28 days openings','Alamo Square','Anza Vista',
                    'Apparel City','Aquatic Park / Ft. Mason','Balboa Terrace','Bayview','Bernal Heights','Bret Harte',
                    'Buena Vista','Candlestick Point SRA','Castro','Cathedral Hill','Cayuga','Central Waterfront','Chinatown',
                    'Civic Center','Clarendon Heights','Cole Valley','Corona Heights','Cow Hollow','Crocker Amazon',
                    'Diamond Heights','Dogpatch','Dolores Heights','Downtown / Union Square','Duboce Triangle','Eureka Valley',
                    'Excelsior','Fairmount','Financial District','Fishermans Wharf','Forest Hill','Forest Knolls','Glen Park',
                    'Golden Gate Heights','Golden Gate Park','Haight Ashbury','Hayes Valley','Holly Park','Hunters Point',
                    'India Basin','Ingleside','Ingleside Terraces','Inner Richmond','Inner Sunset','Japantown','Laguna Honda',
                    'Lake Street','Lakeshore','Laurel Heights / Jordan Park','Lincoln Park / Ft. Miley','Little Hollywood',
                    'Lone Mountain','Lower Haight','Lower Nob Hill','Lower Pacific Heights','Marina','McLaren Park',
                    'Merced Heights','Merced Manor','Midtown Terrace','Mint Hill','Miraloma Park','Mission','Mission Bay',
                    'Mission Dolores','Mission Terrace','Monterey Heights','Mt. Davidson Manor','Nob Hill','Noe Valley',
                    'North Beach','Northern Waterfront','Oceanview','Outer Mission','Outer Richmond','Outer Sunset',
                    'Pacific Heights','Panhandle','Parkmerced','Parkside','Parnassus Heights','Peralta Heights',
                    'Polk Gulch','Portola','Potrero Hill','Presidio Heights','Presidio National Park','Presidio Terrace',
                    'Produce Market','Rincon Hill','Russian Hill','Seacliff','Sherwood Forest','Showplace Square',
                    'Silver Terrace','South Beach','South of Market','St. Francis Wood','St. Marys Park','Stonestown',
                    'Sunnydale','Sunnyside','Sutro Heights','Telegraph Hill','Tenderloin','Treasure Island','Union Street',
                    'University Mound','Upper Market','Visitacion Valley','West Portal','Western Addition',
                    'Westwood Highlands','Westwood Park','Yerba Buena Island','Friday','Saturday','Sunday','Thursday',
                    'Tuesday','Wednesday','Todays Reports','Date','Neighborhood']

In [None]:
def get_all_neighbourhoods():
    """Return the complete list of neighbourhood names used by the test fixtures.

    A brand-new list is created on every call, so the caller is free to
    mutate the result. The names appear in alphabetical order.
    """
    neighbourhood_names = [
        'Alamo Square', 'Anza Vista', 'Apparel City', 'Aquatic Park / Ft. Mason',
        'Ashbury Heights', 'Balboa Terrace', 'Bayview', 'Bernal Heights',
        'Bret Harte', 'Buena Vista', 'Candlestick Point SRA', 'Castro',
        'Cathedral Hill', 'Cayuga', 'Central Waterfront', 'Chinatown',
        'Civic Center', 'Clarendon Heights', 'Cole Valley', 'Corona Heights',
        'Cow Hollow', 'Crocker Amazon', 'Diamond Heights', 'Dogpatch',
        'Dolores Heights', 'Downtown / Union Square', 'Duboce Triangle', 'Eureka Valley',
        'Excelsior', 'Fairmount', 'Financial District', 'Fishermans Wharf',
        'Forest Hill', 'Forest Knolls', 'Glen Park', 'Golden Gate Heights',
        'Golden Gate Park', 'Haight Ashbury', 'Hayes Valley', 'Holly Park',
        'Hunters Point', 'India Basin', 'Ingleside', 'Ingleside Terraces',
        'Inner Richmond', 'Inner Sunset', 'Japantown', 'Laguna Honda',
        'Lake Street', 'Lakeshore', 'Laurel Heights / Jordan Park', 'Lincoln Park / Ft. Miley',
        'Little Hollywood', 'Lone Mountain', 'Lower Haight', 'Lower Nob Hill',
        'Lower Pacific Heights', 'Marina', 'McLaren Park', 'Merced Heights',
        'Merced Manor', 'Midtown Terrace', 'Mint Hill', 'Miraloma Park',
        'Mission', 'Mission Bay', 'Mission Dolores', 'Mission Terrace',
        'Monterey Heights', 'Mt. Davidson Manor', 'Nob Hill', 'Noe Valley',
        'North Beach', 'Northern Waterfront', 'Oceanview', 'Outer Mission',
        'Outer Richmond', 'Outer Sunset', 'Pacific Heights', 'Panhandle',
        'Parkmerced', 'Parkside', 'Parnassus Heights', 'Peralta Heights',
        'Polk Gulch', 'Portola', 'Potrero Hill', 'Presidio Heights',
        'Presidio National Park', 'Presidio Terrace', 'Produce Market', 'Rincon Hill',
        'Russian Hill', 'Seacliff', 'Sherwood Forest', 'Showplace Square',
        'Silver Terrace', 'South Beach', 'South of Market', 'St. Francis Wood',
        'St. Marys Park', 'Stonestown', 'Sunnydale', 'Sunnyside',
        'Sutro Heights', 'Telegraph Hill', 'Tenderloin', 'Treasure Island',
        'Union Street', 'University Mound', 'Upper Market', 'Visitacion Valley',
        'West Portal', 'Western Addition', 'Westwood Highlands', 'Westwood Park',
        'Yerba Buena Island',
    ]
    return neighbourhood_names

In [None]:
def get_all_ones(count=117):
    """Return a new list holding ``count`` copies of the integer 1.

    Args:
        count: How many ones to produce. Defaults to 117, the length that was
            previously hard-coded and which equals the number of names
            returned by ``get_all_neighbourhoods``. Zero or a negative value
            yields an empty list.

    Returns:
        A fresh ``list`` of ``count`` ones (never shared between calls).
    """
    return [1] * count

In [None]:
# Successful load: the fixture file is expected to open with the given length and column set.
file_path = "tuning_test_data"
expected_length = 819
expected_columns = get_all_columns()
test_open_file(file_path, expected_length, expected_columns)

In [None]:
# Failed load: a path that should not exist is expected to give length 0 and no columns.
file_path = "incorrect_path"
expected_length = 0
expected_columns = []
test_open_file(file_path, expected_length, expected_columns)

<h3> Test get_hotspots </h3>

In [None]:
def test_get_hotspots(test_model_name,test_feature_name,calendar_date, expected_y_actual, expected_actual_neighbourhoods,
                     expected_y_predict, expected_prediction_neighbourhoods, file_name):
    data = open_file(file_name)
    y_actual, actual_neighbourhoods, y_predict, prediction_neighbourhoods = get_hotspots(data, 
                                                                                     test_model_name, 
                                                                                     test_feature_name,
                                                                                     calendar_date)
    for i in range(0,len(y_actual)):
        assert y_actual[i] == expected_y_actual[i], "Y actual not as expected."
    for i in range(0,len(expected_y_actual)):
        assert expected_y_actual[i] == y_actual[i], "Y actual not as expected."
        
    for i in range(0,len(y_predict)):
        assert y_predict[i] == expected_y_predict[i], "Y actual not as expected."
    for i in range(0,len(expected_y_predict)):
        assert expected_y_predict[i] == y_predict[i], "Y actual not as expected."   
    for i in range(0,len(actual_neighbourhoods)):
        assert actual_neighbourhoods[i] == expected_actual_neighbourhoods[i], "Actual neighbourhoods not as expected."
    for i in range(0,len(expected_actual_neighbourhoods)):
        assert expected_actual_neighbourhoods[i] == actual_neighbourhoods[i], "Actual neighbourhoods not as expected."    
    for i in range(0,len(prediction_neighbourhoods)):
        assert prediction_neighbourhoods[i] == expected_prediction_neighbourhoods[i], "Predicted neighbourhoods not as expected."  
    for i in range(0,len(expected_prediction_neighbourhoods)):
        assert expected_prediction_neighbourhoods[i] == prediction_neighbourhoods[i], "Predicted neighbourhoods not as expected."
    print("Tests completed successfully")

In [None]:
# Test fixture settings: data file and date used by the hotspot tests.
file_name = "tuning_test_data"
calendar_date = "05/01/2021"

# Display names and file tags for the throwaway test model / feature set.
test_model_name = "Tuning Template Test"
test_model_file_tag = "tuning_template_test_model"
test_feature_name = "Arbitrary Feature Name"
test_feature_file_tag = "arbitrary_name"
test_feature_features = [
    'Reports 1 day ago', 'Reports 2 days ago', 'Reports 3 days ago', 'Reports 4 days ago',
    'Reports 5 days ago', 'Reports 6 days ago', 'Reports 7 days ago', 'Reports 14 days ago',
    'Reports 30 days ago', 'Reports 365 days ago',
]

# Register the test entries in the notebook-level lookup tables (this mutates them).
FEATURES[test_feature_name] = test_feature_features
FEATURE_FILE_TAGS[test_feature_name] = test_feature_file_tag
FEATURE_NAMES_BY_FILE_TAG[test_feature_file_tag] = test_feature_name
MODEL_FILE_TAGS[test_model_name] = test_model_file_tag

In [None]:
# Expected values handed to test_get_hotspots below. The counts are listed in
# non-increasing order; presumably entry i belongs to neighbourhood i of the
# name list that follows -- TODO confirm against get_hotspots.
# NOTE: these module-level names are also read as globals by test_load_model
# and test_make_prediction, so this cell must run before those tests.
expected_y_actual = [5, 4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
expected_actual_neighbourhoods = ['Downtown / Union Square', 'Bernal Heights', 'Western Addition', 'Portola',
 'Bayview', 'Eureka Valley', 'South Beach', 'Lincoln Park / Ft. Miley',
 'Hunters Point', 'Potrero Hill', 'Presidio Heights', 'Cow Hollow',
 'Russian Hill', 'Cole Valley', 'Mission Terrace', 'Hayes Valley',
 'Stonestown', 'Central Waterfront', 'Northern Waterfront', 'Buena Vista',
 'Visitacion Valley', 'North Beach', 'Forest Hill', 'South of Market',
 'Haight Ashbury', 'Golden Gate Park', 'Yerba Buena Island',
 'Golden Gate Heights', 'Ingleside Terraces', 'Inner Richmond',
 'Inner Sunset', 'Lake Street', 'Lakeshore', 'Ingleside', 'Diamond Heights',
 'Glen Park', 'Forest Knolls', 'Financial District', 'Excelsior',
 'Duboce Triangle', 'Crocker Amazon', 'Corona Heights', 'Clarendon Heights',
 'Civic Center', 'Candlestick Point SRA', 'Balboa Terrace', 'Anza Vista',
 'Laurel Heights / Jordan Park', 'Merced Manor', 'Little Hollywood',
 'St. Francis Wood', 'Panhandle', 'Parkside', 'Presidio National Park',
 'Seacliff', 'Sherwood Forest', 'Silver Terrace', 'Sunnyside', 'Outer Sunset',
 'Telegraph Hill', 'Tenderloin', 'Treasure Island', 'Upper Market',
 'West Portal', 'Westwood Highlands', 'Pacific Heights', 'Outer Richmond',
 'Lone Mountain', 'Mission', 'Lower Pacific Heights', 'Marina',
 'Merced Heights', 'Westwood Park', 'Midtown Terrace', 'Miraloma Park',
 'Mission Bay', 'Outer Mission', 'Mission Dolores', 'Monterey Heights',
 'Mt. Davidson Manor', 'Nob Hill', 'Noe Valley', 'Oceanview', 'Alamo Square']
# The predicted values are expected to equal the actual ones; these names are
# bound to the very same list objects (aliases, not copies).
expected_y_predict = expected_y_actual
expected_prediction_neighbourhoods = expected_actual_neighbourhoods
# test_model_name, test_feature_name, calendar_date and file_name are not set
# in this cell; they come from an earlier cell.
test_get_hotspots(test_model_name,test_feature_name,calendar_date, expected_y_actual, expected_actual_neighbourhoods,
                     expected_y_predict, expected_prediction_neighbourhoods, file_name)

<h3> Test load_model </h3>

In [None]:
def test_load_model(x_data, y_data, neighbourhoods_data, model_key, features_key):
    y_actual, actual_neighbourhoods, y_predict, prediction_neighbourhoods = load_model(x_data, 
                                                                                       y_data, 
                                                                                       neighbourhoods_data,
                                                                                       model_key, 
                                                                                       features_key)
    for i in range(0,len(y_actual)):
        assert y_actual[i] == expected_y_actual[i], "Y actual not as expected."
    for i in range(0,len(expected_y_actual)):
        assert expected_y_actual[i] == y_actual[i], "Y actual not as expected."   
    for i in range(0,len(y_predict)):
        assert y_predict[i] == expected_y_predict[i], "Y actual not as expected."
    for i in range(0,len(expected_y_predict)):
        assert expected_y_predict[i] == y_predict[i], "Y actual not as expected."   
    for i in range(0,len(actual_neighbourhoods)):
        assert actual_neighbourhoods[i] == expected_actual_neighbourhoods[i], "Actual neighbourhoods not as expected."
    for i in range(0,len(expected_actual_neighbourhoods)):
        assert expected_actual_neighbourhoods[i] == actual_neighbourhoods[i], "Actual neighbourhoods not as expected."    
    for i in range(0,len(prediction_neighbourhoods)):
        assert prediction_neighbourhoods[i] == expected_prediction_neighbourhoods[i], "Predicted neighbourhoods not as expected."  
    for i in range(0,len(expected_prediction_neighbourhoods)):
        assert expected_prediction_neighbourhoods[i] == prediction_neighbourhoods[i], "Predicted neighbourhoods not as expected."
    print("Tests completed successfully")

In [None]:
# Fixture for load_model: every feature column and the target column hold the
# same list of ones, paired with the full neighbourhood list.
all_ones = get_all_ones()
x_columns = [
    'Reports 1 day ago', 'Reports 2 days ago', 'Reports 3 days ago', 'Reports 4 days ago',
    'Reports 5 days ago', 'Reports 6 days ago', 'Reports 7 days ago', 'Reports 14 days ago',
    'Reports 30 days ago', 'Reports 365 days ago',
]
x_data = pd.DataFrame()
for col in x_columns:
    x_data[col] = all_ones
y_data = pd.DataFrame({'Todays Reports': all_ones})
neighbourhoods_data = pd.DataFrame({'Neighborhood': get_all_neighbourhoods()})
model_key = 'Tuning Template Test'
features_key = 'Arbitrary Feature Name'
test_load_model(x_data, y_data, neighbourhoods_data, model_key, features_key)

<h3> Test make_prediction </h3>

In [None]:
def test_make_prediction(model_path, x_data, y_data, neighbourhoods_data):
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    y_actual, actual_neighbourhoods, y_predict, prediction_neighbourhoods = make_prediction(model,
                                                                                            x_data, 
                                                                                            y_data, 
                                                                                            neighbourhoods_data)
    for i in range(0,len(y_actual)):
        assert y_actual[i] == expected_y_actual[i], "Y actual not as expected."
    for i in range(0,len(expected_y_actual)):
        assert expected_y_actual[i] == y_actual[i], "Y actual not as expected."   
    for i in range(0,len(y_predict)):
        assert y_predict[i] == expected_y_predict[i], "Y actual not as expected."
    for i in range(0,len(expected_y_predict)):
        assert expected_y_predict[i] == y_predict[i], "Y actual not as expected."   
    for i in range(0,len(actual_neighbourhoods)):
        assert actual_neighbourhoods[i] == expected_actual_neighbourhoods[i], "Actual neighbourhoods not as expected."
    for i in range(0,len(expected_actual_neighbourhoods)):
        assert expected_actual_neighbourhoods[i] == actual_neighbourhoods[i], "Actual neighbourhoods not as expected."    
    for i in range(0,len(prediction_neighbourhoods)):
        assert prediction_neighbourhoods[i] == expected_prediction_neighbourhoods[i], "Predicted neighbourhoods not as expected."  
    for i in range(0,len(expected_prediction_neighbourhoods)):
        assert expected_prediction_neighbourhoods[i] == prediction_neighbourhoods[i], "Predicted neighbourhoods not as expected."
    print("Tests completed successfully")

In [None]:
# Run the make_prediction test against the pickled test model.
# x_data, y_data and neighbourhoods_data are not set here; they come from an earlier cell.
model_path = "CrimeGUI/Models/tuning_template_test_model_arbitrary_name"
test_make_prediction(model_path, x_data, y_data, neighbourhoods_data)

<h3> Test get non negative value </h3>

In [None]:
def test_get_non_negative_value(nums_in,expected_outs):
    for i in range(0,len(nums_in)):
        result = get_non_negative_value(nums_in[i])
        assert result == expected_outs[i], "Returned value not as expected"
    print("Tests completed successfully.")

In [None]:
# Negative inputs are expected to come back as 0; zero and positive inputs unchanged.
nums_in = [-10, -2, -1, 0, 1, 2, 10]
expected_outs = [0, 0, 0, 0, 1, 2, 10]
test_get_non_negative_value(nums_in, expected_outs)

<h3> Test merge sub neighbourhoods </h3>

In [None]:
def test_merge_sub_neighbourhoods(y_data_in,y_predict_in,neighbourhoods_data_in, expected_y_actual, expected_y_predict,
                                 expected_neighbourhoods_data):
    y_actual,y_predict,neighbourhoods_data = merge_sub_neighbourhoods(y_data_in,y_predict_in,neighbourhoods_data_in)
    neighbourhoods_data.reset_index(drop=True, inplace=True)
    for i in range(0,len(y_actual)):
        assert y_actual[i] == expected_y_actual[i], "Y actual not as expected."
    for i in range(0,len(expected_y_actual)):
        assert expected_y_actual[i] == y_actual[i], "Y actual not as expected."   
    for i in range(0,len(y_predict)):
        assert y_predict[i] == expected_y_predict[i], "Y actual not as expected."
    for i in range(0,len(expected_y_predict)):
        assert expected_y_predict[i] == y_predict[i], "Y actual not as expected." 
    for i in range(0,len(neighbourhoods_data)):
        assert neighbourhoods_data['Neighborhood'][i] == expected_neighbourhoods_data[i], "Neighbourhoods not as expected."
    for i in range(0,len(expected_neighbourhoods_data)):
        assert expected_neighbourhoods_data[i] == neighbourhoods_data['Neighborhood'][i], "Neighbourhoods not as expected."   
    print("Tests completed successfully")

In [None]:
# Input fixture: every one of the neighbourhoods reports exactly one incident,
# both as the actual count and as the prediction.
y_data_in = pd.DataFrame()
y_data_in['Todays Reports'] = get_all_ones()
# Built here instead of reusing the `all_ones` list from an earlier cell, so
# this cell does not depend on (or alias) state created elsewhere.
y_predict_in = get_all_ones()
neighbourhoods_data_in = pd.DataFrame()
neighbourhoods_data_in['Neighborhood'] = get_all_neighbourhoods()
# Expected merged counts; values above 1 are presumably neighbourhoods that
# absorbed one or more sub-neighbourhoods -- TODO confirm against
# merge_sub_neighbourhoods.
expected_y_actual = [1, 1, 1, 3, 4, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 5, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 
                     2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 3, 2, 2, 1, 2, 1, 1, 1, 2,
                     2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1]
# Predictions are expected to merge to the same values (same list object).
expected_y_predict = expected_y_actual
expected_neighbourhoods_data = ['Alamo Square', 'Anza Vista', 'Balboa Terrace', 'Bayview', 'Bernal Heights', 'Buena Vista', 
                                'Candlestick Point SRA', 'Central Waterfront', 'Civic Center', 'Clarendon Heights', 
                                'Cole Valley', 'Corona Heights', 'Cow Hollow', 'Crocker Amazon', 'Diamond Heights', 
                                'Downtown / Union Square', 'Duboce Triangle', 'Eureka Valley', 'Excelsior', 
                                'Financial District', 'Forest Hill', 'Forest Knolls', 'Glen Park', 'Golden Gate Heights', 
                                'Golden Gate Park', 'Haight Ashbury', 'Hayes Valley', 'Hunters Point', 'Ingleside', 
                                'Ingleside Terraces', 'Inner Richmond', 'Inner Sunset', 'Lake Street', 'Lakeshore', 
                                'Laurel Heights / Jordan Park', 'Lincoln Park / Ft. Miley', 'Little Hollywood', 
                                'Lone Mountain', 'Lower Pacific Heights', 'Marina', 'Merced Heights', 'Merced Manor', 
                                'Midtown Terrace', 'Miraloma Park', 'Mission', 'Mission Bay', 'Mission Dolores', 
                                'Mission Terrace', 'Monterey Heights', 'Mt. Davidson Manor', 'Nob Hill', 'Noe Valley', 
                                'North Beach', 'Northern Waterfront', 'Oceanview', 'Outer Mission', 'Outer Richmond', 
                                'Outer Sunset', 'Pacific Heights', 'Panhandle', 'Parkside', 'Portola', 'Potrero Hill', 
                                'Presidio Heights', 'Presidio National Park', 'Russian Hill', 'Seacliff', 'Sherwood Forest', 
                                'Silver Terrace', 'South Beach', 'South of Market', 'St. Francis Wood', 'Stonestown', 
                                'Sunnyside', 'Telegraph Hill', 'Tenderloin', 'Treasure Island', 'Upper Market', 
                                'Visitacion Valley', 'West Portal', 'Western Addition', 'Westwood Highlands', 'Westwood Park', 
                                'Yerba Buena Island']
test_merge_sub_neighbourhoods(y_data_in,y_predict_in,neighbourhoods_data_in, expected_y_actual, expected_y_predict,
                                 expected_neighbourhoods_data)

<h3> Test calculate accuracy </h3>

In [None]:
def test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, prediction_neighbourhoods, expected_scores
                            ,num_hotspots):
    
    result = calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, prediction_neighbourhoods,num_hotspots)
    #returns [sensitivity, specificity, precision, f1, mcc, misclassification_severity]
    assert result [0] == expected_scores [0], "Sensitivity not as expected."
    assert result [1] == expected_scores [1], "Specificity not as expected."
    assert result [2] == expected_scores [2], "Precision not as expected."
    assert result [3] == expected_scores [3], "F1 not as expected."
    assert result [4] == expected_scores [4], "MCC not as expected."
    assert result [5] == expected_scores [5], "Misclassification severity not as expected."
    print ("All tests completed successfully")

In [None]:
def get_x_neighbourhoods(x, reverse):
    """Build ``x`` placeholder names: 'Neighbourhood 0' .. 'Neighbourhood x-1'.

    Args:
        x: Number of names to generate.
        reverse: When truthy, the names are returned in descending index order.

    Returns:
        A new list of ``x`` name strings.
    """
    names = ['Neighbourhood ' + str(index) for index in range(x)]
    return names[::-1] if reverse else names

In [None]:
# End-to-end check: hotspots computed from the fixture file with the test model
# are expected to give perfect scores. `file_name` is not set here; it comes
# from an earlier cell.
test_model_name = "Tuning Template Test"
test_feature_name = "Arbitrary Feature Name"
calendar_date = "05/01/2021"
num_hotspots = 10
expected_scores = [1, 1, 1, 1, 1, 0]
data = open_file(file_name)
y_actual, actual_neighbourhoods, y_predict, prediction_neighbourhoods = get_hotspots(
    data, test_model_name, test_feature_name, calendar_date)
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, prediction_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual and predicted counts both descend 10..1, both name lists in
# forward order, 10 hotspots -> perfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
y_actual = np.array(ten_to_one)
y_predict = np.array(ten_to_one)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=False), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=False), dtype=object)
num_hotspots = 10
expected_scores = [1, 1, 1, 1, 1, 0]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual and predicted counts both descend 10..1, actual names
# reversed, predicted names forward, 10 hotspots -> perfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
y_actual = np.array(ten_to_one)
y_predict = np.array(ten_to_one)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=True), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=False), dtype=object)
num_hotspots = 10
expected_scores = [1, 1, 1, 1, 1, 0]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual and predicted counts both descend 10..1, actual names
# forward, predicted names reversed, 10 hotspots -> perfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
y_actual = np.array(ten_to_one)
y_predict = np.array(ten_to_one)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=False), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=True), dtype=object)
num_hotspots = 10
expected_scores = [1, 1, 1, 1, 1, 0]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual and predicted counts both descend 10..1, both name lists
# reversed, 10 hotspots -> perfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
y_actual = np.array(ten_to_one)
y_predict = np.array(ten_to_one)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=True), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=True), dtype=object)
num_hotspots = 10
expected_scores = [1, 1, 1, 1, 1, 0]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual counts ascend 1..10, predicted counts descend 10..1, both
# name lists forward, 10 hotspots -> perfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
one_to_ten = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
y_actual = np.array(one_to_ten)
y_predict = np.array(ten_to_one)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=False), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=False), dtype=object)
num_hotspots = 10
expected_scores = [1, 1, 1, 1, 1, 0]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual counts descend 10..1, predicted counts ascend 1..10, both
# name lists forward, 10 hotspots -> perfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
one_to_ten = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
y_actual = np.array(ten_to_one)
y_predict = np.array(one_to_ten)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=False), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=False), dtype=object)
num_hotspots = 10
expected_scores = [1, 1, 1, 1, 1, 0]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual and predicted counts both ascend 1..10, both name lists
# forward, 10 hotspots -> perfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
one_to_ten = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
y_actual = np.array(one_to_ten)
y_predict = np.array(one_to_ten)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=False), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=False), dtype=object)
num_hotspots = 10
expected_scores = [1, 1, 1, 1, 1, 0]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual counts ascend 1..10 with reversed names, predicted counts
# descend 10..1 with forward names, 10 hotspots -> perfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
one_to_ten = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
y_actual = np.array(one_to_ten)
y_predict = np.array(ten_to_one)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=True), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=False), dtype=object)
num_hotspots = 10
expected_scores = [1, 1, 1, 1, 1, 0]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual counts ascend 1..10 with forward names, predicted counts
# descend 10..1 with reversed names, 10 hotspots -> perfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
one_to_ten = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
y_actual = np.array(one_to_ten)
y_predict = np.array(ten_to_one)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=False), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=True), dtype=object)
num_hotspots = 10
expected_scores = [1, 1, 1, 1, 1, 0]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual counts ascend 1..10, predicted counts descend 10..1, both
# name lists reversed, 10 hotspots -> perfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
one_to_ten = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
y_actual = np.array(one_to_ten)
y_predict = np.array(ten_to_one)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=True), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=True), dtype=object)
num_hotspots = 10
expected_scores = [1, 1, 1, 1, 1, 0]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual and predicted counts both descend 10..1, both name lists
# forward, 9 hotspots -> perfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
y_actual = np.array(ten_to_one)
y_predict = np.array(ten_to_one)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=False), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=False), dtype=object)
num_hotspots = 9
expected_scores = [1, 1, 1, 1, 1, 0]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual and predicted counts both descend 10..1, actual names
# reversed, predicted names forward, 9 hotspots -> imperfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
y_actual = np.array(ten_to_one)
y_predict = np.array(ten_to_one)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=True), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=False), dtype=object)
num_hotspots = 9
expected_scores = [0.8888888888888888, 0, 0.8888888888888888, 0.88889, -0.11111, 0.16666666666666666]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual and predicted counts both descend 10..1, actual names
# forward, predicted names reversed, 9 hotspots -> imperfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
y_actual = np.array(ten_to_one)
y_predict = np.array(ten_to_one)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=False), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=True), dtype=object)
num_hotspots = 9
expected_scores = [0.8888888888888888, 0, 0.8888888888888888, 0.88889, -0.11111, 0.16666666666666666]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual and predicted counts both descend 10..1, both name lists
# reversed, 9 hotspots -> perfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
y_actual = np.array(ten_to_one)
y_predict = np.array(ten_to_one)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=True), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=True), dtype=object)
num_hotspots = 9
expected_scores = [1, 1, 1, 1, 1, 0]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual counts descend 10..1, predicted counts ascend 1..10, both
# name lists forward, 9 hotspots -> perfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
one_to_ten = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
y_actual = np.array(ten_to_one)
y_predict = np.array(one_to_ten)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=False), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=False), dtype=object)
num_hotspots = 9
expected_scores = [1, 1, 1, 1, 1, 0]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual counts descend 10..1 with reversed names, predicted counts
# ascend 1..10 with forward names, 9 hotspots -> imperfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
one_to_ten = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
y_actual = np.array(ten_to_one)
y_predict = np.array(one_to_ten)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=True), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=False), dtype=object)
num_hotspots = 9
expected_scores = [0.8888888888888888, 0, 0.8888888888888888, 0.88889, -0.11111, 0.16666666666666666]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual counts descend 10..1 with forward names, predicted counts
# ascend 1..10 with reversed names, 9 hotspots -> imperfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
one_to_ten = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
y_actual = np.array(ten_to_one)
y_predict = np.array(one_to_ten)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=False), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=True), dtype=object)
num_hotspots = 9
expected_scores = [0.8888888888888888, 0, 0.8888888888888888, 0.88889, -0.11111, 0.16666666666666666]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

In [None]:
# Scenario: actual counts descend 10..1, predicted counts ascend 1..10, both
# name lists reversed, 9 hotspots -> perfect scores expected.
ten_to_one = [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]
one_to_ten = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
y_actual = np.array(ten_to_one)
y_predict = np.array(one_to_ten)
actual_neighbourhoods = np.array(get_x_neighbourhoods(len(y_actual), reverse=True), dtype=object)
predicted_neighbourhoods = np.array(get_x_neighbourhoods(len(y_predict), reverse=True), dtype=object)
num_hotspots = 9
expected_scores = [1, 1, 1, 1, 1, 0]
test_calculate_accuracy(y_actual, actual_neighbourhoods, y_predict, predicted_neighbourhoods, expected_scores, num_hotspots)

<h3> Test determine hotspots </h3>

In [None]:
def test_determine_hotspots(y_actual, actual_neighbourhoods, prediction_neighbourhoods,num_hotspots,expected_result):
    result = determine_hotspots(y_actual, actual_neighbourhoods, prediction_neighbourhoods,num_hotspots)[0]
    for i in range (0,len(expected_result)):
        assert expected_result[i] in result, "Hotspots not as expected"
    print("All tests completed successfully")

In [None]:
# Case: 10 hotspots out of 10 neighbourhoods; actual and prediction lists both in default order.
ten_to_one = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_actual = np.ndarray((10,),buffer = np.array(ten_to_one))
actual_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
prediction_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
expected_result = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
num_hotspots = 10
test_determine_hotspots(y_actual, actual_neighbourhoods, prediction_neighbourhoods,num_hotspots,expected_result)

In [None]:
# Case: 10 hotspots out of 10; actual list requested with reverse=True, prediction list in default order.
ten_to_one = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_actual = np.ndarray((10,),buffer = np.array(ten_to_one))
actual_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = True), dtype=object)
prediction_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
expected_result = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
num_hotspots = 10
test_determine_hotspots(y_actual, actual_neighbourhoods, prediction_neighbourhoods,num_hotspots,expected_result)

In [None]:
# Case: 10 hotspots out of 10; actual list in default order, prediction list requested with reverse=True.
ten_to_one = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_actual = np.ndarray((10,),buffer = np.array(ten_to_one))
actual_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
prediction_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = True), dtype=object)
expected_result = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
num_hotspots = 10
test_determine_hotspots(y_actual, actual_neighbourhoods, prediction_neighbourhoods,num_hotspots,expected_result)

In [None]:
# Case: 10 hotspots out of 10; actual and prediction lists both requested with reverse=True.
ten_to_one = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_actual = np.ndarray((10,),buffer = np.array(ten_to_one))
actual_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = True), dtype=object)
prediction_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = True), dtype=object)
expected_result = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
num_hotspots = 10
test_determine_hotspots(y_actual, actual_neighbourhoods, prediction_neighbourhoods,num_hotspots,expected_result)

In [None]:
# Case: 9 hotspots out of 10; both lists in default order; expects the 9 names from get_x_neighbourhoods(9).
ten_to_one = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_actual = np.ndarray((10,),buffer = np.array(ten_to_one))
actual_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
prediction_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
expected_result = np.array(get_x_neighbourhoods(9, reverse = False), dtype=object)
num_hotspots = 9
test_determine_hotspots(y_actual, actual_neighbourhoods, prediction_neighbourhoods,num_hotspots,expected_result)

In [None]:
# Case: 9 hotspots out of 10; actual list requested with reverse=True, prediction list in default order.
ten_to_one = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_actual = np.ndarray((10,),buffer = np.array(ten_to_one))
actual_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = True), dtype=object)
prediction_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
expected_result = np.array(get_x_neighbourhoods(9, reverse = False), dtype=object)
num_hotspots = 9
test_determine_hotspots(y_actual, actual_neighbourhoods, prediction_neighbourhoods,num_hotspots,expected_result)

In [None]:
# Case: 9 hotspots out of 10; actual list in default order, prediction list requested with reverse=True.
# The expected names are spelled out explicitly (Neighbourhood 9 down to Neighbourhood 1).
ten_to_one = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_actual = np.ndarray((10,),buffer = np.array(ten_to_one))
actual_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
prediction_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = True), dtype=object)
expected_result = np.array(['Neighbourhood 9','Neighbourhood 8', 'Neighbourhood 7', 'Neighbourhood 6', 'Neighbourhood 5' ,
                   'Neighbourhood 4', 'Neighbourhood 3', 'Neighbourhood 2', 'Neighbourhood 1'], dtype=object)
num_hotspots = 9
test_determine_hotspots(y_actual, actual_neighbourhoods, prediction_neighbourhoods,num_hotspots,expected_result)

In [None]:
# Case: 9 hotspots out of 10; actual and prediction lists both requested with reverse=True.
# The expected names are spelled out explicitly (Neighbourhood 9 down to Neighbourhood 1).
ten_to_one = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_actual = np.ndarray((10,),buffer = np.array(ten_to_one))
actual_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = True), dtype=object)
prediction_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = True), dtype=object)
expected_result = np.array(['Neighbourhood 9','Neighbourhood 8', 'Neighbourhood 7', 'Neighbourhood 6', 'Neighbourhood 5' ,
                   'Neighbourhood 4', 'Neighbourhood 3', 'Neighbourhood 2', 'Neighbourhood 1'], dtype=object)
num_hotspots = 9
test_determine_hotspots(y_actual, actual_neighbourhoods, prediction_neighbourhoods,num_hotspots,expected_result)

In [None]:
# Case: 5 hotspots; actual counts contain ties (three 1.0 values and three 0.0 values);
# both neighbourhood lists in default order.
five_to_zero = [5.0,4.0,3.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0]
y_actual = np.ndarray((10,),buffer = np.array(five_to_zero))
actual_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
prediction_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
expected_result = np.array(get_x_neighbourhoods(5, reverse = False), dtype=object)
num_hotspots = 5
test_determine_hotspots(y_actual, actual_neighbourhoods, prediction_neighbourhoods,num_hotspots,expected_result)

In [None]:
# Case: 5 hotspots with tied counts; actual list requested with reverse=True, prediction list in default order.
five_to_zero = [5.0,4.0,3.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0]
y_actual = np.ndarray((10,),buffer = np.array(five_to_zero))
actual_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = True), dtype=object)
prediction_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
expected_result = np.array(get_x_neighbourhoods(5, reverse = False), dtype=object)
num_hotspots = 5
test_determine_hotspots(y_actual, actual_neighbourhoods, prediction_neighbourhoods,num_hotspots,expected_result)

In [None]:
# Case: 5 hotspots with tied counts; actual list in default order, prediction list requested with reverse=True.
# The expected names are spelled out explicitly (Neighbourhood 9 down to Neighbourhood 5).
five_to_zero = [5.0,4.0,3.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0]
y_actual = np.ndarray((10,),buffer = np.array(five_to_zero))
actual_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
prediction_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = True), dtype=object)
expected_result = np.array(['Neighbourhood 9', 'Neighbourhood 8', 'Neighbourhood 7', 'Neighbourhood 6',
                            'Neighbourhood 5'], dtype=object)
num_hotspots = 5
test_determine_hotspots(y_actual, actual_neighbourhoods, prediction_neighbourhoods,num_hotspots,expected_result)

In [None]:
# Case: 5 hotspots with tied counts; both neighbourhood lists in default order, but the
# expected names are requested with reverse=True. test_determine_hotspots only checks
# membership, so the ordering of expected_result does not affect the outcome.
five_to_zero = [5.0,4.0,3.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0]
y_actual = np.ndarray((10,),buffer = np.array(five_to_zero))
actual_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
prediction_neighbourhoods = np.array(get_x_neighbourhoods(10, reverse = False), dtype=object)
expected_result = np.array(get_x_neighbourhoods(5, reverse = True), dtype=object)
num_hotspots = 5
test_determine_hotspots(y_actual, actual_neighbourhoods, prediction_neighbourhoods,num_hotspots,expected_result)

<h3> Test calculate_standard_scores </h3>

In [None]:
def test_calculate_standard_scores(predicted_hotspots, actual_hotspots, total_predictions,expected):
    num_predictions = len(predicted_hotspots)
    expected_result = np.ndarray((5,),buffer = np.array(expected))
    result = calculate_standard_scores(num_predictions, predicted_hotspots, actual_hotspots, total_predictions)
    assert len(expected_result) == len(result), "Scores not as expected"
    for i in range (0,len(expected_result)):
        assert expected_result[i] == result[i], "Scores not as expected"
    print("All tests completed successfully")

In [None]:
# Case: predicted and actual hotspots are identical, total_predictions = 5; all five scores expected to be 1.0.
predicted_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
actual_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
total_predictions = 5
expected_result = [1.0,1.0,1.0,1.0,1.0]
test_calculate_standard_scores(predicted_hotspots, actual_hotspots, total_predictions,expected_result)

In [None]:
# Case: predicted and actual hotspots are identical, total_predictions = 10; all five scores expected to be 1.0.
predicted_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
actual_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
total_predictions = 10
expected_result = [1.0,1.0,1.0,1.0,1.0]
test_calculate_standard_scores(predicted_hotspots, actual_hotspots, total_predictions,expected_result)

In [None]:
# Case: one mismatch (Neighbourhood 5 predicted, Neighbourhood 6 actual), total_predictions = 5.
predicted_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
actual_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 6']
total_predictions = 5
expected_result = [0.8,1,0.8,0.8,1.0]
test_calculate_standard_scores(predicted_hotspots, actual_hotspots, total_predictions,expected_result)

In [None]:
# Case: one mismatch (Neighbourhood 5 predicted, Neighbourhood 6 actual), total_predictions = 10.
predicted_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
actual_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 6']
total_predictions = 10
expected_result = [0.8,0.8,0.8,0.8,0.6]
test_calculate_standard_scores(predicted_hotspots, actual_hotspots, total_predictions,expected_result)

In [None]:
# Case: one mismatch (Neighbourhood 5 predicted, Neighbourhood 6 actual), total_predictions = 15.
predicted_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
actual_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 6']
total_predictions = 15
expected_result = [0.8,0.9,0.8,0.8,0.7]
test_calculate_standard_scores(predicted_hotspots, actual_hotspots, total_predictions,expected_result)

In [None]:
# Case: predicted (1-5) and actual (6-10) hotspots share no neighbourhood, total_predictions = 5.
predicted_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
actual_hotspots = ['Neighbourhood 6','Neighbourhood 7','Neighbourhood 8','Neighbourhood 9','Neighbourhood 10']
total_predictions = 5
expected_result = [0.0,1.0,0.0,0.0,1.0]
test_calculate_standard_scores(predicted_hotspots, actual_hotspots, total_predictions,expected_result)

In [None]:
# Case: predicted (1-5) and actual (6-10) hotspots share no neighbourhood, total_predictions = 10.
predicted_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
actual_hotspots = ['Neighbourhood 6','Neighbourhood 7','Neighbourhood 8','Neighbourhood 9','Neighbourhood 10']
total_predictions = 10
expected_result = [0.0,0.0,0.0,0.0,-1.0]
test_calculate_standard_scores(predicted_hotspots, actual_hotspots, total_predictions,expected_result)

In [None]:
# Case: predicted (1-5) and actual (6-10) hotspots share no neighbourhood, total_predictions = 15.
predicted_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
actual_hotspots = ['Neighbourhood 6','Neighbourhood 7','Neighbourhood 8','Neighbourhood 9','Neighbourhood 10']
total_predictions = 15
expected_result = [0.0,0.5,0.0,0.0,-0.5]
test_calculate_standard_scores(predicted_hotspots, actual_hotspots, total_predictions,expected_result)

<h3> Test get_missed_incidents </h3>

In [None]:
def test_get_missed_incidents(predicted_hotspots, actual_hotspots, y_actual, y_predict, actual_neighbourhoods,expected_result):
    num_predictions = len(predicted_hotspots)
    
    
    actual_result = get_missed_incidents(predicted_hotspots, actual_hotspots, y_actual, y_predict, num_predictions, 
                                  actual_neighbourhoods)

    assert actual_result[0] == expected_result[0], "Result not as expected."
    print("All tests completed successfully.")

In [None]:
# Case: predicted hotspots equal the actual hotspots, so the expected first result is 0.
actuals = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_actual = np.ndarray((10,),buffer = np.array(actuals))
predicts = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_predict = np.ndarray((10,),buffer = np.array(predicts))
actual_neighbourhoods = np.array(['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5',
                                  'Neighbourhood 6','Neighbourhood 7','Neighbourhood 8','Neighbourhood 9','Neighbourhood 10'], 
                                 dtype=object)
predicted_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
actual_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']


expected_result = [0]
test_get_missed_incidents(predicted_hotspots, actual_hotspots, y_actual, y_predict, actual_neighbourhoods,expected_result)

In [None]:
# Case: prediction picks Neighbourhood 6 instead of Neighbourhood 5.
# The expected value is worked out by hand below as
# (incidents in actual hotspots - incidents in predicted hotspots) / incidents in actual hotspots.
actuals = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_actual = np.ndarray((10,),buffer = np.array(actuals))
predicts = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_predict = np.ndarray((10,),buffer = np.array(predicts))
actual_neighbourhoods = np.array(['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5',
                                  'Neighbourhood 6','Neighbourhood 7','Neighbourhood 8','Neighbourhood 9','Neighbourhood 10'], 
                                 dtype=object)
predicted_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 6']
actual_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
actual_hotspot_incidents = 10+9+8+7+6
predicted_hotspot_incidents = 10+9+8+7+5
missed = actual_hotspot_incidents - predicted_hotspot_incidents
expected_result = [missed/actual_hotspot_incidents]

test_get_missed_incidents(predicted_hotspots, actual_hotspots, y_actual, y_predict, actual_neighbourhoods,expected_result)

In [None]:
# Case: prediction picks Neighbourhood 7 instead of Neighbourhood 5.
# The expected value is worked out by hand below as
# (incidents in actual hotspots - incidents in predicted hotspots) / incidents in actual hotspots.
actuals = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_actual = np.ndarray((10,),buffer = np.array(actuals))
predicts = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_predict = np.ndarray((10,),buffer = np.array(predicts))
actual_neighbourhoods = np.array(['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5',
                                  'Neighbourhood 6','Neighbourhood 7','Neighbourhood 8','Neighbourhood 9','Neighbourhood 10'], 
                                 dtype=object)
predicted_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 7']
actual_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
actual_hotspot_incidents = 10+9+8+7+6
predicted_hotspot_incidents = 10+9+8+7+4
missed = actual_hotspot_incidents - predicted_hotspot_incidents
expected_result = [missed/actual_hotspot_incidents]

test_get_missed_incidents(predicted_hotspots, actual_hotspots, y_actual, y_predict, actual_neighbourhoods,expected_result)

In [None]:
# Case: prediction picks Neighbourhoods 6-10 while the actual hotspots are Neighbourhoods 1-5.
# The expected value is worked out by hand below as
# (incidents in actual hotspots - incidents in predicted hotspots) / incidents in actual hotspots.
actuals = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_actual = np.ndarray((10,),buffer = np.array(actuals))
predicts = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_predict = np.ndarray((10,),buffer = np.array(predicts))
actual_neighbourhoods = np.array(['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5',
                                  'Neighbourhood 6','Neighbourhood 7','Neighbourhood 8','Neighbourhood 9','Neighbourhood 10'], 
                                 dtype=object)
predicted_hotspots = ['Neighbourhood 6','Neighbourhood 7','Neighbourhood 8','Neighbourhood 9','Neighbourhood 10']
actual_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
actual_hotspot_incidents = 10+9+8+7+6
predicted_hotspot_incidents = 1+2+3+4+5
missed = actual_hotspot_incidents - predicted_hotspot_incidents
expected_result = [missed/actual_hotspot_incidents]

test_get_missed_incidents(predicted_hotspots, actual_hotspots, y_actual, y_predict, actual_neighbourhoods,expected_result)

In [None]:
# Case: prediction picks Neighbourhood 10 instead of Neighbourhood 1.
# The expected value is worked out by hand below as
# (incidents in actual hotspots - incidents in predicted hotspots) / incidents in actual hotspots.
actuals = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_actual = np.ndarray((10,),buffer = np.array(actuals))
predicts = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_predict = np.ndarray((10,),buffer = np.array(predicts))
actual_neighbourhoods = np.array(['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5',
                                  'Neighbourhood 6','Neighbourhood 7','Neighbourhood 8','Neighbourhood 9','Neighbourhood 10'], 
                                 dtype=object)
predicted_hotspots = ['Neighbourhood 10','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
actual_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
actual_hotspot_incidents = 10+9+8+7+6
predicted_hotspot_incidents = 1+9+8+7+6
missed = actual_hotspot_incidents - predicted_hotspot_incidents
expected_result = [missed/actual_hotspot_incidents]

test_get_missed_incidents(predicted_hotspots, actual_hotspots, y_actual, y_predict, actual_neighbourhoods,expected_result)

In [None]:
# Case: the actual counts for Neighbourhoods 6-10 are zero and the prediction picks exactly
# those, so the hand-computed expected ratio below is 40/40.
actuals = [10.0,9.0,8.0,7.0,6.0,0.0,0.0,0.0,0.0,0.0]
y_actual = np.ndarray((10,),buffer = np.array(actuals))
predicts = [10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0]
y_predict = np.ndarray((10,),buffer = np.array(predicts))
actual_neighbourhoods = np.array(['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5',
                                  'Neighbourhood 6','Neighbourhood 7','Neighbourhood 8','Neighbourhood 9','Neighbourhood 10'], 
                                 dtype=object)
predicted_hotspots = ['Neighbourhood 6','Neighbourhood 7','Neighbourhood 8','Neighbourhood 9','Neighbourhood 10']
actual_hotspots = ['Neighbourhood 1','Neighbourhood 2','Neighbourhood 3','Neighbourhood 4','Neighbourhood 5']
actual_hotspot_incidents = 10+9+8+7+6
predicted_hotspot_incidents = 0+0+0+0+0
missed = actual_hotspot_incidents - predicted_hotspot_incidents
expected_result = [missed/actual_hotspot_incidents]

test_get_missed_incidents(predicted_hotspots, actual_hotspots, y_actual, y_predict, actual_neighbourhoods,expected_result)

<h3> Test score_algorithm </h3>

In [None]:
def test_score_algorithm(model_key, feature_key, num_hotspots, first_date, last_date, df, expected):
    result = score_algorithm(model_key, feature_key, num_hotspots, first_date, last_date, df)
    for key in expected:
        assert key in result, key + " key expected but not found."
        expected_key_sore = expected[key]
        actual_key_score = result[key]
        assert len(expected_key_sore) == len(actual_key_score), "Scores for " + key + " key are not of expected length."
        for i in range (0, len(expected_key_sore)):
            assert expected_key_sore[i] == actual_key_score[i], "Scores for " + key + " not as expected"
    print("All tests completed successfully.")

In [None]:
# Fixture for score_algorithm: loads "tuning_test_data" through open_file and uses the
# first and last values of its 'Date' column (parsed as day/month/year) as the date range.
# NOTE(review): this cell rebinds the notebook-level `df` and adds entries to the shared
# FEATURES, MODEL_FILE_TAGS and FEATURE_FILE_TAGS dicts; nothing here removes them again.
file_name = "tuning_test_data"
df = open_file(file_name)
first_date = datetime.strptime(df['Date'].iloc[0], '%d/%m/%Y')
last_date = datetime.strptime(df['Date'].iloc[len(df) - 1], '%d/%m/%Y')
model_key = "tuning_template_test_model"
feature_key = "arbitrary_name"
# Register the temporary feature set (aliasing the F-Regression feature list) and file tags.
FEATURES[feature_key] = FEATURES[F_REGRESSION_NAME]
MODEL_FILE_TAGS[model_key] = model_key
FEATURE_FILE_TAGS[feature_key] = feature_key
num_hotspots = 5
# Seven values per metric are expected: 1.0 for each score and 0.0 for the severity metric.
expected = {'Sensitivity': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 
            'Specificity': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 
            'Precision': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 
            'F1': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 
            'MCC': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 
            'Lowest Misclassification Severity': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
test_score_algorithm(model_key, feature_key, num_hotspots, first_date, last_date, df, expected)

<h3> Test calculate_averages </h3>

In [None]:
def test_calulate_averages(scores, expected_averages):
    actual_averages = calculate_averages(scores)
    for key in expected_averages:
        assert key in actual_averages, key + " key expected but not found."
        expected_key_sore = expected_averages[key]
        actual_key_score = actual_averages[key]
        assert expected_key_sore == actual_key_score, "Scores for " + key + " not as expected"
    print("All tests completed successfully.")

In [None]:
# Case: every metric holds seven identical values, so each expected average equals that value.
scores = {'Sensitivity': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 
            'Specificity': [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0], 
            'Precision': [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0], 
            'F1': [4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0], 
            'MCC': [5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0], 
            'Lowest Misclassification Severity': [6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0]}
expected_averages = {'Sensitivity': 1.0, 
                     'Specificity': 2.0, 
                     'Precision': 3.0, 
                     'F1': 4.0, 
                     'MCC': 5.0, 
                     'Lowest Misclassification Severity': 6.0}
test_calulate_averages(scores, expected_averages)

In [None]:
# Case: lists of differing lengths and magnitudes, mixing ints and floats.
# NOTE(review): test_calulate_averages compares with ==, so values such as 0.55 rely on
# calculate_averages producing exactly that float -- confirm against its implementation.
scores = {'Sensitivity': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], 
            'Specificity': [1, 2, 3], 
            'Precision': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0], 
            'F1': [100000000, 10000000, 1000000], 
            'MCC': [0.5,1.5], 
            'Lowest Misclassification Severity': [1,1.5,2,2.5,3,3.5]}
expected_averages = {'Sensitivity': 4.0, 
                     'Specificity': 2, 
                     'Precision': 0.55, 
                     'F1': 37000000, 
                     'MCC': 1.0, 
                     'Lowest Misclassification Severity': 2.25}
test_calulate_averages(scores, expected_averages)

In [None]:
# Case: edge inputs -- single-element lists, all zeros, negative values, and an empty list.
# The empty 'Lowest Misclassification Severity' list is expected to average to 0.
scores = {'Sensitivity': [0], 
            'Specificity': [1], 
            'Precision': [0,0,0,0], 
            'F1': [-1,0,1], 
            'MCC': [-1,0,4], 
            'Lowest Misclassification Severity': []}
expected_averages = {'Sensitivity': 0, 
                     'Specificity': 1, 
                     'Precision': 0, 
                     'F1': 0, 
                     'MCC': 1, 
                     'Lowest Misclassification Severity': 0}
test_calulate_averages(scores, expected_averages)

<h3> Test get_results </h3>

In [None]:
def test_get_results(num_hotspots, file_name, expected_scores, expected_names):
    df = open_file(file_name)
    first_date = datetime.strptime(df['Date'].iloc[0], '%d/%m/%Y')
    last_date = datetime.strptime(df['Date'].iloc[len(df) - 1], '%d/%m/%Y')
    all_scores, algorithm_names = get_results(num_hotspots, first_date, last_date, df)
    assert len(expected_names) == len(algorithm_names), "Algorithm names length not as expected"
    for i in range(0, len(expected_names)):
        assert expected_names[i] == algorithm_names[i], "Algorithm names not as expected"
    for key in all_scores:
        assert key in SCORE_KEYS, "All scores keys not as expected"
        if len(expected_scores[key]) > 0:
            assert len(all_scores[key]) == len(expected_scores[key]), "Score length not as expected"
            for i in range(0,len(expected_scores[key])):
                assert expected_scores[key][i] == all_scores[key][i], "Score for " + key + " not as expected"
    print("All tests completed successfully")    

In [None]:
# Case: get_results with exactly one model and one feature set.
# NOTE(review): this cell rebinds the notebook-level MODELS and FEATURES to single-entry
# dicts and adds entries to MODEL_FILE_TAGS / FEATURE_FILE_TAGS. MODELS and FEATURES are
# rebound again by a later cell; cells run between the two see the reduced values.
file_name = "tuning_test_data"
model_key = "tuning_template_test_model"
feature_key = "arbitrary_name"
MODEL_FILE_TAGS[model_key] = model_key
FEATURE_FILE_TAGS[feature_key] = feature_key
MODELS = {model_key : model_key}
FEATURES = {feature_key : ['Reports 1 day ago', 'Reports 2 days ago', 'Reports 3 days ago',
                       'Reports 4 days ago', 'Reports 5 days ago', 'Reports 6 days ago',
                      'Reports 7 days ago','Reports 14 days ago','Reports 30 days ago','Reports 365 days ago']}
num_hotspots = 5
expected_scores = {'Sensitivity': [1.0], 
                   'Specificity': [1.0], 
                   'Precision': [1.0], 
                   'F1': [1.0], 
                   'MCC': [1.0], 
                   'Lowest Misclassification Severity': [0.0]}
# The expected display name is the model key and the feature key joined by a newline.
expected_names = ['tuning_template_test_model\narbitrary_name']
test_get_results(num_hotspots, file_name, expected_scores, expected_names)

In [None]:
# Case: get_results across the full grid of 8 models x 5 feature sets (40 expected names).
# This cell rebinds the notebook-level FEATURES and MODELS to the full feature dict and model list.
FEATURES = {
    F_REGRESSION_NAME : ['Reports 1 day ago', 'Reports 2 days ago', 'Reports 3 days ago',
                       'Reports 4 days ago', 'Reports 5 days ago', 'Reports 6 days ago',
                      'Reports 7 days ago','Reports 14 days ago','Reports 30 days ago','Reports 365 days ago'],
    CHI2_NAME : ['South of Market', 'Mission', 'Tenderloin', 'Number of businesses', 
               'Downtown / Union Square', 'Civic Center', 'Reports 365 days ago',
               'Reports 1 day ago','Reports 2 days ago','Reports 14 days ago'],
    ADABOOST_NAME : ['Reports 365 days ago', 'Reports 1 day ago', 'Reports 14 days ago', 'Reports 3 days ago', 
               'Reports 2 days ago', 'Reports 7 days ago', 'Number of businesses',
               'Reports 4 days ago','Reports 5 days ago','Closures 365 days ago'],
    EQUAL_DATA_NAME : ['Number of businesses', 'Last 28 days closures', 'Last 7 days openings',
                          'Last 14 days closures', 'Last 7 days closures','Reports 1 day ago',
                      'Reports 2 days ago', 'Reports 4 days ago', 'Reports 30 days ago', 'Reports 7 days ago'],
    ALL_BUS_NAME : ['Number of businesses', 'Last 28 days closures', 'Last 7 days openings',
                          'Last 14 days closures', 'Last 7 days closures','Number of openings',
                   'Openings 4 days ago','Openings 1 day ago', 'Openings 7 days ago', 'Openings 2 days ago']
    }
MODELS = [ANN_NAME,
          DECISION_TREE_NAME,
          ELASTIC_NET_NAME,
          LASSO_NAME,
          LINERAR_REGRESSION_NAME,
          RANDOM_FOREST_NAME,
          RIDGE_REGRESSION_NAME,
          SVM_NAME]
file_name = "tuning_test_data"
num_hotspots = 5
# Empty lists make test_get_results skip the score-value comparison for every metric,
# so this case only validates the score keys and the algorithm display names.
expected_scores = {'Sensitivity': [], 
                   'Specificity': [], 
                   'Precision': [], 
                   'F1': [], 
                   'MCC': [], 
                   'Lowest Misclassification Severity': []}
# Names are listed model by model, each followed by its five feature-selection variants.
expected_names = ['Multi-Layer\nPerceptron\nF-Regression', 'Multi-Layer\nPerceptron\nChi-Squared', 
                  'Multi-Layer\nPerceptron\nAdaBoost', 'Multi-Layer\nPerceptron\nEqual Selection', 
                  'Multi-Layer\nPerceptron\nAll Business', 'Decision Tree\nF-Regression', 
                  'Decision Tree\nChi-Squared', 'Decision Tree\nAdaBoost', 'Decision Tree\nEqual Selection', 
                  'Decision Tree\nAll Business', 'Elastic Net\nF-Regression', 'Elastic Net\nChi-Squared', 
                  'Elastic Net\nAdaBoost', 'Elastic Net\nEqual Selection', 'Elastic Net\nAll Business', 
                  'Lasso\nF-Regression', 'Lasso\nChi-Squared', 'Lasso\nAdaBoost', 'Lasso\nEqual Selection', 
                  'Lasso\nAll Business', 'Linear \nRegression\nF-Regression', 'Linear \nRegression\nChi-Squared', 
                  'Linear \nRegression\nAdaBoost', 'Linear \nRegression\nEqual Selection', 
                  'Linear \nRegression\nAll Business', 'Random \nForest\nF-Regression', 'Random \nForest\nChi-Squared', 
                  'Random \nForest\nAdaBoost', 'Random \nForest\nEqual Selection', 'Random \nForest\nAll Business', 
                  'Ridge \nRegression\nF-Regression', 'Ridge \nRegression\nChi-Squared', 'Ridge \nRegression\nAdaBoost', 
                  'Ridge \nRegression\nEqual Selection', 'Ridge \nRegression\nAll Business', 'SVM\nF-Regression', 
                  'SVM\nChi-Squared', 'SVM\nAdaBoost', 'SVM\nEqual Selection', 'SVM\nAll Business']
test_get_results(num_hotspots, file_name, expected_scores, expected_names)

<h3> Test get_best_x_scores </h3>

In [None]:
def setup_lists():
    """Build the fixture lists shared by the bar-chart tests.

    Returns:
        A 5-tuple ``(algorithms, algorithm_display_names,
        feature_select_display_names, highest_is_best, score_metrics)``:
        algorithm keys, their display labels in the same order,
        feature-selection display labels, a dict mapping each score metric
        to whether a higher value is better, and the list of score metrics.
    """
    # Each row pairs an algorithm key with the label shown for it.
    algorithm_table = [
        ('multi_layer_perceptron', 'Multi\nLayer\nPerceptron'),
        ('decision_tree', 'Decision\nTree'),
        ('elastic_net', 'Elastic Net'),
        ('lasso', 'Lasso'),
        ('linear_regression', 'Linear\nRegression'),
        ('random_forest', 'Random\nForest'),
        ('ridge_regression', 'Ridge\nRegression'),
        ('svm', 'SVM'),
    ]
    algorithms = [key for key, _ in algorithm_table]
    algorithm_display_names = [label for _, label in algorithm_table]

    feature_select_display_names = [
        'F Regression',
        'Chi2',
        'Adaboost',
        'Equal crime\nand business',
        'All Business',
    ]

    score_metrics = ["r2", "mse", "mae"]
    # Only r2 improves as it grows; the two error metrics improve as they shrink.
    highest_is_best = dict(zip(score_metrics, (True, False, False)))

    return algorithms, algorithm_display_names, feature_select_display_names, highest_is_best, score_metrics

In [None]:
def test_best_x_scores(bar, expected_heights, x_label, y_label, expected_y, x_ticks, expected_ticks, expected_title, 
                       title, expected_x):
    
    assert title.get_text() == expected_title, "Title not as expected."
    rectangles = bar.get_children()
    for i in range(len(rectangles)):
        assert rectangles[i].get_height() == expected_heights[i],"Height of a rectangle not as expected."
    assert x_label.get_text() == "Algorithm", "x label not as expected."
    assert y_label.get_text() == expected_y, "y label not as expected."
    sub_headings = x_ticks[1]
    assert sub_headings, "Sub-headings not as expected"
    assert len(sub_headings) == expected_x and len(rectangles) == expected_x, "Number of columns not as expected."
    for i in range(0,len(sub_headings)):
        assert sub_headings[i].get_text() == expected_ticks[i], "Sub-headings not as expected"
    print("All bar chart tests completed successfully.")

In [None]:
# Case: only one Sensitivity score is supplied while x = 10 is requested.
# The expectations below describe a chart with a single column (expected_x = 1,
# title "Top 1 ...").
# best_x_scores is not defined in this section of the notebook.
expected_r2_rectangle_heights = [1]
algorithms, algorithm_display_names, feature_select_display_names, highest_is_best, score_metrics = setup_lists()
scores = {'Sensitivity': [1]}
metric = 'Sensitivity'
x = 10
num_hotspots = 5
expected_y = 'Sensitivity score'
expected_x = 1
expected_title = "Top 1 Sensitivity scores\nwhen predicting the top 5 Crime Hotspots"
expected_ticks = ['Multi\nLayer\nPerceptron\nusing\nF Regression\ndataset']
bar, ticks, title, x_label, y_label = best_x_scores(metric,x,algorithms, scores, num_hotspots)
test_best_x_scores(bar, expected_r2_rectangle_heights, x_label, y_label, expected_y, ticks, expected_ticks, expected_title, 
               title, expected_x)

In [None]:
# Case: two Sensitivity scores ([1, 2]) with x = 10 requested; every other metric is empty.
# The expected heights are in descending order ([2, 1]) and the expected chart has two columns.
expected_r2_rectangle_heights = [2,1]
algorithms, algorithm_display_names, feature_select_display_names, highest_is_best, score_metrics = setup_lists()
scores = {'Sensitivity': [1,2],
         'Specificity': [],
         'Precision':[],
         'F1':[],
         'MCC':[],
         'Lowest Misclassification Severity':[]}
metric = 'Sensitivity'
x = 10
num_hotspots = 5
expected_y = 'Sensitivity score'
expected_x = 2
expected_title = "Top 2 Sensitivity scores\nwhen predicting the top 5 Crime Hotspots"
expected_ticks = ['Multi\nLayer\nPerceptron\nusing\nChi2\ndataset',
                  'Multi\nLayer\nPerceptron\nusing\nF Regression\ndataset']
bar, ticks, title, x_label, y_label = best_x_scores(metric,x,algorithms, scores, num_hotspots)
test_best_x_scores(bar, expected_r2_rectangle_heights, x_label, y_label, expected_y, ticks, expected_ticks, expected_title, 
               title, expected_x)

In [None]:
# Case: the Specificity metric is selected ([2, 3, 1]) with x = 10 requested.
# The expected heights are those three scores in descending order, and the expected
# tick labels follow the same order (Chi2 = 3, F Regression = 2, Adaboost = 1).
expected_r2_rectangle_heights = [3,2,1]
algorithms, algorithm_display_names, feature_select_display_names, highest_is_best, score_metrics = setup_lists()
scores = {'Sensitivity': [1,2],
         'Specificity': [2,3,1],
         'Precision':[],
         'F1':[],
         'MCC':[],
         'Lowest Misclassification Severity':[]}
metric = 'Specificity'
x = 10
num_hotspots = 5
expected_y = 'Specificity score'
expected_x = 3
expected_title = "Top 3 Specificity scores\nwhen predicting the top 5 Crime Hotspots"
expected_ticks = ['Multi\nLayer\nPerceptron\nusing\nChi2\ndataset',
                  'Multi\nLayer\nPerceptron\nusing\nF Regression\ndataset',
                 'Multi\nLayer\nPerceptron\nusing\nAdaboost\ndataset']
bar, ticks, title, x_label, y_label = best_x_scores(metric,x,algorithms, scores, num_hotspots)
test_best_x_scores(bar, expected_r2_rectangle_heights, x_label, y_label, expected_y, ticks, expected_ticks, expected_title, 
               title, expected_x)

In [None]:
# Case: six Precision scores are supplied but only x = 5 are requested.
# The expected heights are the five largest scores in descending order; the sixth
# score (1) is absent. The sixth score position maps to the second algorithm's
# first dataset in the expected labels ('Decision\nTree ... F Regression').
expected_r2_rectangle_heights = [6,5,4,3,2]
algorithms, algorithm_display_names, feature_select_display_names, highest_is_best, score_metrics = setup_lists()
scores = {'Sensitivity': [1,2],
         'Specificity': [2,3,1],
         'Precision':[2,4,6,1,3,5],
         'F1':[],
         'MCC':[],
         'Lowest Misclassification Severity':[]}
metric = 'Precision'
x = 5
num_hotspots = 5
expected_y = 'Precision score'
expected_x = 5
expected_title = "Top 5 Precision scores\nwhen predicting the top 5 Crime Hotspots"
expected_ticks = ['Multi\nLayer\nPerceptron\nusing\nAdaboost\ndataset',
                  'Decision\nTree\nusing\nF Regression\ndataset',
                  'Multi\nLayer\nPerceptron\nusing\nChi2\ndataset',
                  'Multi\nLayer\nPerceptron\nusing\nAll Business\ndataset',
                  'Multi\nLayer\nPerceptron\nusing\nF Regression\ndataset',
                 ]
bar, ticks, title, x_label, y_label = best_x_scores(metric,x,algorithms, scores, num_hotspots)
test_best_x_scores(bar, expected_r2_rectangle_heights, x_label, y_label, expected_y, ticks, expected_ticks, expected_title, 
               title, expected_x)

In [None]:
# Case: fifteen F1 scores (1..15) are supplied and x = 6 are requested.
# The expected heights are the six largest scores in descending order; the expected
# labels are the five 'Elastic Net' datasets followed by 'Decision\nTree ... All Business'.
expected_r2_rectangle_heights = [15,14,13,12,11,10]
algorithms, algorithm_display_names, feature_select_display_names, highest_is_best, score_metrics = setup_lists()
scores = {'Sensitivity': [1,2],
         'Specificity': [2,3,1],
         'Precision':[2,4,6,1,3,5],
         'F1':[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
         'MCC':[],
         'Lowest Misclassification Severity':[]}
metric = 'F1'
x = 6
num_hotspots = 5
expected_y = 'F1 score'
expected_x = 6
expected_title = "Top 6 F1 scores\nwhen predicting the top 5 Crime Hotspots"
expected_ticks = ['Elastic Net\nusing\nAll Business\ndataset',
                  'Elastic Net\nusing\nEqual crime\nand business\ndataset',
                  'Elastic Net\nusing\nAdaboost\ndataset',
                  'Elastic Net\nusing\nChi2\ndataset',
                  'Elastic Net\nusing\nF Regression\ndataset',
                  'Decision\nTree\nusing\nAll Business\ndataset'
                 ]
bar, ticks, title, x_label, y_label = best_x_scores(metric,x,algorithms, scores, num_hotspots)
test_best_x_scores(bar, expected_r2_rectangle_heights, x_label, y_label, expected_y, ticks, expected_ticks, expected_title, 
               title, expected_x)

In [None]:
# Case: the MCC metric is selected ([2, 3, 1]) with x = 10 requested.
# The expected heights are the three MCC scores in descending order.
expected_r2_rectangle_heights = [3,2,1]
algorithms, algorithm_display_names, feature_select_display_names, highest_is_best, score_metrics = setup_lists()
scores = {'Sensitivity': [1,2],
         'Specificity': [1],
         'Precision':[1],
         'F1':[1],
         'MCC':[2,3,1],
         'Lowest Misclassification Severity':[]}
metric = 'MCC'
x = 10
num_hotspots = 5
expected_y = 'MCC score'
expected_x = 3
expected_title = "Top 3 MCC scores\nwhen predicting the top 5 Crime Hotspots"
expected_ticks = ['Multi\nLayer\nPerceptron\nusing\nChi2\ndataset',
                  'Multi\nLayer\nPerceptron\nusing\nF Regression\ndataset',
                 'Multi\nLayer\nPerceptron\nusing\nAdaboost\ndataset']
bar, ticks, title, x_label, y_label = best_x_scores(metric,x,algorithms, scores, num_hotspots)
test_best_x_scores(bar, expected_r2_rectangle_heights, x_label, y_label, expected_y, ticks, expected_ticks, expected_title, 
               title, expected_x)

In [None]:
# Case: the 'Lowest Misclassification Severity' metric is selected ([2, 1, 3]).
# Unlike the other cases, the expected heights here are in ascending order ([1, 2, 3]),
# i.e. the smallest score is expected first.
expected_r2_rectangle_heights = [1,2,3]
algorithms, algorithm_display_names, feature_select_display_names, highest_is_best, score_metrics = setup_lists()
scores = {'Sensitivity': [1,2],
         'Specificity': [1],
         'Precision':[1],
         'F1':[1],
         'MCC':[1],
         'Lowest Misclassification Severity':[2,1,3]}
metric = 'Lowest Misclassification Severity'
x = 10
num_hotspots = 5
expected_y = 'Lowest Misclassification Severity score'
expected_x = 3
expected_title = "Top 3 Lowest Misclassification Severity scores\nwhen predicting the top 5 Crime Hotspots"
expected_ticks = ['Multi\nLayer\nPerceptron\nusing\nChi2\ndataset',
                  'Multi\nLayer\nPerceptron\nusing\nF Regression\ndataset',
                 'Multi\nLayer\nPerceptron\nusing\nAdaboost\ndataset']
bar, ticks, title, x_label, y_label = best_x_scores(metric,x,algorithms, scores, num_hotspots)
test_best_x_scores(bar, expected_r2_rectangle_heights, x_label, y_label, expected_y, ticks, expected_ticks, expected_title, 
               title, expected_x)

<h3> Test get_labels </h3>

In [None]:
def test_get_labels():
    expected_labels = ['Multi\nLayer\nPerceptron\nusing\nF Regression\ndataset', 
                       'Multi\nLayer\nPerceptron\nusing\nChi2\ndataset', 
                       'Multi\nLayer\nPerceptron\nusing\nAdaboost\ndataset', 
                       'Multi\nLayer\nPerceptron\nusing\nEqual crime\nand business\ndataset',
                       'Multi\nLayer\nPerceptron\nusing\nAll Business\ndataset', 
                       'Decision\nTree\nusing\nF Regression\ndataset', 
                       'Decision\nTree\nusing\nChi2\ndataset', 
                       'Decision\nTree\nusing\nAdaboost\ndataset', 
                       'Decision\nTree\nusing\nEqual crime\nand business\ndataset',
                       'Decision\nTree\nusing\nAll Business\ndataset', 
                       'Elastic Net\nusing\nF Regression\ndataset', 
                       'Elastic Net\nusing\nChi2\ndataset', 
                       'Elastic Net\nusing\nAdaboost\ndataset', 
                       'Elastic Net\nusing\nEqual crime\nand business\ndataset', 
                       'Elastic Net\nusing\nAll Business\ndataset', 
                       'Lasso\nusing\nF Regression\ndataset', 
                       'Lasso\nusing\nChi2\ndataset', 
                       'Lasso\nusing\nAdaboost\ndataset', 
                       'Lasso\nusing\nEqual crime\nand business\ndataset', 
                       'Lasso\nusing\nAll Business\ndataset', 
                       'Linear\nRegression\nusing\nF Regression\ndataset', 
                       'Linear\nRegression\nusing\nChi2\ndataset', 
                       'Linear\nRegression\nusing\nAdaboost\ndataset', 
                       'Linear\nRegression\nusing\nEqual crime\nand business\ndataset', 
                       'Linear\nRegression\nusing\nAll Business\ndataset', 
                       'Random\nForest\nusing\nF Regression\ndataset', 
                       'Random\nForest\nusing\nChi2\ndataset', 
                       'Random\nForest\nusing\nAdaboost\ndataset', 
                       'Random\nForest\nusing\nEqual crime\nand business\ndataset', 
                       'Random\nForest\nusing\nAll Business\ndataset', 
                       'Ridge\nRegression\nusing\nF Regression\ndataset', 
                       'Ridge\nRegression\nusing\nChi2\ndataset', 
                       'Ridge\nRegression\nusing\nAdaboost\ndataset', 
                       'Ridge\nRegression\nusing\nEqual crime\nand business\ndataset', 
                       'Ridge\nRegression\nusing\nAll Business\ndataset', 
                       'SVM\nusing\nF Regression\ndataset', 
                       'SVM\nusing\nChi2\ndataset', 
                       'SVM\nusing\nAdaboost\ndataset', 
                       'SVM\nusing\nEqual crime\nand business\ndataset', 
                       'SVM\nusing\nAll Business\ndataset']
    actual_labels = get_labels()
    assert expected_labels == actual_labels, "Labels not as expected."
    print("All tests completed successfully")

In [None]:
# Reload the "Selection Methods" file into sel_methods, then run the label test.
# NOTE(review): the handle bound to `file` is never closed in this cell; whether it
# can be closed right after np.load depends on the file format — confirm before
# wrapping this in a `with` block.
# NOTE(review): presumably get_labels() reads sel_methods — confirm against its definition.
file = open("Selection Methods","rb")
sel_methods = np.load(file)
test_get_labels()

<h3> Test validate_x </h3>

In [None]:
def test_validate_x(num_scores, x, expected_x):
    x = validate_x(num_scores, x)
    assert x == expected_x, "x not as expected; x is " + str(x) + ", but expected " + str(expected_x) + ". i: " + str(i)

In [None]:
# Grid of validate_x cases held in three parallel lists of 49 entries each:
# every num_scores value in (-1, 0, 1, 2, 14, 15, 16) is paired with every x value
# in the same set. Row r of each list belongs to the r-th num_scores value.
num_scores = [-1,-1,-1,-1,-1,-1,-1, 
              0,0,0,0,0,0,0,        
              1,1,1,1,1,1,1,        
              2,2,2,2,2,2,2,        
              14,14,14,14,14,14,14, 
              15,15,15,15,15,15,15, 
              16,16,16,16,16,16,16] 
# Requested x values: the same seven values repeated for each num_scores row.
x = [-1,0,1,2,14,15,16,  
     -1,0,1,2,14,15,16,  
     -1,0,1,2,14,15,16,  
     -1,0,1,2,14,15,16,  
     -1,0,1,2,14,15,16,  
     -1,0,1,2,14,15,16,  
     -1,0,1,2,14,15,16,]
# Expected validate_x result for the (num_scores, x) pair at the same position.
expected = [0,0,0,0,0,0,0,
           0,0,0,0,0,0,0,
           1,1,1,1,1,1,1,
           2,2,1,2,2,2,2,
           14,14,1,2,14,14,14,
           15,15,1,2,14,15,15,
           15,15,1,2,14,15,16]
# Run every case; the first failing case stops the loop with an AssertionError.
for i in range(0, len(num_scores)):
    test_validate_x(num_scores[i],x[i],expected[i])
print("All tests completed successfully.")

<h3> Test evaluate_models </h3>

In [None]:
def test_evaluate_models(num_hotspots_range, top_x, file_name, expected_cols_in_bars, expected_scores,expected_num_bars,
                        expected_x_ticks, expected_titles,expected_x_label,expected_y_labels):
    df = open_file(file_name)
    bars = evaluate_models(num_hotspots_range, top_x, df)
    assert len(bars) == expected_num_bars, "Different number of bars than expected."
    for i in range(0,len(bars)):
        rectangles = bars[i][0].get_children()
        assert len(rectangles) == expected_cols_in_bars, "Number of columns not as expected."
        if len(expected_scores) > 0:
            for j in range(0,len(rectangles)):
                rectangle = rectangles[j]
                assert rectangle.get_height() == expected_scores[i],"Height of a rectangle not as expected."
        x_tick_text = bars[i][1][0][0].get_children()[3].get_text()
        assert x_tick_text == expected_x_ticks[i], "X ticks not as expected"
        title = bars[i][2].get_text()
        assert title == expected_titles[i], "Titles not as expected"
        x_label = bars[i][3].get_text()
        assert x_label == expected_x_label, "x label not as expected"
        y_label = bars[i][4].get_text()
        assert y_label == expected_y_labels[i], "y label not as expected"
    print("All tests completed successfully.")

In [None]:
# Case: evaluate_models with a single registered test model and a single feature set.
# The model and feature keys are added to MODEL_FILE_TAGS / FEATURE_FILE_TAGS, and
# MODELS and FEATURES are rebound to single-entry dicts; those rebindings stay in
# effect for later cells until they are assigned again.
file_name = "tuning_test_data"
model_key = "tuning_template_test_model"
feature_key = "arbitrary_name"
MODEL_FILE_TAGS[model_key] = model_key
FEATURE_FILE_TAGS[feature_key] = feature_key
MODELS = {model_key : model_key}
FEATURES = {feature_key : ['Reports 1 day ago', 'Reports 2 days ago', 'Reports 3 days ago',
                       'Reports 4 days ago', 'Reports 5 days ago', 'Reports 6 days ago',
                      'Reports 7 days ago','Reports 14 days ago','Reports 30 days ago','Reports 365 days ago']}
# num_hotspots is assigned here but is not passed to test_evaluate_models below.
num_hotspots = 5
num_hotspots_range = [5]
top_x = 5
# One chart per metric (six in total), each expected to contain a single column.
expected_cols_in_bars = 1
expected_num_bars = 6
# One expected column height per chart, in the same order as expected_titles.
expected_scores = [1,1,1,1,1,0]
expected_x_ticks = ['Multi\nLayer\nPerceptron\nusing\nF Regression\ndataset',
                   'Multi\nLayer\nPerceptron\nusing\nF Regression\ndataset',
                   'Multi\nLayer\nPerceptron\nusing\nF Regression\ndataset',
                   'Multi\nLayer\nPerceptron\nusing\nF Regression\ndataset',
                   'Multi\nLayer\nPerceptron\nusing\nF Regression\ndataset',
                   'Multi\nLayer\nPerceptron\nusing\nF Regression\ndataset']
expected_titles = ['Top 1 Sensitivity scores\nwhen predicting the top 5 Crime Hotspots',
                  'Top 1 Specificity scores\nwhen predicting the top 5 Crime Hotspots',
                  'Top 1 Precision scores\nwhen predicting the top 5 Crime Hotspots',
                  'Top 1 F1 scores\nwhen predicting the top 5 Crime Hotspots',
                  'Top 1 MCC scores\nwhen predicting the top 5 Crime Hotspots',
                  'Top 1 Lowest Misclassification Severity scores\nwhen predicting the top 5 Crime Hotspots']
expected_x_label = "Algorithm"
expected_y_labels = ['Sensitivity score', 'Specificity score', 'Precision score', 'F1 score',
                    'MCC score', 'Lowest Misclassification Severity score']
test_evaluate_models(num_hotspots_range, top_x, file_name, expected_cols_in_bars, expected_scores,expected_num_bars,
                    expected_x_ticks,expected_titles,expected_x_label, expected_y_labels)

In [None]:
# Rebind MODELS to the eight algorithm names and FEATURES to the five feature-selection
# sets, replacing the single-entry test values assigned by the preceding test cell.
# NOTE(review): MODELS is a list here but a dict in the test cells that rebind it —
# confirm which shape the evaluation functions expect.
MODELS = [ANN_NAME,
          DECISION_TREE_NAME,
          ELASTIC_NET_NAME,
          LASSO_NAME,
          LINERAR_REGRESSION_NAME,
          RANDOM_FOREST_NAME,
          RIDGE_REGRESSION_NAME,
          SVM_NAME]
# Ten feature column names per feature-selection method.
FEATURES = {
    F_REGRESSION_NAME : ['Reports 1 day ago', 'Reports 2 days ago', 'Reports 3 days ago',
                       'Reports 4 days ago', 'Reports 5 days ago', 'Reports 6 days ago',
                      'Reports 7 days ago','Reports 14 days ago','Reports 30 days ago','Reports 365 days ago'],
    CHI2_NAME : ['South of Market', 'Mission', 'Tenderloin', 'Number of businesses', 
               'Downtown / Union Square', 'Civic Center', 'Reports 365 days ago',
               'Reports 1 day ago','Reports 2 days ago','Reports 14 days ago'],
    ADABOOST_NAME : ['Reports 365 days ago', 'Reports 1 day ago', 'Reports 14 days ago', 'Reports 3 days ago', 
               'Reports 2 days ago', 'Reports 7 days ago', 'Number of businesses',
               'Reports 4 days ago','Reports 5 days ago','Closures 365 days ago'],
    EQUAL_DATA_NAME : ['Number of businesses', 'Last 28 days closures', 'Last 7 days openings',
                          'Last 14 days closures', 'Last 7 days closures','Reports 1 day ago',
                      'Reports 2 days ago', 'Reports 4 days ago', 'Reports 30 days ago', 'Reports 7 days ago'],
    ALL_BUS_NAME : ['Number of businesses', 'Last 28 days closures', 'Last 7 days openings',
                          'Last 14 days closures', 'Last 7 days closures','Number of openings',
                   'Openings 4 days ago','Openings 1 day ago', 'Openings 7 days ago', 'Openings 2 days ago']
    }

In [None]:
# Case: evaluate_models for the top 10 hotspots, expecting five columns per chart.
# file_name is not assigned in this cell; it must already be bound by an earlier cell.
# num_hotspots is assigned here but is not passed to test_evaluate_models below.
num_hotspots = 5
num_hotspots_range = [10]
top_x = 5
expected_cols_in_bars = 5
expected_num_bars = 6
# One expected column height per chart; every column of a chart is compared to that value.
expected_scores = [1,1,1,1,1,0]
expected_x_ticks = ['Multi\nLayer\nPerceptron\nusing\nAdaboost\ndataset',
                    'Multi\nLayer\nPerceptron\nusing\nAdaboost\ndataset',
                    'Multi\nLayer\nPerceptron\nusing\nAdaboost\ndataset',
                    'Multi\nLayer\nPerceptron\nusing\nAdaboost\ndataset',
                    'Multi\nLayer\nPerceptron\nusing\nAdaboost\ndataset',
                    'Multi\nLayer\nPerceptron\nusing\nF Regression\ndataset']
expected_titles = ['Top 5 Sensitivity scores\nwhen predicting the top 10 Crime Hotspots',
                  'Top 5 Specificity scores\nwhen predicting the top 10 Crime Hotspots',
                  'Top 5 Precision scores\nwhen predicting the top 10 Crime Hotspots',
                  'Top 5 F1 scores\nwhen predicting the top 10 Crime Hotspots',
                  'Top 5 MCC scores\nwhen predicting the top 10 Crime Hotspots',
                  'Top 5 Lowest Misclassification Severity scores\nwhen predicting the top 10 Crime Hotspots']
expected_x_label = "Algorithm"
expected_y_labels = ['Sensitivity score', 'Specificity score', 'Precision score', 'F1 score',
                    'MCC score', 'Lowest Misclassification Severity score']
test_evaluate_models(num_hotspots_range, top_x, file_name, expected_cols_in_bars, expected_scores,expected_num_bars,
                    expected_x_ticks,expected_titles,expected_x_label, expected_y_labels)

<h3> Test evaluate models in depth </h3>

In [None]:
def test_evaluate_models_in_depth(num_hotspots_range, top_x, file_name, expected_scores, expected_names):
    df = open_file(file_name)
    scores, names = evaluate_models_in_depth(num_hotspots_range, top_x, df)
    assert scores == expected_scores, "Scores not as expected."
    for key in scores:
        assert key in expected_scores, "Scores not as expected."
        assert scores[key] == expected_scores[key], "Scores not as expected."
    assert names == expected_names, "Names not as expected."
    for key in names:
        assert key in expected_names, "Names not as expected."
        assert names[key] == expected_names[key], "Names not as expected."
    print("All tests completed successfully.")

In [None]:
# Case: evaluate_models_in_depth with a single registered test model and feature set.
# As in the evaluate_models test, MODELS and FEATURES are rebound to single-entry
# dicts here and stay that way for any cell that runs afterwards.
file_name = "tuning_test_data"
model_key = "tuning_template_test_model"
feature_key = "arbitrary_name"
MODEL_FILE_TAGS[model_key] = model_key
FEATURE_FILE_TAGS[feature_key] = feature_key
MODELS = {model_key : model_key}
FEATURES = {feature_key : ['Reports 1 day ago', 'Reports 2 days ago', 'Reports 3 days ago',
                       'Reports 4 days ago', 'Reports 5 days ago', 'Reports 6 days ago',
                      'Reports 7 days ago','Reports 14 days ago','Reports 30 days ago','Reports 365 days ago']}
# num_hotspots is assigned here but is not passed to test_evaluate_models_in_depth below.
num_hotspots = 5
num_hotspots_range = [5]
top_x = 5
# Both expected results are keyed by the hotspot count as a string ('5'), then by metric name.
expected_scores = {'5': {'Sensitivity': [1.0], 
                         'Specificity': [1.0], 
                         'Precision': [1.0], 
                         'F1': [1.0], 
                         'MCC': [1.0], 
                         'Lowest Misclassification Severity': [0.0]}}
expected_names = {'5': {'Sensitivity': ['Multi Layer Perceptron using F Regression dataset'], 
                        'Specificity': ['Multi Layer Perceptron using F Regression dataset'], 
                        'Precision': ['Multi Layer Perceptron using F Regression dataset'], 
                        'F1': ['Multi Layer Perceptron using F Regression dataset'], 
                        'MCC': ['Multi Layer Perceptron using F Regression dataset'], 
                        'Lowest Misclassification Severity': ['Multi Layer Perceptron using F Regression dataset']}}
test_evaluate_models_in_depth(num_hotspots_range, top_x, file_name, expected_scores, expected_names)

<h3> Test best x scores no graph </h3>

In [None]:
def test_best_x_scores_no_graph(score_metric, x, algorithms, results, num_hotspots, expected_scores, expected_names):
    scores, names = best_x_scores_no_graph(score_metric, x, algorithms, results, num_hotspots)
    assert scores == expected_scores, "Scores not as expected."
    for i in range(0,len(scores)):
        assert scores[i] == expected_scores[i], "Scores not as expected."
    assert names == expected_names, "Names not as expected."
    for i in range(len(names)):
        assert names[i] == expected_names[i], "Names not as expected."
    print("All tests completed successfully.")

In [None]:
# Case: seven 'Lowest Misclassification Severity' scores with x = 5 requested.
# The expected scores are the five smallest values in ascending order, and the
# expected names are given in that same order.
algorithms, algorithm_display_names, feature_select_display_names, highest_is_best, score_metrics = setup_lists()
scores = {'Sensitivity': [1,2],
         'Specificity': [1],
         'Precision':[1],
         'F1':[1],
         'MCC':[1],
         'Lowest Misclassification Severity':[2,1,3,7,8,5,4]}
metric = 'Lowest Misclassification Severity'
x = 5
num_hotspots = 5
expected_best_scores = [1, 2, 3, 4, 5]
expected_names = ['Multi Layer Perceptron using Chi2 dataset', 
                  'Multi Layer Perceptron using F Regression dataset', 
                  'Multi Layer Perceptron using Adaboost dataset', 
                  'Decision Tree using Chi2 dataset', 
                  'Decision Tree using F Regression dataset']
test_best_x_scores_no_graph(metric, x, algorithms, scores, num_hotspots, expected_best_scores, expected_names)

<h3> Test get full analysis </h3>

In [None]:
def test_get_full_analysis(results, names, num_hotspots_range, expected_num_hotspots, expected_best, expected_r1,
                           expected_r2, expected_r3, expected_r4, expected_r5):
    table = get_full_analysis(results, names, num_hotspots_range)
    columns = table.columns
    for col in columns:
        assert col in expected_columns, "Table columns not as expected."
    for col in expected_columns:
        assert col in columns, "Table columns not as expected."
    num_hotspots = table['Number of Hotspots Predicted'].tolist()
    for i in range (0,len(num_hotspots)):
        assert num_hotspots[i] == expected_num_hotspots[i], "Number of hotspots column not as expected."
    best = table['Best Score'].tolist()
    for i in range (0,len(best)):
        assert best[i] == expected_best[i], "Best score column not as expected."
    r1 = table['Rank 1'].tolist()
    for i in range (0,len(r1)):
        assert r1[i] == expected_r1[i], "Rank 1 column not as expected."
    r2 = table['Rank 2'].tolist()
    for i in range (0,len(r2)):
        assert r2[i] == expected_r2[i], "Rank 2 column not as expected."
    r3 = table['Rank 3'].tolist()
    for i in range (0,len(r3)):
        assert r3[i] == expected_r3[i], "Rank 3 column  not as expected."
    r4 = table['Rank 4'].tolist()
    for i in range (0,len(r4)):
        assert r4[i] == expected_r4[i], "Rank 4 column  not as expected."
    r5 = table['Rank 5'].tolist()
    for i in range (0,len(r5)):
        assert r5[i] == expected_r5[i], "Rank 5 column  not as expected."
    print("All tests completed successfully")  

In [None]:
# Fixture for get_full_analysis: scores and names keyed by hotspot count (as a string),
# then by metric name. Entries exist for '1', '2' and '3', but num_hotspots_range below
# only covers 1 and 2, and the expected columns only have values for those two.
results = {'1': {'Sensitivity': [10,9,8,7,6,5,4,3,2,1],
                 'Specificity': [10,9,8,7,6,5,4,3,2,1],
                 'Precision': [10,9,8,7,6,5,4,3,2,1],
                 'F1': [10,9,8,7,6,5,4,3,2,1],
                 'MCC': [10,9,8,7,6,5,4,3,2,1],
                 'Lowest Misclassification Severity': [1,2,3,4,5,6,7,8,9,10]},
           '2': {'Sensitivity': [110,19,18,17,16,15,14,13,12,11],
                 'Specificity': [110,19,18,17,16,15,14,13,12,11],
                 'Precision': [110,19,18,17,16,15,14,13,12,11],
                 'F1': [110,19,18,17,16,15,14,13,12,11],
                 'MCC': [110,19,18,17,16,15,14,13,12,11],
                 'Lowest Misclassification Severity': [11,12,13,14,15,16,17,18,19,110]},
          '3': {'Sensitivity': [210,29,28,27,26,25,24,23,22,21],
                 'Specificity': [210,29,28,27,26,25,24,23,22,21],
                 'Precision': [210,29,28,27,26,25,24,23,22,21],
                 'F1': [210,29,28,27,26,25,24,23,22,21],
                 'MCC': [210,29,28,27,26,25,24,23,22,21],
                 'Lowest Misclassification Severity': [21,22,23,24,25,26,27,28,29,210]}}
# The 'Lowest Misclassification Severity' names run A10..A1 for '1' and '3' but A1..A10
# for '2'; the expected Rank columns below (e.g. ['A10','A1']) follow that difference.
names = {'1': {'Sensitivity': ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10"],
                 'Specificity': ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10"],
                 'Precision': ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10"],
                 'F1': ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10"],
                 'MCC': ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10"],
                 'Lowest Misclassification Severity': ["A10","A9","A8","A7","A6","A5","A4","A3","A2","A1"]},
         '2': {'Sensitivity': ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10"],
                 'Specificity': ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10"],
                 'Precision': ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10"],
                 'F1': ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10"],
                 'MCC': ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10"],
                 'Lowest Misclassification Severity': ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10"]},
         '3': {'Sensitivity': ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10"],
                 'Specificity': ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10"],
                 'Precision': ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10"],
                 'F1': ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10"],
                 'MCC': ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10"],
                 'Lowest Misclassification Severity': ["A10","A9","A8","A7","A6","A5","A4","A3","A2","A1"]}}
num_hotspots_range = [1,2]
# expected_columns is defined at module level and is not passed in the call below.
expected_columns = ['Number of Hotspots Predicted','Best Score','Rank 1','Rank 2','Rank 3','Rank 4','Rank 5']
expected_num_hotspots = [1,2]
# 1 and 11 are the smallest 'Lowest Misclassification Severity' values for '1' and '2'.
# NOTE(review): presumably the table is built from that metric only — confirm against get_full_analysis.
expected_best = [1,11]
expected_r1 = ['A10','A1']
expected_r2 = ['A9','A2']
expected_r3 = ['A8','A3']
expected_r4 = ['A7','A4']
expected_r5 = ['A6','A5']
test_get_full_analysis(results, names, num_hotspots_range, expected_num_hotspots, expected_best, expected_r1,
                       expected_r2, expected_r3, expected_r4, expected_r5)

<h3> Test get score by rank </h3>

In [None]:
def test_get_score_by_rank(scores, ranks, expected_scores):
    for i in range(0, len(ranks)):
        score = get_score_by_rank(scores, ranks[i])
        assert score == expected_scores[i], "Scores not as expected. i:" + str(i) + "Score:" + str(score) + "Exp Score:" + str(expected_scores[i])
    print("All tests completed successfully")  

In [None]:
# Case: a score list containing repeated values, whose distinct values are 1..10.
# Ranks 1..10 are expected to return 1..10, and rank 11 (one past the number of
# distinct values) is expected to return 0.
scores = [1,1,1,2,3,3,4,4,4,4,4,5,5,5,5,5,6,7,8,9,10]
ranks = [1,2,3,4,5,6,7,8,9,10,11]
expected_scores = [1,2,3,4,5,6,7,8,9,10,0]
test_get_score_by_rank(scores, ranks, expected_scores)

<h3> Test Get Average Score Per Num Hotspots </h3>

In [None]:
def test_get_average_score_per_num_hotspots(algorithms, results, names, num_hotspots_range,expected_averages):
    averages = get_average_score_per_num_hotspots(algorithms, results, names, num_hotspots_range)
    assert averages == expected_averages, "Averages not as expected"
    for i in range (0, len(expected_averages)):
        assert averages[i] == expected_averages[i], "Averages not as expected"
    print("All tests completed successfully.")

In [None]:
# Fixture for get_average_score_per_num_hotspots: ten algorithms A1..A10 with scores
# for hotspot counts '1', '2' and '3'; only 1 and 2 are in num_hotspots_range.
# In the name lists the algorithms appear in reverse order (A10 first), so A1 is paired
# with 10 and 20 (average 15.0) and A10 with 1 and 11 (average 6.0), which matches
# expected_averages listed in A1..A10 order.
algorithms = ['A1','A2','A3','A4','A5','A6','A7','A8','A9','A10']
results = {'1': {'Lowest Misclassification Severity': [1,2,3,4,5,6,7,8,9,10]},
           '2': {'Lowest Misclassification Severity': [11,12,13,14,15,16,17,18,19,20]},
          '3': {'Lowest Misclassification Severity': [21,22,23,24,25,26,27,28,29,210]}}
names = {'1': {'Lowest Misclassification Severity': ["A10","A9","A8","A7","A6","A5","A4","A3","A2","A1"]},
         '2': {'Lowest Misclassification Severity': ["A10","A9","A8","A7","A6","A5","A4","A3","A2","A1"]},
         '3': {'Lowest Misclassification Severity': ["A10","A9","A8","A7","A6","A5","A4","A3","A2","A1"]}}
num_hotspots_range = [1, 2]
expected_averages = [15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0]
test_get_average_score_per_num_hotspots(algorithms, results, names, num_hotspots_range,expected_averages)

<h3> Test get best counts </h3>

In [None]:
def test_get_best_counts(results, names, num_hotspots_range, rank, expected_algorithms, expected_averages, expected_times_ranked,
                    expected_hotspots, expected_columns):
    """Check the table returned by get_best_counts against expected values.

    Calls ``get_best_counts(results, names, num_hotspots_range, rank)`` and
    verifies the column names and the contents of the 'Algorithm',
    'Average Score', 'Times Ranked 1' and 'Hotspot Values' columns.

    Parameters
    ----------
    results, names, num_hotspots_range, rank : passed unchanged to
        get_best_counts.
    expected_algorithms, expected_averages, expected_times_ranked,
    expected_hotspots : expected column contents, one entry per table row,
        in row order.
    expected_columns : expected column names (order is not checked).

    Raises
    ------
    AssertionError
        If the column names differ, if a column has a different number of
        rows than expected, or if any cell differs from its expected value.
    """
    # The result must expose `.columns` and `table[col].tolist()`
    # (presumably a pandas DataFrame - verify against get_best_counts).
    table = get_best_counts(results, names, num_hotspots_range, rank)

    # Column names must match in both directions; order is ignored.
    columns = table.columns
    for col in columns:
        assert col in expected_columns, "Table columns not as expected."
    for col in expected_columns:
        assert col in columns, "Table columns not as expected."

    def check_column(column, expected, message, approx=False):
        # Compare one table column with its expected values, row by row.
        actual = table[column].tolist()
        # Without this check an empty or short table would pass vacuously,
        # and a long one would raise IndexError instead of failing cleanly.
        assert len(actual) == len(expected), (
            message + " Got " + str(len(actual)) + " rows, expected "
            + str(len(expected)) + "."
        )
        for i, (got, want) in enumerate(zip(actual, expected)):
            if approx:
                # Float column: tolerate harmless rounding differences.
                matches = math.isclose(got, want, rel_tol=1e-9, abs_tol=1e-9)
            else:
                matches = got == want
            assert matches, (
                message + " Row " + str(i) + ": got " + repr(got)
                + ", expected " + repr(want) + "."
            )

    check_column('Algorithm', expected_algorithms, "Algorithm column not as expected.")
    check_column('Average Score', expected_averages, "Average Score column not as expected.", approx=True)
    # NOTE(review): the column name is hard-coded for rank 1 even though `rank`
    # is a parameter - confirm how get_best_counts names it for other ranks.
    check_column('Times Ranked 1', expected_times_ranked, "Times Ranked column not as expected.")
    check_column('Hotspot Values', expected_hotspots, "Hotspots column not as expected.")
    print("All tests passed successfully")

In [None]:
# Fixture for test_get_best_counts.
# Hotspot count k ('1'..'10') carries the ten consecutive scores k .. k+9.
results = {
    str(start): {'Lowest Misclassification Severity': list(range(start, start + 10))}
    for start in range(1, 11)
}
# Algorithm names per hotspot count. The first name of each list is
# A10 for 1, 2, 3, 10; A1 for 4, 6, 9; A5 for 5, 8; A6 for 7.
names = {
    '1': {'Lowest Misclassification Severity': [
        "A10", "A9", "A8", "A7", "A6", "A5", "A4", "A3", "A2", "A1"]},
    '2': {'Lowest Misclassification Severity': [
        "A10", "A9", "A8", "A7", "A6", "A5", "A4", "A3", "A2", "A1"]},
    '3': {'Lowest Misclassification Severity': [
        "A10", "A9", "A8", "A7", "A6", "A5", "A4", "A3", "A2", "A1"]},
    '4': {'Lowest Misclassification Severity': [
        "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10"]},
    '5': {'Lowest Misclassification Severity': [
        "A5", "A4", "A3", "A2", "A1", "A7", "A6", "A8", "A9", "A10"]},
    '6': {'Lowest Misclassification Severity': [
        "A1", "A10", "A9", "A8", "A7", "A6", "A5", "A4", "A3", "A2"]},
    '7': {'Lowest Misclassification Severity': [
        "A6", "A1", "A2", "A3", "A4", "A5", "A10", "A7", "A8", "A9"]},
    '8': {'Lowest Misclassification Severity': [
        "A5", "A1", "A2", "A3", "A4", "A10", "A6", "A7", "A8", "A9"]},
    '9': {'Lowest Misclassification Severity': [
        "A1", "A5", "A4", "A3", "A2", "A6", "A7", "A9", "A8", "A10"]},
    '10': {'Lowest Misclassification Severity': [
        "A10", "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9"]},
}
num_hotspots_range = list(range(1, 11))
rank = 1

# Expected table, one entry per row.
expected_algorithms = ['A10', 'A1', 'A5', 'A6']
expected_averages = [9.4, 8.9, 9.1, 10.0]
expected_times_ranked = [4, 3, 2, 1]
expected_hotspots = ["1 2 3 10 ", "4 6 9 ", "5 8 ", "7 "]
expected_columns = ["Algorithm", "Average Score", "Times Ranked 1", "Hotspot Values"]

test_get_best_counts(
    results=results,
    names=names,
    num_hotspots_range=num_hotspots_range,
    rank=rank,
    expected_algorithms=expected_algorithms,
    expected_averages=expected_averages,
    expected_times_ranked=expected_times_ranked,
    expected_hotspots=expected_hotspots,
    expected_columns=expected_columns,
)