In [170]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [171]:
# load the preprocessed crime dataset
def open_sfpd():
    """Read 'SFPD_preprocessed.csv' from the working directory.

    Returns:
        pandas.DataFrame: the parsed contents of the CSV file.
    """
    return pd.read_csv('SFPD_preprocessed.csv')

In [172]:
# load the preprocessed business dataset
def open_businesses():
    """Read 'businesses_preprocessed.csv' from the working directory.

    Returns:
        pandas.DataFrame: the parsed contents of the CSV file.
    """
    return pd.read_csv('businesses_preprocessed.csv')

In [173]:
# load the business openings dataset
def open_business_openings():
    """Read 'businesses_openings.csv' from the working directory.

    Returns:
        pandas.DataFrame: the parsed contents of the CSV file.
    """
    return pd.read_csv('businesses_openings.csv')

In [174]:
# load the business closures dataset
def open_business_closures():
    """Read 'businesses_closures.csv' from the working directory.

    Returns:
        pandas.DataFrame: the parsed contents of the CSV file.
    """
    return pd.read_csv('businesses_closures.csv')

In [175]:
# merge the four datasets into one, renaming columns where necessary
def finalise_data(sfpd_data,businesses,openings,closures):
    """Combine the crime, business, opening and closure frames into one frame.

    The crime lag/window columns of ``sfpd_data`` ("1 day ago", "Last 7 days",
    ...) are renamed so they carry a "Crime"/"crime" label, then the business,
    closure and opening columns are copied in, and finally the
    'Incident Date' and 'Incident Year' columns are dropped.

    Args:
        sfpd_data: crime DataFrame; must contain 'Incident Date' and
            'Incident Year'. Crime columns missing from the frame are simply
            not renamed (``rename`` ignores unknown labels).
        businesses: DataFrame with 'Number of businesses' and the
            'Businesses <lag>' columns.
        openings: DataFrame with 'Openings', the 'Openings <lag>' columns and
            the 'Last N days openings' columns.
        closures: DataFrame with 'Closures', the 'Closures <lag>' columns and
            the 'Last N days closures' columns.

    Returns:
        A new DataFrame; ``sfpd_data`` itself is left untouched.

    Raises:
        KeyError: if a required business/opening/closure column, or one of
            the two dropped incident columns, is missing.

    Note:
        Columns are copied by plain assignment, so pandas matches rows on the
        index labels. This assumes all four frames share the same row index --
        TODO confirm against the preprocessing step.
    """
    lags = ['1 day ago', '2 days ago', '3 days ago', '4 days ago',
            '5 days ago', '6 days ago', '7 days ago', '14 days ago',
            '30 days ago', '365 days ago']
    windows = ['Last 7 days', 'Last 14 days', 'Last 28 days']

    crime_names = {lag: 'Crime ' + lag for lag in lags}
    crime_names.update({window: window + ' crime' for window in windows})
    # rename() returns a new frame. The result has to be kept, otherwise the
    # rename is a silent no-op. Working on the returned copy also keeps the
    # caller's frame free of the columns added below.
    merged = sfpd_data.rename(columns=crime_names)

    merged['Number of businesses'] = businesses['Number of businesses']
    for lag in lags:
        merged['Businesses ' + lag] = businesses['Businesses ' + lag]

    # Closures first, then openings, to keep the established column order.
    for kind, frame in (('closures', closures), ('openings', openings)):
        prefix = kind.capitalize()
        merged['Number of ' + kind] = frame[prefix]
        for lag in lags:
            merged[prefix + ' ' + lag] = frame[prefix + ' ' + lag]
        for window in windows:
            merged[window + ' ' + kind] = frame[window + ' ' + kind]

    return merged.drop(columns=['Incident Date','Incident Year'])

In [176]:
# convert day of week column to one hot encoded columns
def convert_day_of_week(df):
    """One-hot encode 'Incident Day of Week', using Monday as the baseline.

    One indicator column is appended per distinct day name, then the
    original 'Incident Day of Week' column and the 'Monday' indicator are
    dropped.

    Args:
        df: DataFrame containing an 'Incident Day of Week' column.

    Returns:
        A new DataFrame with 0/1 integer indicator columns appended.

    Raises:
        KeyError: if 'Incident Day of Week' is missing, or if no row has the
            value 'Monday' (there is then no 'Monday' column to drop).
    """
    # pandas >= 2.0 produces bool indicators by default. Pin the dtype to
    # the earlier uint8 default so the columns stay 0/1 integers and remain
    # usable in arithmetic such as min-max scaling.
    indicators = pd.get_dummies(df['Incident Day of Week'], dtype='uint8')
    combined = pd.concat([df, indicators], axis='columns')
    return combined.drop(['Incident Day of Week','Monday'], axis='columns')

In [178]:
# convert neighbourhood column to one hot encoded columns
def convert_neighborhood(df):
    """One-hot encode 'Neighborhood', using 'Ashbury Heights' as the baseline.

    One indicator column is appended per distinct neighbourhood name, then
    the original 'Neighborhood' column and the 'Ashbury Heights' indicator
    are dropped.

    Args:
        df: DataFrame containing a 'Neighborhood' column.

    Returns:
        A new DataFrame with 0/1 integer indicator columns appended.

    Raises:
        KeyError: if 'Neighborhood' is missing, or if no row has the value
            'Ashbury Heights' (there is then no such column to drop).
    """
    # pandas >= 2.0 produces bool indicators by default. Pin the dtype to
    # the earlier uint8 default so the columns stay 0/1 integers and remain
    # usable in arithmetic such as min-max scaling.
    indicators = pd.get_dummies(df['Neighborhood'], dtype='uint8')
    combined = pd.concat([df, indicators], axis='columns')
    return combined.drop(['Neighborhood','Ashbury Heights'], axis='columns')

In [180]:
# scale one column to the [0, 1] range
def _min_max_scale(column):
    """Min-max scale a single column (pandas Series)."""
    if column.dtype == bool:
        # Boolean subtraction raises TypeError, so treat True/False as 1/0.
        column = column.astype('uint8')
    low = column.min()
    span = column.max() - low
    if span == 0:
        # A constant column would otherwise become 0/0 = NaN; map it to 0.0.
        return (column - low).astype(float)
    return (column - low) / span


# normalise all columns with min max normalisation
def normalise_data(df):
    """Min-max normalise every column except the 'Todays Reports' target.

    Each feature column is rescaled to the [0, 1] range. Constant columns
    become 0.0 rather than NaN, and bool columns are handled as 0/1. The
    target column is left unscaled and re-attached as the last column.

    Args:
        df: DataFrame with a 'Todays Reports' column; the other columns are
            expected to be numeric or bool.

    Returns:
        A new DataFrame of scaled features followed by 'Todays Reports'.

    Raises:
        KeyError: if 'Todays Reports' is missing.
    """
    target = df['Todays Reports']
    features = df.drop(columns=['Todays Reports'])
    scaled = features.apply(_min_max_scale)
    scaled['Todays Reports'] = target
    return scaled

In [181]:
# Load the crime dataset (open_sfpd reads 'SFPD_preprocessed.csv').
sfpd_data = open_sfpd()

In [182]:
# Load the business dataset (open_businesses reads 'businesses_preprocessed.csv').
businesses = open_businesses()

In [183]:
# Load the business openings dataset ('businesses_openings.csv').
openings = open_business_openings()

In [184]:
# Load the business closures dataset ('businesses_closures.csv').
closures = open_business_closures()

In [185]:
# Merge the business, opening and closure columns into the crime frame.
final_data = finalise_data(sfpd_data,businesses,openings,closures)

In [186]:
# One-hot encode 'Neighborhood'. NOTE: this cell rebinds final_data, so
# re-running it on its own fails because the 'Neighborhood' column is gone.
final_data = convert_neighborhood(final_data)

In [187]:
# One-hot encode 'Incident Day of Week'. NOTE: this cell rebinds final_data,
# so re-running it on its own fails because the source column is gone.
final_data = convert_day_of_week(final_data)

In [188]:
# Display the encoded frame as it stands before normalisation.
final_data

Unnamed: 0,Todays Reports,Reports 1 day ago,Reports 2 days ago,Reports 3 days ago,Reports 4 days ago,Reports 5 days ago,Reports 6 days ago,Reports 7 days ago,Reports 14 days ago,Reports 30 days ago,...,Western Addition,Westwood Highlands,Westwood Park,Yerba Buena Island,Friday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,2.0,1.0,0.0,2.0,1.0,1.0,3.0,4.0,1.0,3.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,1.0,0.0,2.0,0.0,4.0,2.0,2.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,1.0,1.0,1.0,1.0,0.0,2.0,0.0,4.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,2.0,5.0,0.0,6.0,2.0,1.0,1.0,3.0,6.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58495,0.0,0.0,0.0,1.0,3.0,1.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1
58496,0.0,3.0,0.0,3.0,7.0,4.0,6.0,5.0,5.0,5.0,...,1,0,0,0,0,0,0,0,0,1
58497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,1
58498,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0,0,1,0,0,0,0,0,0,1


In [189]:
# Min-max normalise every column except the 'Todays Reports' target.
final_data = normalise_data(final_data)

In [190]:
# Write the finished frame to 'finalised_data.csv' without the row index.
final_data.to_csv('finalised_data.csv', index = False)