In [1]:
import pandas as pd
import numpy as np
import glob
import os
import pickle

In [None]:
# load standard libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import math

## Loading the Data
We load the data using a glob pattern and a predetermined path. The data itself was downloaded from data.police.uk. Using regular expressions in `BASH`, the files were extracted into a separate folder for ease of use.

In [None]:
# Folder that holds the extracted CSV files; replace the placeholder before running.
path = "<PATH OF DATA HERE>"
# Every CSV directly inside that folder, sorted so the row order of the combined
# dataframe is the same on every run (glob itself returns files in arbitrary order).
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

In [2]:
def concat_files_to_df(lstFiles):
    '''
    Read every csv file named in lstFiles and stack the rows into one dataframe.

    Each file is read with its first row as header and without an index column;
    the result gets a fresh 0..n-1 index.
    note: it is only meant for csv files that share the same columns
    '''
    frames = [pd.read_csv(csv_path, index_col=None, header=0) for csv_path in lstFiles]
    return pd.concat(frames, axis=0, ignore_index=True)

In [None]:
all_data = concat_files_to_df(all_files)

## Preprocessing

In [None]:
df_start = all_data.copy()
N = len(df_start)

In [None]:
len(df_start['Crime ID'].unique()) / N

# conclusion ~33% of crimes have no ID

In [None]:
# replace the Month column, with integer values
# the dates are parsed once and reused, instead of parsing the whole column twice
month_index = pd.DatetimeIndex(df_start['Month'])
df_start['year'] = month_index.year
df_start['month'] = month_index.month
df_start.drop('Month', axis=1, inplace=True)

In [None]:
# drop these two columns as they are useless
df_start.drop(['Reported by', 'Falls within'], axis=1, inplace=True)

In [None]:
# we have multiple variables that give location data, location as a column is non specific and needs to be encoded on top to be worked with
# therefore we can drop the column
df_start.drop('Location', axis=1, inplace=True)

In [None]:
# only one LSOA identifier is enough
df_start.drop('LSOA code', axis=1, inplace=True)

In [None]:
# one third of the dataset does not have an outcome category, as we are currently NOT specifying crime type that importantly
# we can drop the column now, but if we think crime type is very important, we will have to add it back in, for more nuance
# we would also have to further look into the nan values then
df_start[df_start['Last outcome category'].isna()]

In [None]:
df_start.drop('Last outcome category', axis=1, inplace=True)

After analysing the `Context` column, it can be concluded that some crimes have been allocated the nearest possible location, as the true location of the crime could not be properly mapped. Therefore, as the location is not precise, we consider this data to be faulty and we will delete these data entries. 

In [4]:
# only keep non context values
df_start = df_start[df_start['Context'].isna()].copy()

In [None]:
# now we can drop the Context column, as it doesn't hold anymore information
df_start.drop('Context', axis=1, inplace=True)

In [None]:
# after some consideration, we decided to drop the crime ID column for now, it can easily be put back in if we want to change our approach
df_start.drop('Crime ID', axis=1, inplace=True)

### Adding extra features

We start with adding the median incomes

In [None]:
df_police = df_start.copy()

In [None]:
# Get a set of all of the areas not in the Greater Manchester Area

df_police['Borough'] = df_police['LSOA name'].str[:-5] # Add first part of LSOA name as borough in the dataframe

lst_boroughs = ["Manchester", "Salford", "Bolton", "Bury", "Oldham", "Rochdale", "Stockport", 
                "Tameside", "Trafford", "Wigan"] # List contains all the boroughs of the Greater Manchester Area

lst_boroughs_in_df = df_police['Borough'].unique() # List contains all unique boroughs in the df_police dataframe

set_incorrect_boroughs = set(lst_boroughs_in_df) - set(lst_boroughs) # Set contains all areas that are in the dataframe but are not in the Greater Manchester Area

In [None]:
# Remove areas that don't fall within the Greater Manchester Area

# One vectorised membership test instead of re-filtering the whole frame once per
# unwanted area. Rows without a Borough (NaN) are deliberately kept here, exactly as
# the per-area `!=` filter did; they are removed by the dropna() further down.
is_outside = df_police['Borough'].isin(set_incorrect_boroughs)
df_police = df_police[~is_outside | df_police['Borough'].isna()]

In [None]:
# Get Median Annual Gross Pay data

df_pay = pd.read_excel('AnnualPayGrossManchester.xlsx')
df_pay = df_pay.set_index('Region')
df_pay

In [None]:
# Get unique years in the df_police

# NOTE: dropna() is called without `subset`, so it removes every row that has a NaN
# in ANY column, not only the rows without a Borough
df_police = df_police.dropna() # Drop rows with a NaN value (this includes rows without a Borough)
lst_years = df_police['year'].unique() # Array contains all unique years in df_police dataframe

In [None]:
# Add Median Annual Gross Pay to rows dependent of borough and year

# Collect one frame per (borough, year) pair and concatenate once at the end:
# growing a dataframe with pd.concat inside the loop re-copies all earlier rows on
# every iteration and relies on concatenating with an empty dataframe.
lst_frames = []

for borough in lst_boroughs:
    df_borough = df_police[df_police['Borough'] == borough]
    for year in lst_years:
        df_year = df_borough[df_borough['year'] == year].copy() # copy, so the new column is not written to a slice
        # df_pay is indexed by 'Region' and read by year column; the region is looked up with two leading spaces
        df_year['magp'] = df_pay[year]['  ' + borough]
        lst_frames.append(df_year)

df = pd.concat(lst_frames) if lst_frames else pd.DataFrame()

df_police = df

Add the average age

In [None]:
def get_avg_age_per_year(file_name):
    """
    Compute the head-count weighted average age of every Greater Manchester LSOA.

    param: file_name, Excel file with a "LSOA Name" column, one head-count column
    per single year of age labelled 0..89 and a final "90+" column
    return: dict mapping LSOA name -> average age; everybody in the "90+" column is
    counted as being 90
    """
    lst_boroughs = ["Manchester", "Salford", "Bolton", "Bury", "Oldham", "Rochdale", "Stockport", "Tameside", "Trafford", "Wigan"]
    age_columns = list(range(90)) + ["90+"]

    df = pd.read_excel(file_name)
    df = df[["LSOA Name"] + age_columns]

    # the borough is the LSOA name without its last 5 characters; keep only the
    # boroughs listed above and drop rows with missing values
    borough = df['LSOA Name'].str[:-5]
    df = df[borough.isin(lst_boroughs)].dropna()

    counts = df.set_index('LSOA Name')

    # weighted mean per row: sum(age * head count) / sum(head count), computed for
    # all LSOAs at once instead of transposing the frame once per LSOA
    ages = list(range(91))  # the 91st column ("90+") is weighted as age 90
    weighted_sum = counts.mul(ages, axis="columns").sum(axis="columns")
    head_count = counts.sum(axis="columns")

    return (weighted_sum / head_count).to_dict()

In [None]:
def add_avg_age_per_year(files):
    """
    Add an 'avg age' column to the police data, using one age file per year.

    param: files, list of Excel file names; the i-th file is paired with the i-th
    year of 2011..2020
    return: dataframe with the rows of the global df_police for those years, plus the
    'avg age' column looked up by 'LSOA name' in that year's file
    """
    lst_years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
    lst_frames = []

    for year, file in zip(lst_years, files):
        # explicit copy, so the new column is not written to a slice of df_police
        df = df_police[df_police['year'] == year].copy()
        df['avg age'] = df['LSOA name'].map(get_avg_age_per_year(file))
        lst_frames.append(df)

    # concatenate once instead of growing the result inside the loop
    return pd.concat(lst_frames) if lst_frames else pd.DataFrame()

In [None]:
# The files are paired by position with the years 2011..2020 inside add_avg_age_per_year.
# 'age_2012.xlsx' is listed twice on purpose: it stands in for 2011 (see the note below).
lst_files = ['age_2012.xlsx', 'age_2012.xlsx', 'age_2013.xlsx', 'age_2014.xlsx', 'age_2015.xlsx', 'age_2016.xlsx',
             'age_2017.xlsx', 'age_2018.xlsx', 'age_2019.xlsx', 'age_2020.xlsx']
df = add_avg_age_per_year(lst_files)
# The file for 2011 uses age groups and can therefore not be used to calculate an accurate average,
# so the 2012 data is used for 2011 instead

## Clustering


In [None]:
# raw_data = pd.read_pickle("crime.pickle")
raw_data = df.copy()

In [None]:
# map the original column names to the snake_case names used in the rest of the notebook
# NOTE(review): "Cluster" and "Season" presumably do not exist yet at this point; rename ignores keys that match no column
dct = {"Longitude": "longitude", "Latitude": "latitude", "LSOA name": "LSOA", "Crime type": "crime_type", 
           "magp": "median_income", "avg age": "avg_age", "Borough": "borough", "Cluster": "cluster", "Season": "season"}
raw_data.rename(columns=dct, inplace=True)

In [None]:
def seasonal_data(df):
    """
    Splits the rows of df into the four seasons, based on the 'month' column
    (spring = 3-5, summer = 6-8, fall = 9-11, winter = 12, 1, 2)
    return: tuple of dataframes (spring, summer, fall, winter)
    """
    months_per_season = ([3, 4, 5], [6, 7, 8], [9, 10, 11], [12, 1, 2])
    return tuple(df[df['month'].isin(months)] for months in months_per_season)

def locational_data(df, x='longitude', y='latitude'):
    """
    Keeps only the two coordinate columns of df, x first and y second
    """
    coordinate_columns = [x, y]
    return df[coordinate_columns]

In [None]:
# we make a copy before proceeding, so the columns that are added to `data` later on
# are not written to a slice of raw_data
data = raw_data[(raw_data["year"] != 2019) & (raw_data["year"] != 2010)].copy()

In [None]:
df_clustering = data.sort_values(["month", "year"], ascending=(True, True))

In [None]:
df_clustering.reset_index(inplace=True)

In [None]:
# split off the seasons, this was done to check how hotspots behaved here, this is not part of the main script, but it's shown here for reference how this was obtained
spring, summer, fall, winter = seasonal_data(df_clustering)

In [None]:
spring_loc = locational_data(spring)
summer_loc = locational_data(summer)
fall_loc = locational_data(fall)
winter_loc = locational_data(winter)
cluster_loc = locational_data(df_clustering)

We continue from here on using the cluster_loc data; the seasonal data is not valid for the final project. We sample the data to severely speed up the KMeans Constrained algorithm. As we are randomly sampling this much data, the clusters are still representative. To extrapolate them we use a `Gradient Booster`, which had a 99.6% accuracy in assigning the clusters. This is a lot faster than training KMeans Constrained on the added data, and it avoids the many, many, many memory errors we ran into.

In [5]:
def sampler(df, x="Longitude", y="Latitude", frc=0.1):
    """
    Draws a reproducible random sample (fixed seed 42) holding the fraction frc of the rows of df
    note: x and y are accepted but not used
    """
    return df.sample(random_state=42, frac=frc)

In [None]:
sample_loc = sampler(cluster_loc, frc=0.1)

Important background information: to find this main hotspot, so-called datashader plots were used. However, implementing them in this main script is difficult. They use a library dependency called `Numba`, and this library has a conflicting `numpy` version dependency with `KMeans Constrained`. As the latter is more important for our overall performance, the plots were omitted from the final result. The code to get a plot like this is added, but it is commented out so it doesn't crash.

In [None]:
# import datashader as ds
# from datashader.mpl_ext import dsshow


# def using_datashader(ax, x, y, mx=80):
#     df = pd.DataFrame(dict(x=x, y=y))
#     dsartist = dsshow(
#         df,
#         ds.Point("x", "y"),
#         ds.count(),
#         vmin=0,
#         vmax=mx,
#         norm="linear",
#         aspect="auto",
#         ax=ax,
#     )

#     plt.colorbar(dsartist)


# # fig, ax = plt.subplots()
# # using_datashader(ax, loc_x, loc_y)
# # fig.set_size_inches(18.5, 10.5)

# # ax.annotate('axes fraction',
# #             xy=(-2.22, 53.48),
# #             xytext=(-2.22, 53.48), textcoords='X')

# # # Clusters:
# # # (-2.18, 53.63)
# # # (-2.13, 53.54)
# # # (-2.1, 53.48) - klein
# # # (-2.28, 53.58)
# # # (-2.42, 53.57)

# # plt.text(-2.2425, 53.48, "X", size=16, weight='bold')
# # plt.text(-2.167, 53.615, "X", size=14)
# # plt.text(-2.123, 53.54, "X", size=14)
# # plt.text(-2.3, 53.59, "X", size=14)
# # plt.text(-2.438, 53.576, "X", size=14)
# # plt.text(-2.1, 53.49, "X", size=12)

# # plt.show()

In [None]:
def main_hotspot_grid(df, x1=-2.22, x2=-2.26, y1=53.46, y2=53.49):
    """
    Selects the rows that lie strictly inside the bounding box of the main hotspot cluster in the GMA
    latitude between y1 and y2, longitude between x2 and x1 (bounds excluded)
    """
    latitude = df['latitude']
    longitude = df['longitude']
    inside_latitude = (latitude > y1) & (latitude < y2)
    inside_longitude = (longitude > x2) & (longitude < x1)
    return df[inside_latitude & inside_longitude]

In [6]:
print('The following holds for the data')
hotspot = main_hotspot_grid(sample_loc)
print(f'There are roughly {len(hotspot)} data points in the main hotspot, suggesting an esmitation for the max cluster size')
print(f'There is a total of {len(sample_loc)} data points')
if len(hotspot) > 0:
    print(f'The minimum number of clusters from this simple calculation would be: {math.ceil(len(sample_loc)/len(hotspot))}')
else:
    # without this guard an empty hotspot raises a ZeroDivisionError
    print('No sampled data points fall inside the main hotspot, so the minimum number of clusters cannot be estimated')

The following holds for the data


NameError: name 'main_hotspot_grid' is not defined

In [None]:
clusters_array = np.array(sample_loc)

In [None]:
from k_means_constrained import KMeansConstrained

# size-constrained k-means on the sampled coordinates: 13 clusters, no lower bound
# and an upper bound of 22000 points per cluster, fixed seed, all cores
# NOTE(review): presumably n_clusters * size_max has to be at least the number of sampled points - confirm
clf_clusters = KMeansConstrained(
     n_clusters=13,
     size_min=None,
     size_max=22000,
     random_state=42,
     n_jobs=-1
)
# the return value of fit_predict is not kept; the labels are read from the fitted model below
clf_clusters.fit_predict(clusters_array)

sample_labels = clf_clusters.labels_

In [None]:
# add the clusters to the appropriate labels
sample_loc['cluster'] = pd.Series(sample_labels.copy(), index=sample_loc.index)

In [None]:
from sklearn.model_selection import train_test_split

# features are the two coordinates, the target is the cluster label of the sampled points
data_x = sample_loc.drop("cluster", axis=1)
data_y = sample_loc["cluster"]

# fixed seed (the same one as the sampler and the clustering) so the split, and with
# it the reported scores, are reproducible
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# classifier that learns the cluster label from the coordinates; the seed is fixed so
# that re-running the notebook gives the same model
gbrt = GradientBoostingClassifier(random_state=42)
gbrt.fit(X_train, y_train)

In [None]:
y_pred = gbrt.predict(X_test)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# score the cluster classifier on the held-out sample; precision, recall and f1 use
# the "weighted" average over the classes and count undefined cases as 0
prec_score = precision_score(y_test, y_pred, average="weighted", zero_division=0)
rec_score = recall_score(y_test, y_pred, average="weighted", zero_division=0)
F1_score = f1_score(y_test, y_pred, average="weighted", zero_division=0)
acc_score = accuracy_score(y_test, y_pred)

# report every score rounded to 4 decimals
print(f'The accuracy of the model is {round(acc_score, 4)}.')
print(f'The precision of the model is {round(prec_score, 4)}, using weighted average.')
print(f'The recall of the model is {round(rec_score, 4)}, using weighted average.')
print(f'The f1-score of the model is {round(F1_score, 4)} using weighted average.')

In [None]:
loc_test = locational_data(data)

In [None]:
# extrapolating the clusters
test_predict = gbrt.predict(loc_test)

In [None]:
# adding the clusters to the data
data['cluster'] = pd.Series(test_predict.copy(), index=data.index)

In [None]:
def find_season(month, hemisphere="Northern"):
    """
    Looks up the season of a month number (1-12), depending on the hemisphere
    param: month number, hemisphere 'Northern' or 'Southern'
    return: 'Winter', 'Spring', 'Summer' or 'Autumn'; None for an unknown month, and
    None (after printing a message) for an unknown hemisphere
    """
    # season names in calendar order, starting with the season of December-February
    if hemisphere == 'Northern':
        names = ('Winter', 'Spring', 'Summer', 'Autumn')
    elif hemisphere == 'Southern':
        names = ('Summer', 'Autumn', 'Winter', 'Spring')
    else:
        print('Invalid selection. Please select a hemisphere and try again')
        return None

    # (m % 12) // 3 puts 12, 1, 2 in block 0, 3-5 in block 1, 6-8 in block 2 and 9-11 in block 3
    season_per_month = {m: names[(m % 12) // 3] for m in range(1, 13)}
    return season_per_month.get(month)

In [None]:
hemisphere = 'Northern'
# look up the season of the month of every row, in row order
season_list = [find_season(month, hemisphere) for month in data.month]

data['season'] = season_list

## Models & Adding some Time Features

In [None]:
# same column mapping as the one applied to raw_data in the clustering section
# NOTE(review): `data` is derived from the already renamed raw_data, so this presumably changes nothing - confirm
dct = {"Longitude": "longitude", "Latitude": "latitude", "LSOA name": "LSOA", "Crime type": "crime_type", 
           "magp": "median_income", "avg age": "avg_age", "Borough": "borough", "Cluster": "cluster", "Season": "season"}
data.rename(columns=dct, inplace=True)

In [None]:
# change cluster 0 to 13, just looks nicer
data.loc[data.cluster==0, "cluster"] = 13

In [None]:
# adding previous month to the data for an easier identifier 
# lst_months and lst_prev are kept because later cells use them
lst_months = sorted(data.month.unique().tolist())
lst_prev = [12] + lst_months[:11]

# months are numbered 1..12, so the month before m is ((m - 2) mod 12) + 1, which
# wraps January to December; computing it directly replaces the 12-step index loop,
# which raised an IndexError as soon as one calendar month was missing from the data
data["prev_month"] = ((data["month"] - 2) % 12 + 1).astype(int)

In [None]:
# adding different types of crime statistics, currently the only one that is used is total monthly crime `monthly`
# this however easily could be build upon for different models/time series analyses given more time

# year
data["yearly"] = data.groupby(["year", "cluster"])["crime_type"].transform("count")
data["yearly_crime"] = data.groupby(["year", "crime_type", "cluster"])["crime_type"].transform("count")

# month
data["monthly"] = data.groupby(["year", "month", "cluster"])["crime_type"].transform("count")
data["monthly_crime"] = data.groupby(["year", "month", "crime_type", "cluster"])["crime_type"].transform("count")

# seasonality
data["seasonality"] = data.groupby(["year", "season", "cluster"])["crime_type"].transform("count")
data["seasonality_crime"] = data.groupby(["year", "season", "crime_type", "cluster"])["crime_type"].transform("count")

In [None]:
# we are going to add a time feature that has a rolling window, therefore we need to do some data wrangling steps
# due to the nature of the clustering, the data wrangling was a bit complicated and I did not find a way to do it
# more elegant then using 3 for loops (technically calling the function every time for a new cluster is also a for loop)
# however having the monthly data in seperate lists is usefull regardless to calculate the rolling mean more easily
# reminder, there are 13! clusters in total


lst_years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
clusters = sorted(data.cluster.unique().tolist())
lst_months

def rolling_mean(data, list_years=lst_years, list_months=lst_months, size=4):
    """
    Outputs a list with the rolling mean of the monthly crime count, sorted on time,
    for one particular cluster
    param: dataframe, this should only contain the cluster specific data
    list of years, list of months, rolling window size
    return: lst of the resulting rolling mean for this cluster, one entry per
    (year, month) combination in the order years x months; the first size-1 entries
    are NaN, and so is every window that touches a (year, month) without data
    """
    # NOTE(review): the window that ends at month t includes month t itself, while
    # `rolling_months` is later used as a feature for the target `monthly` of that
    # same month - consider shifting the result by one month to avoid target leakage

    # one pass over the data: the `monthly` value of every (year, month) that occurs
    monthly_per_period = data.groupby(["year", "month"])["monthly"].first()

    # lay the values out on the full years x months grid; combinations without data
    # become NaN instead of raising an IndexError
    grid = pd.MultiIndex.from_product([list_years, list_months], names=["year", "month"])
    lst = monthly_per_period.reindex(grid).tolist()

    windows = pd.Series(lst, dtype=float).rolling(size).mean()
    return windows.tolist()

In [None]:
# lists that represent the rolling means, just a number behind lst is used to indicated the cluster
# we can use previous months also in the test set as they will be known to the algorithm at the time of prediction

# one call per cluster 1..13, unpacked into lst1..lst13 because later cells use these names
(lst1, lst2, lst3, lst4, lst5, lst6, lst7,
 lst8, lst9, lst10, lst11, lst12, lst13) = [
    rolling_mean(data.loc[data.cluster == cluster_id]) for cluster_id in range(1, 14)
]

In [None]:
data[["rolling_months"]] = np.nan

In [None]:
def rolling_mean_adder(data, lst , Z, list_years=lst_years, list_months=lst_months):
    """
    Adds the rolling mean values back into the dataframe (column "rolling_months")
    param: dataframe of all the data, lst containing the cluster rolling mean with one
    value per (year, month) in the order years x months, Z an integer representing
    the cluster, list of years, list of months
    return: None, data is changed in place
    raises: ValueError if a non-empty lst does not have one value per (year, month)
    """
    if len(lst) == 0:
        return

    # the values are matched to the periods by position, so the lengths have to agree;
    # a longer list used to overwrite earlier months, a shorter one raised an IndexError
    expected = len(list_years) * len(list_months)
    if len(lst) != expected:
        raise ValueError(f"expected {expected} rolling mean values, got {len(lst)}")

    in_cluster = data.cluster == Z  # does not depend on the period, so computed once
    periods = ((year, month) for year in list_years for month in list_months)

    for (year, month), value in zip(periods, lst):
        data.loc[in_cluster & (data.year == year) & (data.month == month), "rolling_months"] = value

In [None]:
# lst number corresponding to cluster number to add data to the clusters
# every cluster gets its OWN rolling means (previously lst1 was written to all 13 clusters)

lst_cluster_means = [lst1, lst2, lst3, lst4, lst5, lst6, lst7,
                     lst8, lst9, lst10, lst11, lst12, lst13]

for cluster_id, cluster_means in enumerate(lst_cluster_means, start=1):
    rolling_mean_adder(data, cluster_means, cluster_id)

In [None]:
# add prev months crime rate this should be less effective then rolling window
# For every row this is the `monthly` count of the same cluster one calendar month
# earlier. (The old loop selected rows by prev_month, which are the rows of the
# current month, and wrote the first `monthly` value it found to the whole month,
# regardless of year and cluster.)

# year * 12 + month numbers the months consecutively, so the previous month is simply
# "minus one", also across a year boundary
period = (data["year"].astype("int64") * 12 + data["month"].astype("int64")).to_numpy()
cluster_ids = data["cluster"].to_numpy()

# `monthly` is constant within a (year, month, cluster) group, so the first value represents the group
monthly_per_period = data["monthly"].groupby([period, cluster_ids]).first()

# rows whose previous month has no data for their cluster get NaN
previous_keys = pd.MultiIndex.from_arrays([period - 1, cluster_ids])
data["prev_month_crime"] = monthly_per_period.reindex(previous_keys).to_numpy()

In [None]:
# timestamp was dropped in preprocessing, but for datatime ease we add it back in, all data was added on first day of month, so we can assing this
# this was done to try and train an ARIMA model, this model gave RAM errors asking for 300GB+ of RAM
data["day"] = 1

data["datetime"] = pd.to_datetime(data[["year", "month", "day"]])

In [None]:
# as data is timeseries we sort it one more time to be sure
data.sort_values("datetime", inplace=True)

In [None]:
# get rid of the incomplete years as they can't be used properly currently
data = data[~data.year.isin([2011, 2019])]

In [None]:
# split train and test based off whole years
# the last two years form the test set, everything else is used for training
is_test_year = data.year.isin([2017, 2018])
train = data[~is_test_year]
test = data[is_test_year]

In [None]:
def seasonality_pick(data, season,season_crime):
    """
    Returns a copy of the dataframe that holds the general columns plus the two
    count columns of the wanted seasonality
    param: dataframe, name of the total count column, name of the per crime type count column
    """
    general_columns = [
        "crime_type", "datetime", "borough", "median_income", "avg_age", "cluster",
        "year", "season", "month", "rolling_months", "prev_month_crime",
    ]
    wanted_columns = general_columns + [season, season_crime]
    return data[wanted_columns].copy()

In [None]:
# pick the training sets 
train_month = seasonality_pick(train, "monthly", "monthly_crime")
# train_year = seasonality_pick(train, "yearly", "yearly_crime")
# train_season = seasonality_pick(train, "seasonality", "seasonality_crime")

In [None]:
# pick the test sets 
test_month = seasonality_pick(test, "monthly", "monthly_crime")
# test_year = seasonality_pick(test, "yearly", "yearly_crime")
# test_season = seasonality_pick(test, "seasonality", "seasonality_crime")

### Encoding

In [None]:
# from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

# not using the Pipeline for now, as we don't want to scale the rolling_months and I don't have time to figure out how to pass this
# the Pipeline that is displayed is wrong and has to be build out with different Encoders regardless

# num_pipeline = Pipeline([
#     ("std_scaler", StandardScaler())
# ])

In [None]:
# encoders
ord_enc = OrdinalEncoder()
borough_enc = LabelEncoder()
crime_enc = LabelEncoder()
inc_scl = StandardScaler()
age_scl = StandardScaler()

In [None]:
# fit and transform encoding on the train set
train_month.season = ord_enc.fit_transform(train_month[["season"]])
train_month.crime_type = crime_enc.fit_transform(train_month["crime_type"])
train_month.borough = borough_enc.fit_transform(train_month["borough"])
train_month.median_income = inc_scl.fit_transform(train_month[["median_income"]])
train_month.avg_age = age_scl.fit_transform(train_month[["avg_age"]])

In [None]:
# transform the test set
test_month.season = ord_enc.transform(test_month[["season"]])
test_month.crime_type = crime_enc.transform(test_month["crime_type"])
test_month.borough = borough_enc.transform(test_month["borough"])
test_month.median_income = inc_scl.transform(test_month[["median_income"]])
test_month.avg_age = age_scl.transform(test_month[["avg_age"]])

### Models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

In [None]:
# test sets are intentionally commented out
X_train = train_month[["crime_type", "borough", "median_income", "avg_age", "season", "month", "rolling_months"]]
# X_test = test_clst1[["crime_type", "borough", "median_income", "avg_age", "season", "month", "rolling_months"]]
y_train = train_month["monthly"]
# y_test = test_clst1["monthly"]

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Helper functions

def time_metrics(y_test, y_pred, model):
    """
    prints MAE, MSE, RMSE and R2 (rounded to 3 decimals) of the predictions
    param: true values, predicted values, model name as a string (used in the header)
    """
    print(f"Error metrics for the {model} prediction model")
    print("\n")

    mae = mean_absolute_error(y_test, y_pred)
    print('Mean Absolute Error:', round(mae, 3))

    mse = mean_squared_error(y_test, y_pred)
    print('Mean Squared Error:', round(mse, 3))

    # the root is taken from the squared error computed above
    print('Root Mean Squared Error:', round(np.sqrt(mse), 3))

    r2 = r2_score(y_test, y_pred)
    print('R2 score:', round(r2, 3))

def test_model(data, model, model_name):
    """
    Prints the error metrics of the trained model for every cluster separately
    param: general test set, the variable storing the fitted model, model name as a string
    return: None, the metrics are printed
    """
    feature_columns = ["crime_type", "borough", "median_income", "avg_age", "season", "month", "rolling_months"]

    for cluster_id in clusters:
        cluster_rows = data.loc[data.cluster == cluster_id]
        predictions = model.predict(cluster_rows[feature_columns])
        print("\n")
        print(f"Cluster {cluster_id}")
        time_metrics(cluster_rows["monthly"], predictions, model_name)

#### Linear

In [None]:
lin_reg = LinearRegression();
lin_reg.fit(X_train, y_train);

In [None]:
test_model(test_month, lin_reg, "Linear")

#### Random Forest

In [None]:
# fixed seed so the forest, and with it the reported errors, are reproducible
rdf_reg = RandomForestRegressor(random_state=42)
rdf_reg.fit(X_train, y_train);

In [None]:
test_model(test_month, rdf_reg, "Random Forest")

#### Gradient Boosters

# scikit-learn gradient boosting; fixed seed so the model is reproducible
xgb_reg = GradientBoostingRegressor(random_state=42)
xgb_reg.fit(X_train, y_train)

In [None]:
test_model(test_month, xgb_reg, "Gradiant Boosting")

In [None]:
import xgboost as xgb

# we ignore future warnings so the outcomes become more readable
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# XGBRegressor is the gradient boosted model of xgboost; XGBRFRegressor, which was
# used here before, trains a random forest and does not match the
# "Extreme Gradient Boosting" label under which the results are reported
xtr_bst = xgb.XGBRegressor()
xtr_bst.fit(X_train, y_train)

In [None]:
test_model(test_month, xtr_bst, "Extreme Gradient Boosting")