In [18]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Load the trimmed Chicago dataset from CSV into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
csv_file = '../../datasets/chicago_trimmed_10000.csv'
df = pd.read_csv(csv_file)

In [3]:
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,date,primary_type,description,location_description,arrest,latitude,longitude
0,2010-10-03 11:30:00+00:00,BATTERY,SIMPLE,SIDEWALK,False,41.745809,-87.547926
1,2005-10-31 03:55:00+00:00,BATTERY,SIMPLE,SIDEWALK,False,41.751061,-87.569876
2,2003-10-07 02:30:00+00:00,BATTERY,SIMPLE,APARTMENT,True,41.751215,-87.571098
3,2009-01-22 02:30:00+00:00,BATTERY,SIMPLE,"SCHOOL, PUBLIC, BUILDING",False,41.73365,-87.557845
4,2020-07-03 12:15:00+00:00,BATTERY,SIMPLE,RESIDENCE,False,41.749243,-87.549214


In [4]:
# List the distinct offence types. Note the near-duplicate labels in the output
# (e.g. 'CRIM SEXUAL ASSAULT' vs 'CRIMINAL SEXUAL ASSAULT'), which are treated
# as separate categories by everything below.
df['primary_type'].unique()

array(['BATTERY', 'ASSAULT', 'NARCOTICS', 'OTHER OFFENSE',
       'CRIMINAL TRESPASS', 'ROBBERY', 'CRIM SEXUAL ASSAULT', 'BURGLARY',
       'DECEPTIVE PRACTICE', 'WEAPONS VIOLATION', 'STALKING',
       'PUBLIC PEACE VIOLATION', 'SEX OFFENSE', 'THEFT',
       'MOTOR VEHICLE THEFT', 'OFFENSE INVOLVING CHILDREN',
       'PROSTITUTION', 'ARSON', 'LIQUOR LAW VIOLATION', 'GAMBLING',
       'INTERFERENCE WITH PUBLIC OFFICER', 'KIDNAPPING',
       'CRIMINAL SEXUAL ASSAULT', 'OBSCENITY', 'HOMICIDE',
       'CRIMINAL DAMAGE', 'INTIMIDATION', 'OTHER NARCOTIC VIOLATION'],
      dtype=object)

In [9]:
# Build the stratification key: offence type concatenated with the arrest flag
# (e.g. 'BATTERY' + 'False' -> 'BATTERYFalse').
df['primary_type_arrest_cat'] = df['primary_type'] + df['arrest'].astype(str)

# Keep only categories that have more than 20 rows; rarer categories are dropped.
# NOTE(review): presumably the threshold ensures every category has enough rows
# for the two stratified splits below - confirm that 20 is the intended cut-off.
df = df.groupby('primary_type_arrest_cat').filter(lambda x : len(x) > 20)
df['primary_type_arrest_cat'].value_counts()

BATTERYFalse                            2466
ASSAULTFalse                             932
NARCOTICSTrue                            825
BATTERYTrue                              754
THEFTFalse                               670
OTHER OFFENSEFalse                       530
CRIMINAL DAMAGEFalse                     522
ROBBERYFalse                             519
DECEPTIVE PRACTICEFalse                  442
BURGLARYFalse                            407
MOTOR VEHICLE THEFTFalse                 298
ASSAULTTrue                              250
THEFTTrue                                178
OFFENSE INVOLVING CHILDRENFalse          113
CRIMINAL TRESPASSTrue                     91
OTHER OFFENSETrue                         87
CRIMINAL TRESPASSFalse                    83
DECEPTIVE PRACTICETrue                    80
WEAPONS VIOLATIONTrue                     59
CRIM SEXUAL ASSAULTFalse                  59
PUBLIC PEACE VIOLATIONTrue                59
ROBBERYTrue                               56
CRIMINAL D

In [10]:
# The category filter may have removed rows and left gaps in the index, so
# rebuild a contiguous 0..n-1 index. drop=True discards the old labels instead
# of keeping them as an extra column.
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,date,primary_type,description,location_description,arrest,latitude,longitude,primary_type_arrest_cat
0,2010-10-03 11:30:00+00:00,BATTERY,SIMPLE,SIDEWALK,False,41.745809,-87.547926,BATTERYFalse
1,2005-10-31 03:55:00+00:00,BATTERY,SIMPLE,SIDEWALK,False,41.751061,-87.569876,BATTERYFalse
2,2003-10-07 02:30:00+00:00,BATTERY,SIMPLE,APARTMENT,True,41.751215,-87.571098,BATTERYTrue
3,2009-01-22 02:30:00+00:00,BATTERY,SIMPLE,"SCHOOL, PUBLIC, BUILDING",False,41.73365,-87.557845,BATTERYFalse
4,2020-07-03 12:15:00+00:00,BATTERY,SIMPLE,RESIDENCE,False,41.749243,-87.549214,BATTERYFalse


Splitting into train and test sets

Splitting the held-out test portion into test and validation sets

In [11]:
def stratified_shuffle_split_train_test_valid(
    df,
    stratify_col='primary_type_arrest_cat',
    holdout_size=0.4,
    valid_size=0.5,
    random_state=42,
):
    """Split ``df`` into stratified train, test and validation sets.

    The split is done in two stages. First ``holdout_size`` of the rows is
    held out and the remainder becomes the training set. Then the held-out
    rows are split again, with ``valid_size`` of them going to validation and
    the rest to test. With the defaults this gives roughly 60% / 20% / 20%.
    Both stages stratify on ``stratify_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split. Must contain ``stratify_col``. Any index is accepted
        because rows are selected by position.
    stratify_col : str, default 'primary_type_arrest_cat'
        Column whose class proportions are preserved in each split.
    holdout_size : float, default 0.4
        Fraction of ``df`` held out from training (test + validation).
    valid_size : float, default 0.5
        Fraction of the held-out rows assigned to the validation set.
    random_state : int, default 42
        Seed passed to both splitters.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, test_set, valid_set)``; the three sets are disjoint.
    """
    first_split = StratifiedShuffleSplit(
        n_splits=1, test_size=holdout_size, random_state=random_state
    )
    # split() yields positional indices, so select with .iloc; this is correct
    # whatever the DataFrame's index labels are.
    train_pos, holdout_pos = next(first_split.split(df, df[stratify_col]))
    train_set = df.iloc[train_pos]
    test_valid_set = df.iloc[holdout_pos]

    second_split = StratifiedShuffleSplit(
        n_splits=1, test_size=valid_size, random_state=random_state
    )
    # These positions refer to rows of test_valid_set, NOT of df. Selecting
    # them from df would return rows that overlap the training set.
    test_pos, valid_pos = next(
        second_split.split(test_valid_set, test_valid_set[stratify_col])
    )
    test_set = test_valid_set.iloc[test_pos]
    valid_set = test_valid_set.iloc[valid_pos]

    return train_set, test_set, valid_set


In [12]:
# Produce the three partitions; with the function's 0.4 and 0.5 split sizes this
# is roughly 60% train / 20% test / 20% validation by row count.
train_set, test_set, valid_set = stratified_shuffle_split_train_test_valid(df)

Check that the category proportions in each split still match those of the full dataset

In [14]:
def _category_shares(frame):
    """Return each category's share of the rows in ``frame`` as a Series."""
    counts = frame["primary_type_arrest_cat"].value_counts()
    return counts / len(frame)


# Despite the *_df names, these are Series of per-category proportions.
full_df = _category_shares(df)
train_df = _category_shares(train_set)
test_df = _category_shares(test_set)
valid_df = _category_shares(valid_set)

# Print the four distributions one after another for a visual comparison.
print(f'Full dataset\n{full_df} \n')
print(f'Train dataset\n{train_df} \n')
print(f'Test dataset\n{test_df} \n')
print(f'Validation dataset\n{valid_df} \n')

Full dataset
BATTERYFalse                            0.250559
ASSAULTFalse                            0.094696
NARCOTICSTrue                           0.083824
BATTERYTrue                             0.076610
THEFTFalse                              0.068076
OTHER OFFENSEFalse                      0.053851
CRIMINAL DAMAGEFalse                    0.053038
ROBBERYFalse                            0.052733
DECEPTIVE PRACTICEFalse                 0.044910
BURGLARYFalse                           0.041353
MOTOR VEHICLE THEFTFalse                0.030278
ASSAULTTrue                             0.025401
THEFTTrue                               0.018086
OFFENSE INVOLVING CHILDRENFalse         0.011481
CRIMINAL TRESPASSTrue                   0.009246
OTHER OFFENSETrue                       0.008840
CRIMINAL TRESPASSFalse                  0.008433
DECEPTIVE PRACTICETrue                  0.008128
WEAPONS VIOLATIONTrue                   0.005995
CRIM SEXUAL ASSAULTFalse                0.005995
PUBLIC 