### Import required libraries

In [1]:
# start with common imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Read in the data

In [2]:
# Load the CSV stored at ../data/homicide.csv into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../data/homicide.csv')

# Show the (rows, columns) dimensions of the loaded frame.
data.shape

(1542, 13)

In [3]:
# Preview the first five rows of the loaded frame.
data.head(5)

Unnamed: 0,AREANAME,VictAge,VictSex,VictDescent,PremisDesc,WeaponDesc,YEAR,MONTH,DAY,HOUR,TIME_OF_DAY,REPORTING_DELAY,target
0,Central,57,M,H,STAIRWELL*,UNKNOWN TYPE CUTTING INSTRUMENT,2020,5,15,19,Evening,1,1
1,Newton,31,M,A,SIDEWALK,UNKNOWN FIREARM,2020,7,23,21,Evening,1,0
2,77th Street,19,M,B,STREET,UNKNOWN FIREARM,2020,6,17,50,,0,1
3,Northeast,37,M,W,STREET,BLUNT INSTRUMENT,2020,9,12,12,Morning,0,0
4,Newton,30,F,B,SIDEWALK,UNKNOWN FIREARM,2020,11,15,15,Afternoon,0,1


### Natural Language Processing
We will apply several techniques to the text data to prepare it for the model:
- Tokenization - converting text into tokens
- Removing Stopwords - removing common words that will likely appear in any text
- Lemmatization - converting words to their base form
- Stemming - reducing words to their root form
- n-grams - grouping words together (for example, instead of having "good" and "movie" as separate tokens, we can have "good movie" as one token)

### First we will create custom transformers to perform these tasks

### Split the data into training, validation and test sets

In [4]:
from fast_ml.model_development import train_valid_test_split

# Feature frame and target series. The split below works on `data` directly
# (it takes the target column name), so these are kept only for reference.
X = data.drop('target', axis=1)
y = data['target']

# Fraction of rows assigned to each partition
train_size = 0.7
valid_size = 0.1
test_size = 0.2

# Pass a fixed random_state so the random split, and every score computed
# from it, is reproducible across kernel restarts.
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(
    data,
    target='target',
    train_size=train_size,
    valid_size=valid_size,
    test_size=test_size,
    random_state=42,
)

# Sanity check: the three partitions together must account for every row.
assert len(X_train) + len(X_valid) + len(X_test) == len(data), \
    'split partitions do not add up to the full dataset'

# check the shape of the data
print('Training set shape: ', X_train.shape)
print('Validation set shape: ', X_valid.shape)
print('Test set shape: ', X_test.shape)

Training set shape:  (1079, 12)
Validation set shape:  (154, 12)
Test set shape:  (309, 12)


### Encode the target variable

In [5]:
from sklearn.preprocessing import LabelEncoder

# Label encoder for the target variable.
le = LabelEncoder()

# The encoder is fitted on the training labels only (first element, evaluated
# first); the validation and test labels are then mapped with that fitted encoder.
y_train, y_valid, y_test = (
    le.fit_transform(y_train),
    le.transform(y_valid),
    le.transform(y_test),
)

### Create ML Pipelines

In [10]:
# Libraries used to build the preprocessing and modelling pipeline
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns: fill missing values with 0, then standardize.
numeric_features = [
    'VictAge',
    'YEAR',
    'MONTH',
    'DAY',
    'HOUR',
    'REPORTING_DELAY',
]
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the literal 'missing', then
# one-hot encode. Categories not seen during fit are ignored at transform time.
categorical_features = [
    'AREANAME',
    'VictSex',
    'VictDescent',
    'TIME_OF_DAY',
    'WeaponDesc',
    'PremisDesc',
]
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])
preprocessor

In [11]:
# Chain the preprocessing step and an XGBoost classifier (default
# hyperparameters) into a single estimator.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier()),
])

# Train on the training split, then print the pipeline's score on the
# validation split, formatted to three decimals.
clf.fit(X_train, y_train)
print(
    'model score: %.3f'
    % clf.score(X_valid, y_valid)
)

model score: 0.513
