### Import required libraries

In [1]:
# start with common imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Read in the data

In [2]:
data = pd.read_csv('../data/homicide.csv')

data.shape

(1542, 13)

### Natural Language Processing
We will execute a few techniques with the text data to make it ready for the model. We will use the following techniques:
- Tokenization - converting text into tokens
- Removing Stopwords - removing common words that will likely appear in any text
- Lemmatization - converting words to their base form
- Stemming - reducing words to their root form
- n-grams - grouping words together (for example, instead of having "good" and "movie" as separate tokens, we can have "good movie" as one token)

### First we will create custom transformers to perform these tasks

### Split the data into training, validation and test sets

In [3]:
from fast_ml.model_development import train_valid_test_split

# Split the data
X = data.drop('target', axis=1)
y = data['target']

# define the train, validation and test size
train_size = 0.7
valid_size = 0.1
test_size = 0.2

X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(data, target = 'target', 
                                                                            train_size=train_size, valid_size=valid_size, test_size=test_size)

# check the shape of the data
print('Training set shape: ', X_train.shape)
print('Validation set shape: ', X_valid.shape)
print('Test set shape: ', X_test.shape)

Training set shape:  (1079, 12)
Validation set shape:  (154, 12)
Test set shape:  (309, 12)


### Encode the target variable

In [4]:
from sklearn.preprocessing import LabelEncoder
# encode the target variable

# instantiate the label encoder
le = LabelEncoder()

# fit and transform the target variable
y_train = le.fit_transform(y_train)
y_valid = le.transform(y_valid)
y_test = le.transform(y_test)

### Create ML Pipelines

### Create custom transformers for the text data

In [5]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class NltkPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def fit(self, X, y=None):
        return self

    def tokenize_and_stem(self, text):
        tokens = word_tokenize(text)
        tokens = [token for token in tokens if token not in self.stop_words]
        stems = [self.stemmer.stem(token) for token in tokens]
        return ' '.join(stems)

    def transform(self, X, y=None):
        # Convert X to a pandas Series if it's a numpy array
        if isinstance(X, np.ndarray):
            X = pd.Series(X.flatten())

        return X.apply(self.tokenize_and_stem)

In [6]:
# import required libraries
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier


# define the categorical preprocessor
cat = ['AREANAME', 'VictSex', 'TIME_OF_DAY','VictDescent']
cat_transforms = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]

)

# define the numeric preprocessor
num = ['YEAR', 'MONTH', 'DAY', 'HOUR','REPORTING_DELAY']
num_transforms = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', StandardScaler())
    ]
)

# define the text preprocessor
txt1 = ['PremisDesc']
txt1_transforms = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='')),
        ('preprocessor', NltkPreprocessor()),
        ('vectorizer', CountVectorizer(max_features=200,
                                        min_df=0.001
                                        ))
    ]
)

txt2 = ['WeaponDesc']
txt2_transforms = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='')),
        ('preprocessor', NltkPreprocessor()),
        ('vectorizer', CountVectorizer(max_features=200,
                                        min_df=0.001
                                        ))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transforms, num),
        ('cat', cat_transforms, cat),
        ('txt1', txt1_transforms, ['PremisDesc']),
        ('txt2', txt2_transforms, ['WeaponDesc'])
    ]
)

# Now you can fit the preprocessor
preprocessor.fit(X_train)

### Fit the model

In [7]:
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(n_estimators=100, random_state=0))
    ]
)
# find the best parameters
from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [3, 5, 7]
}

grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=1, verbose=3)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END classifier__max_depth=3, classifier__n_estimators=100;, score=0.565 total time=   0.2s
[CV 2/5] END classifier__max_depth=3, classifier__n_estimators=100;, score=0.597 total time=   0.3s
[CV 3/5] END classifier__max_depth=3, classifier__n_estimators=100;, score=0.556 total time=   0.2s
[CV 4/5] END classifier__max_depth=3, classifier__n_estimators=100;, score=0.593 total time=   0.2s
[CV 5/5] END classifier__max_depth=3, classifier__n_estimators=100;, score=0.623 total time=   0.3s
[CV 1/5] END classifier__max_depth=3, classifier__n_estimators=200;, score=0.597 total time=   0.3s
[CV 2/5] END classifier__max_depth=3, classifier__n_estimators=200;, score=0.611 total time=   0.2s
[CV 3/5] END classifier__max_depth=3, classifier__n_estimators=200;, score=0.542 total time=   0.2s
[CV 4/5] END classifier__max_depth=3, classifier__n_estimators=200;, score=0.583 total time=   0.2s
[CV 5/5] END classifier__max_depth=3, cl

In [8]:
# fit the model with the best parameters
clf.set_params(**grid_search.best_params_)

clf.fit(X_train, y_train)

# predict the target variable
y_train_pred = clf.predict(X_train)
y_valid_pred = clf.predict(X_valid)
y_test_pred = clf.predict(X_test)

In [9]:
# evaluate the model
from sklearn.metrics import accuracy_score

display('Training accuracy: %.2F' %  accuracy_score(y_train, y_train_pred))
display('Validation accuracy: %.2F' %  accuracy_score(y_valid, y_valid_pred))
display('Test accuracy: %.2F' %  accuracy_score(y_test, y_test_pred))

'Training accuracy: 0.91'

'Validation accuracy: 0.54'

'Test accuracy: 0.56'

In [10]:
# get precision, recall and f1 score
from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.45      0.40      0.42       125
           1       0.62      0.66      0.64       184

    accuracy                           0.56       309
   macro avg       0.53      0.53      0.53       309
weighted avg       0.55      0.56      0.55       309

