## Import Libraries

In [1]:
import pandas as pd
import regex as re
import numpy as np
import logging

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import mplcyberpunk
%config InlineBackend.figure_formats = ['svg']

# ML Modeling
from sklearn.metrics import precision_recall_fscore_support,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import RocCurveDisplay,roc_curve
from sklearn.preprocessing import normalize
# Saving and importing trained models
import pickle

Read through Cleaned Dataset

In [2]:
df = pd.read_csv("data/processed-dataset.csv")

hispanic_filter = df['race'] == "HISPANIC/LATIN/MEXICAN"
hispanic_df = df[hispanic_filter].sort_values(by = ['arrest_date', 'arrest_time'], ascending= [True, True])
hispanic_df.head(10)

  df = pd.read_csv("data/processed-dataset.csv")


Unnamed: 0,arrest_unique_id,incident_number,pin,age,sex,race,ethnicity,arrest_date,arrest_time,location_of_arrest_in_block,arrest_reason,arrest_type,summary_of_facts,arrest_officer,officer_name,current_status,young_offender
56511,SJ201519,SJ2015150010241,441762891,19.0,M,HISPANIC/LATIN/MEXICAN,HISPANIC/LATIN/MEXICAN,01/01/15,415.0,2900 BLOCK OCALA CT,CRIMINAL CITATION,TAKEN INTO CUSTODY/WARRANT ARREST ONLY,TRESPASSING (CITE),4222,MACIAS,CITED,N
56522,SJ201519,SJ2015J3347648 C,441762891,19.0,M,HISPANIC/LATIN/MEXICAN,HISPANIC/LATIN/MEXICAN,01/01/15,415.0,2900 BLOCK OCALA CT,CRIMINAL CITATION,TAKEN INTO CUSTODY/WARRANT ARREST ONLY,TRESPASSING (CITE),4222,MACIAS,CITED,N
51970,SJ20157,SJ2015150010272,441776037,23.0,F,HISPANIC/LATIN/MEXICAN,HISPANIC/LATIN/MEXICAN,01/01/15,530.0,5100 BLOCK MONTEREY RD,ON VIEW,ON VIEW ARREST,INFLICT CORP INJ ON SPOUSE/COHAB (ONVW),4348,LEE,CHARGED/BOOKED,N
94333,SJ2015190,SJ2015150010482,441915772,33.0,F,HISPANIC/LATIN/MEXICAN,HISPANIC/LATIN/MEXICAN,01/01/15,1400.0,2200 BLOCK TULLY RD,CRIMINAL CITATION,TAKEN INTO CUSTODY/WARRANT ARREST ONLY,PETTY THEFT OF PERSONAL PROPERTY/LABOR/EMBEZZL...,3153,TOMPKINS,CITED,N
94339,SJ2015190,SJ2015J3337366 C,441915772,33.0,F,HISPANIC/LATIN/MEXICAN,HISPANIC/LATIN/MEXICAN,01/01/15,1400.0,2200 BLOCK TULLY RD,CRIMINAL CITATION,TAKEN INTO CUSTODY/WARRANT ARREST ONLY,PETTY THEFT OF PERSONAL PROPERTY/LABOR/EMBEZZL...,3153,TOMPKINS,CITED,N
3262,SJ2015337,SJ2015J3354478 C,441916123,23.0,M,HISPANIC/LATIN/MEXICAN,HISPANIC/LATIN/MEXICAN,01/01/15,1605.0,CURTNER AV / 87,CRIMINAL CITATION,TAKEN INTO CUSTODY/WARRANT ARREST ONLY,FAILING TO PROVIDE EVIDENCE OF FINANCIAL RESPO...,4280,ARANA,CITED,N
96726,SJ201529,SJ2015150010655,441915179,22.0,M,HISPANIC/LATIN/MEXICAN,HISPANIC/LATIN/MEXICAN,01/01/15,1655.0,S 1ST ST / W ALMA AV,ON VIEW,ON VIEW ARREST,"RESISTING, DELAYING, OBSTRUCTING AN OFFICER (O...",4245,VALOSEK,CHARGED/BOOKED,N
14755,SJ201525,SJ2015150010664,51315201,59.0,M,HISPANIC/LATIN/MEXICAN,HISPANIC/LATIN/MEXICAN,01/01/15,1756.0,MERIDIAN AV / DOUGLAS ST,LOCAL BENCH WARRANT,TAKEN INTO CUSTODY/WARRANT ARREST ONLY,POSSESSION OF CONTROLLED SUBSTANCE PARAPHERNAL...,4152,BYERS,CHARGED/BOOKED,N
91122,SJ201530,SJ2015150010664,441729075,59.0,M,HISPANIC/LATIN/MEXICAN,HISPANIC/LATIN/MEXICAN,01/01/15,1756.0,300 BLOCK MERIDIAN AV,ON VIEW,ON VIEW ARREST,POSSESSION CONTROLLED SUBSTANCE (ONVW),3773,SOLOMON,CHARGED/BOOKED,N
65769,SJ201514,SJ2015150010728,441761283,23.0,M,HISPANIC/LATIN/MEXICAN,HISPANIC/LATIN/MEXICAN,01/01/15,1831.0,5600 BLOCK COTTLE RD,ON VIEW,ON VIEW ARREST,ROBBERY (ONVW),4240,MORGAN II,CHARGED/BOOKED,N


## 3. Process the Data for Classification

Some of the fields will be useful to use for the classification task.

1. `location_of_arrests_in_block`: Since we're writing a model to predict hotspots, this column is integral to our goal.
2. `summary_of_facts`: This column includes a short descrtiption of the arrest, which will provide potentially helpful contextual information to make a better model for our race prediction. We'll see!
3. `race`: This column includes information too! Different races of people often capture how demographics are in a certain area. So, the race may also provide potentially helpful contextual information to make a better model. Again, we'll see!

The code below creates 3 new columns for that task.

There's a lot to unpack below, but it basically

2. it creates 3 new columns that are combined with the available data.

In [3]:
#just the description
df['arrest_desc'] = df['summary_of_facts']

#description + location_of_arrest_in_block
df['arrest_desc_location'] = df['summary_of_facts'] + ' '+ df['location_of_arrest_in_block']

#description + location_of_arrest_in_block + race
df['arrest_desc_location_race'] = df['summary_of_facts'] + ' '+ df['location_of_arrest_in_block']+" " + df['race']

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153434 entries, 0 to 153433
Data columns (total 20 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   arrest_unique_id             153434 non-null  object
 1   incident_number              153434 non-null  object
 2   pin                          153434 non-null  int64 
 3   age                          153434 non-null  object
 4   sex                          153434 non-null  object
 5   race                         153434 non-null  object
 6   ethnicity                    153434 non-null  object
 7   arrest_date                  153434 non-null  object
 8   arrest_time                  153434 non-null  object
 9   location_of_arrest_in_block  153434 non-null  object
 10  arrest_reason                153434 non-null  object
 11  arrest_type                  153434 non-null  object
 12  summary_of_facts             153434 non-null  object
 13  arrest_officer

## 4. Train a Logistic Regression Model

### 4.3 Modeling functions

The functions below help us create a systematic and reproducable workflow to train the data.

Be sure to check out my videos that walk through an overview of what they do.

In [4]:
def _reciprocal_rank(true_genre_labels: list, machine_predicted_genre_labels: list):
    '''
    ## Purpose
    Compute the reciprocal rank at cutoff k

    ## Parameters
        - `true_genre_labels` (List): List of actual news genre labels
        - `machine_predicted_genre_labels` (List): List of news genre labels predicted by the LR algorithm
    
    ## Return Values
        - `recip_rank` (Float): Reciprocal rank
    '''
    
    # add index to list only if machine predicted label exists in true labels
    tp_pos_list = [(idx + 1) for idx, r in enumerate(machine_predicted_genre_labels) if r in true_genre_labels]

    recip_rank = 0
    if len(tp_pos_list) > 0:
        # for reciprocal rank we must find the position of the first **correctly labeled** item
        first_pos_list = tp_pos_list[0]
        
        # recip_rank = 1/rank
        recip_rank = 1 / float(first_pos_list)

    return recip_rank

def compute_mrr_at_k(eval_news_category_items:list):
    '''
    ## Purpose
    `compute_mrr_at_k()`: Computes the MRR (average RR) at cutoff k. In sum, it takes the mean average of all of the reciprocal rank scores among the actual vs. predicted labels. Review this ["Mean reciprocal rank" wikipedia article](https://en.wikipedia.org/wiki/Mean_reciprocal_rank) for a simple explainer.
    ## Parameters
    - `eval_news_category_items` (List): List that contains 2 values
        1. String - Actual news genre category
        2. List of strings - Predicted news genre category in order by estimated probability to be returned by the model.
            - The example below shows how 
                - `'HEALTHY LIVING'` was the actual label, but it was third in 'reciprocal rank' with a value of 1/3
                - `'WORLDPOST'` was the actual label, and it was first in 'reciprocal rank' with a value of 1
                
                [
                    [
                        ['HEALTHY LIVING'], ['POLITICS', 'ENTERTAINMENT', 'HEALTHY LIVING']
                    ], 
                    [
                        ['WORLDPOST'], ['WORLDPOST', 'MEDIA', 'POLITICS']
                    ], 
                    ...
                ]

    ## Return Values
        - `mean_reciprocal_rank_score` (Float): Mean average reciprocal rank score among the predicted news category in the model
    '''
    rr_total = 0
    
    for item in eval_news_category_items:
        actual_label = item[0]
        pred_label_list = item[1]

        # Find the reciprocal rank (RR) for this row
        rr_at_k = _reciprocal_rank(actual_label, pred_label_list)

        # Add the row's RR to the accruing scores for the entire corpus
        rr_total = rr_total + rr_at_k

        # Update the Mean Reciprocal Rank (MRR) score with new row value
        mean_reciprocal_rank_score = rr_total / 1/float(len(eval_news_category_items))

    return mean_reciprocal_rank_score

def collect_preds(Y_test, Y_preds):
    '''
    ## Purpose
    Collect all predictions (predicted news genre labels) and ground truth (i.e., actual news genre label)
    '''
    pred_gold_list = [ [ [Y_test[index]], pred ] for index, pred in enumerate(Y_preds) ]
    return pred_gold_list
             
def compute_accuracy(eval_news_category_items:list):
    '''
    ## Purpose
    `compute_accuracy()`: Compute the overall accuracy score of the model across the training corpus

    ## Parameters
        - `eval_news_category_items` (List): List that contains 2 values
            1. String - Actual news genre category
            2. List of strings - Predicted news genre category

            Example: [
                [
                    ['HEALTHY LIVING'], ['POLITICS', 'ENTERTAINMENT', 'HEALTHY LIVING']
                ], 
                [
                    ['WORLDPOST'], ['WORLDPOST', 'MEDIA', 'POLITICS']
                ], 
                ...
            ]
    ## Return Values
        - `news_cat_prediction_accuracy` (Float): Percentage of accurately predicted news category in the model
    '''
    correct_news_cat = 0
    
    for news_genre_cat in eval_news_category_items:
        true_pred = news_genre_cat[0]
        machine_pred = set(news_genre_cat[1])
        
        for news_cat in true_pred:
            if news_cat in machine_pred:
                correct_news_cat += 1
                break
    
    news_cat_prediction_accuracy = correct_news_cat / float(len(eval_news_category_items))
    return news_cat_prediction_accuracy



In [5]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def extract_features(df, field, training_data, testing_data, type='binary'):
    '''
    ## Purpose
    `extract_features()`: Extract features using different method types: binary, counts, or TF-IDF

    ### If BINARY Features
    Creates a new `CountVectorizer()` method object, which converts a collection of text documents to a matrix of binary token counts per document. In other words, 
    - `1` == the feature is represented in the document
    - `0` == the feature is not represented in the doc
    
    Logistic regression involves vectorizing weighted averages of these tokens.

    ### If COUNT Features
    Creates a new `CountVectorizer()` method object, which converts a collection of text documents to a matrix of `n` token counts per document.  In other words, 
    - `5` == the feature is represented 5 times in the document
    - `25` == the feature is represented 25 times in the document
    - `0` == the feature is not represented in the doc
    
    Logistic regression involves vectorizing weighted averages of these tokens.

    ### If TF-IDF Features
    Creates a new `CountVectorizer()` method object, which converts a collection of text documents to a matrix of `n` token counts per document.  In other words, 
    - `5` == the feature is represented 5 times in the document
    - `25` == the feature is represented 25 times in the document
    - `0` == the feature is not represented in the doc
    
    Logistic regression involves vectorizing weighted averages of these tokens.
    '''
    
    logging.info("Extracting features and creating vocabulary...")

    '''
        BINARY and COUNTS PROCESSES WILL DO THE FOLLOWING:

        sklearn's CountVectorizer() will convert text to numerical data.
    '''
    
    if 'binary' in type:
        
        # BINARY FEATURE REPRESENTATION
        # Creates a new CountVectorizer() method object, which can help us use built-in functions that convert a collection of text documents to a matrix of token counts. **REMEMBER** that logistic regression involves vectorizing weighted averages of these tokens.
        # NOTE: `max_df` == "Maximum Document Frequency. It enables us to programmatically ignore frequently occuring words, e.g., articles like 'a' or 'the'. `max_df` reviews how many documents contain the word, and if it exceeds the max_df threshold then it is eliminated from the sparse matrix. Below we set the threshold to 95%.
        cv = CountVectorizer(binary=True, max_df=0.95)
        # CountVectorizer()'s fit_transform() uses the training_data to learn the vocabulary dictionary and return document-term matrix.
        cv.fit_transform(training_data[field].values)
        # CountVectorizer()'s transform() 
        train_feature_set = cv.transform(training_data[field].values)
        test_feature_set = cv.transform(testing_data[field].values)
        
        return train_feature_set,test_feature_set,cv
  
    elif 'counts' in type:
        
        # COUNT BASED FEATURE REPRESENTATION
        cv = CountVectorizer(binary=False, max_df=0.95)
        cv.fit_transform(training_data[field].values)
        
        train_feature_set = cv.transform(training_data[field].values)
        test_feature_set = cv.transform(testing_data[field].values)
        
        return train_feature_set,test_feature_set,cv
    
    elif 'tfidf':    
        
        # TF-IDF BASED FEATURE REPRESENTATION
        tfidf_vectorizer=TfidfVectorizer(use_idf=True, max_df=0.95)
        tfidf_vectorizer.fit_transform(training_data[field].values)
        
        train_feature_set=tfidf_vectorizer.transform(training_data[field].values)
        test_feature_set=tfidf_vectorizer.transform(testing_data[field].values)
        
        return train_feature_set,test_feature_set,tfidf_vectorizer

def get_top_k_predictions(model, X_test, k, threshold=False):
    '''
    ## Purpose
    `get_top_k_predictions()`: Uses the input trained LogisticRegression model to return the news genre class/category with the top estimated probability score.
    ## Parameters
    - `model` (LogisticRegression()): Trained model scikit-learn object
    - `X_test` (pandas DataFrame): Sampled test data set returned by `training_test_split()` in the `training_model()` function
    - `k` (Integer): Number of top categories (news genres) to return based on the estimated probability to predict the news genre
    ## Return Value(s)
    - `preds` (List of list): A list within a list of the top k retruned news categories. For example:
        - `preds` is `[['SCIENCE', 'HEALTHY LIVING', 'GREEN']]` for an article with the headline of `"Exercise in space keeps astronauts from fainting when they return to Earth, study says"` and `k=3`
    '''
    if threshold == False:
        # get probabilities instead of predicted labels, since we want to collect top 3
        probs = model.predict_proba(X_test)

        # GET TOP K PREDICTIONS BY PROB - note these are just index
        best_n = np.argsort(probs, axis=1)[:,-k:]
        
        # GET CATEGORY OF PREDICTIONS
        preds = [[model.classes_[predicted_cat] for predicted_cat in prediction] for prediction in best_n]
        
        preds = [ item[::-1] for item in preds]
    
        return preds
    else:
        # get probabilities instead of predicted labels, since we want to collect top 3
        probs = (model.predict_proba(X_test)[:,1] >= thresh_val)

        # GET TOP K PREDICTIONS BY PROB - note these are just index
        best_n = np.argsort(probs, axis=1)[:,-k:]
        
        # GET CATEGORY OF PREDICTIONS
        preds = [[model.classes_[predicted_cat] for predicted_cat in prediction] for prediction in best_n]
        
        preds = [ item[::-1] for item in preds]
    
        return preds
   
def train_model(df, field="arrest_desc", feature_rep="binary", top_k=3):
    '''
    ## Purpose
    train_model() is the main controller function that conducts the following modeling procedure: 
        
    1. Create X data (List) by splitting the data to create two sampled sets: 1) for training, and 2) for testing.
    2. Create Y data (List) by assigning the actual (ground truth) labels
    3. Extract the features for the model to use, based on the chosen feature representation: binary vs. TF-IDF
    4. Fit, i.e., train, the logistic regression classifier model with scikit-learn's `LogisticRegression()` object
    5. Retrieve the evaluation items, e.g., the actual labels (ground truths) and predicted labels (list of top `k` number of estimated probable predicted categories)
    6. Use the evaluation iitems to compute the overall accuracy score and mean reciprocal rank score of the model

    ## Parameters
    - `df` (pandas DataFrame): the complete data set / corpus
    - `field` (String): the column name of the feature used to train the model
    - `feature_rep` (String): Type of LR analysis set as either "binary" or "count" or "tfidf"
    '''
    
    logging.info("Starting model training...")
    
    # 1. GET A TRAIN TEST SPLIT (set seed for consistent results)
    # train_test_split() from sklearn "splits arrays or matrices into random train and test subsets."
    # returns 2 new dataframes: one for training, another for testing the trained model
    y = df['location_of_arrest_in_block']
    x_training_data,x_testing_data = train_test_split(
        df,
        random_state=2000 #Controls the shuffling applied to the data before applying the split
    )

    # 2. GET LABELS FROM SPLIT DATA
    # Get the category values from each split data returned by #1
    Y_train = x_training_data['location_of_arrest_in_block'].values
    Y_test = x_testing_data['location_of_arrest_in_block'].values
     
    # 3. GET FEATURES
    X_train,X_test,feature_transformer = extract_features(
        df,
        field,
        x_training_data,
        x_testing_data,
        type=feature_rep
    )

    # INITIALIZE THE LOGISTIC REGRESSION CLASSIFIER OBJECT
    logging.info("Training a Logistic Regression Model. This may take a few minutes. ...")
    scikit_log_reg = LogisticRegression(
        verbose=1, #if you want the LR method to print out all the details, change this 0 to 1
        solver='liblinear',
        random_state=0,
        C=5,
        penalty='l2',
        max_iter=1000
    )
    # Create the model by providing the LR object the 
    model = scikit_log_reg.fit(X_train, Y_train)

    # GET TOP K PREDICTIONS
    preds = get_top_k_predictions(model, X_test, top_k)
    
    # GET PREDICTED VALUES AND GROUND TRUTH INTO A LIST OF LISTS - for ease of evaluation
    eval_items = collect_preds(Y_test, preds)
    
    # GET EVALUATION NUMBERS ON TEST SET -- HOW DID WE DO?
    logging.info("Starting evaluation...")
    simple_mean_avg_correct_prediction_accuracy = compute_accuracy(eval_items)
    mean_recip_rank_at_k = compute_mrr_at_k(eval_items)
    
    logging.info("Done training and evaluation.")

    # Return the herein computed model and other values for potential use and exploration
    return model,feature_transformer,simple_mean_avg_correct_prediction_accuracy,mean_recip_rank_at_k,X_train,X_test,Y_test,Y_train,preds,eval_items


### 4.4 LR Model 1 - Binary or Count features with `arrest_desc` only

#### 4.4.1 Enact the Training

In [6]:
'''
  Parameters to configure for our train_model() function
'''

# Use the short description only to train a model
training_field = 'arrest_desc_location'
# Specify if this model should use a binary approach to the features (0 or 1) or the actual counts created by CountVectorizer()
feature_rep = 'binary'
# Tell the model function to return the top 3 'best fits' among the distributed probabilities
top_k = 3

# Train that supervised ML logistic regression model!
"""model_td_only,transformer_td_only,accuracy_td_only,mrr_at_k_td_only,X_train,X_test,Y_test,Y_train,preds,eval_items = train_model(
  df, # full corpus
  field=training_field,
  feature_rep=feature_rep,
  top_k=top_k
)"""
binary_model_st,binary_transformer_st,binary_accuracy_st,binary_mrr_at_k_st,binary_X_train,binary_X_test,binary_Y_test,binary_Y_train,binary_preds,binary_eval_items = train_model(
  df, # full corpus
  field=training_field,
  feature_rep=feature_rep,
  top_k=top_k
)

2023-12-12 23:46:13,482 : INFO : Starting model training...
2023-12-12 23:46:13,511 : INFO : Extracting features and creating vocabulary...
2023-12-12 23:46:14,434 : INFO : Training a Logistic Regression Model. This may take a few minutes. ...


[LibLinear]iter  1 act 3.197e+05 pre 2.812e+05 delta 1.575e+00 f 3.988e+05 |g| 4.138e+05 CG   1
iter  2 act 5.174e+04 pre 4.140e+04 delta 1.575e+00 f 7.907e+04 |g| 1.018e+05 CG   1
iter  3 act 1.737e+04 pre 1.378e+04 delta 1.575e+00 f 2.733e+04 |g| 3.602e+04 CG   1
iter  4 act 6.250e+03 pre 4.947e+03 delta 1.575e+00 f 9.957e+03 |g| 1.312e+04 CG   1
iter  5 act 2.295e+03 pre 1.815e+03 delta 1.575e+00 f 3.707e+03 |g| 4.820e+03 CG   1
iter  6 act 8.463e+02 pre 6.696e+02 delta 1.575e+00 f 1.412e+03 |g| 1.775e+03 CG   1
iter  7 act 3.092e+02 pre 2.452e+02 delta 1.575e+00 f 5.654e+02 |g| 6.520e+02 CG   1
iter  8 act 1.095e+02 pre 8.734e+01 delta 1.575e+00 f 2.562e+02 |g| 2.375e+02 CG   1
cg reaches trust region boundary
iter  9 act 4.862e+01 pre 3.898e+01 delta 2.186e+00 f 1.466e+02 |g| 8.473e+01 CG   2
cg reaches trust region boundary
iter 10 act 2.438e+01 pre 2.076e+01 delta 3.037e+00 f 9.802e+01 |g| 3.384e+01 CG   3
iter 11 act 6.788e+00 pre 8.051e+00 delta 3.037e+00 f 7.364e+01 |g| 1.524

#### 4.4.2 Test the accuracy/performance of the model

##### 4.4.2.1 See the accuracy and Mean Reciprocal Rank Scores

In [None]:
print(f"Overall Mean Average Model Accuracy = {binary_accuracy_st}\nMean Reciprocal Rank = {binary_mrr_at_k_st}")