In [65]:
import pandas as pd
import numpy as np
import pickle
import re

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from scipy.cluster.hierarchy import dendrogram

from sentence_transformers import SentenceTransformer

from umap import UMAP
import plotly.express as px

In [2]:
def import_data():
    X_train = pd.read_csv('./train/X_train.csv')
    X_test = pd.read_csv('./test/X_test.csv')
    y_train = pd.read_csv('./train/y_train.csv')
    y_test = pd.read_csv('./test/y_test.csv')
    return X_train, X_test, y_train, y_test

In [3]:
X_train, X_test, y_train, y_test = import_data()

In [4]:
charges = X_train['c_charge_desc'].append(X_test['c_charge_desc']).dropna().unique()

In [5]:
charges

array(['arrest case no charge', 'Grand Theft in the 3rd Degree',
       'Felony Driving While Lic Suspd', 'Robbery W/Firearm', 'Battery',
       'Possession of Hydrocodone', 'Burglary With Assault/battery',
       'Possession of Cocaine', 'Possess Cannabis/20 Grams Or Less',
       'Susp Drivers Lic 1st Offense', 'Uttering Forged Credit Card',
       'Aggrav Battery w/Deadly Weapon', 'Purchase Of Cocaine',
       'Agg Battery Grt/Bod/Harm', 'Burglary Unoccupied Dwelling',
       'Aggravated Assault w/Firearm', 'Disorderly Intoxication',
       'Possession Of Amphetamine', 'Manufacture Cannabis',
       'DUI- Enhanced', 'Opert With Susp DL 2nd Offens',
       'Possession Of Alprazolam', 'Deliver Cannabis', 'Child Abuse',
       'Grand Theft (Motor Vehicle)', 'Retail Theft $300 1st Offense',
       'Driving Under The Influence', 'Leave Acc/Attend Veh/More $50',
       'Felony Petit Theft', 'Burglary Dwelling Occupied',
       'Aggravated Assault W/Dead Weap', 'Possession of Cannabis',
  

In [94]:
charges_clean=[(f'{i} ').lower()
.replace('w/o','without ')
.replace('/w','with ')
.replace('w/','with ')               
.replace('/',' ')
.replace('dl','drivers license')
.replace('driv ','drivers ')                
.replace(' lic ',' license ')               
.replace('priv ','private ')
.replace('deg ','degree ')
.replace('viol ','violence ')
.replace('veh ','vehicle ')
.replace('first','1st')
.replace('poss ','possession ')
.replace('bugl ','buglary ')
.replace('burgl ','buglary ')               
.replace('suspd','suspended')
.replace('aggrav ','aggravated ')               
.replace('agg ','aggravated ')               
.replace('susp ','suspended ')               
.replace('acc ','accident ')               
.replace('batt ','battery ') 
.replace('dang ','dangerous ')
.replace('weap ','weapon ')
.replace('persnl','personal')
.replace('inj ','injury ')
.replace('substa ','substance ')
.replace('int','intent ')
.replace('del ','deal ')
.replace('felo ','felon ')
.replace('sch ','school ')
.replace('consp ','conspiracy ')
.replace('traff ','traffick ')
.replace('>',' > ')               
.replace('<',' < ')                               
.replace('d.u.i.','driving under influence')                
.replace('dui','driving under influence')
.replace('d u i','driving under influence')  
.strip() for i in charges]

charges_clean = [re.sub(r'(?<=\d)g', ' grams ', i) for i in charges_clean]
charges_clean = [re.sub(r'(?<=\d)ft', ' ft ', i) for i in charges_clean]
charges_clean = [re.sub(r'(?<=\d)yr', ' years ', i) for i in charges_clean]

In [95]:
charges_clean

['arrest case no charge',
 'grand theft in the 3rd degree',
 'felony driving while license suspended',
 'robbery with firearm',
 'battery',
 'possession of hydrocodone',
 'burglary with assault battery',
 'possession of cocaine',
 'possess cannabis 20 grams or less',
 'suspended drivers license 1st offense',
 'uttering forged credit card',
 'aggravated battery with deadrivers licensey weapon',
 'purchase of cocaine',
 'aggravated battery grt bod harm',
 'burglary unoccupied dwelling',
 'aggravated assault with firearm',
 'disorderly intent oxication',
 'possession of amphetamine',
 'manufacture cannabis',
 'driving under influence- enhanced',
 'opert with suspended drivers license 2nd offens',
 'possession of alprazolam',
 'deliver cannabis',
 'child abuse',
 'grand theft (motor vehicle)',
 'retail theft $300 1st offense',
 'driving under the influence',
 'leave accident attend vehicle more $50',
 'felony petit theft',
 'burglary dwelling occupied',
 'aggravated assault with dead weapo

In [96]:
robert = SentenceTransformer('stsb-roberta-large') 

In [97]:
embeddings = pd.DataFrame(np.vstack(np.array([robert.encode(i) for i in charges_clean])))

In [98]:
def cluster_plot(model):
    
    clust = model
    clust.fit(embeddings)
    
    umap = UMAP(metric='cosine',random_state=42)
    umap.fit(embeddings)
    
    data = pd.DataFrame(umap.transform(embeddings),columns=["x","y"]).assign(text=charges).assign(color=clust.labels_)
    
    fig = px.scatter(data,x='x',y='y',hover_data={'b':data['text']},color=data['color'])
    fig.update_layout(margin=dict(l=0,r=0,b=0,t=0))
    fig.update_layout(hoverlabel={'font_size':12,'bgcolor':'black'})
    fig.update_traces(marker={'size':4})
    
    return model,fig

In [99]:
def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack([model.children_, model.distances_,
                                      counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

In [105]:
model = KMeans(n_clusters=18)
kme,fig = cluster_plot(model)
fig

In [106]:
#model = DBSCAN(metric='cosine',eps=.3,min_samples=3)
#dbs,fig = cluster_plot(model)
#fig

In [107]:
#model = AgglomerativeClustering(n_clusters=20,affinity='cosine',linkage='complete')
#agg,fig = cluster_plot(model)
#fig

In [108]:
charge_map = {i:j for i,j in zip(charges,kme.labels_)}
charge_map['arrest case no charge']=-1
pickle.dump(charge_map, open('charge_map_v3.pkl', 'wb')) 

In [109]:
X_train['c_charge_desc'].map(charge_map).value_counts()

 11.0    790
-1.0     620
 4.0     327
 16.0    313
 15.0    313
 5.0     304
 3.0     273
 14.0    143
 2.0     121
 8.0      99
 0.0      82
 6.0      82
 9.0      75
 13.0     68
 7.0      64
 17.0     34
 1.0      29
 10.0     25
 12.0     20
Name: c_charge_desc, dtype: int64