In [24]:
from re import sub
import pandas as pd
from nltk import word_tokenize
from gensim import corpora, models
from wordcloud import WordCloud, STOPWORDS 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
pd.options.display.max_rows = 1000

In [3]:
data = pd.read_csv('/Users/wasilaq/pbf-analysis/Data/0c_distinct_dockets.csv')
data.head()

Unnamed: 0,id,age,address,docket_number,filing_date,charge,represented_by,bail_type,bail_status,bail_amount,outstanding_bail_amount
0,3909,27.0,"Philadelphia, PA 19141",MC-51-CR-0011746-2020,2020-06-16T00:37:00Z,DUI: Gen Imp/Inc of Driving Safely - 1st Off,Defender Association of Philadelphia,Posted,ROR,0,0
1,4538,44.0,"Philadelphia, PA 19124",MC-51-CR-0011747-2020,2020-06-16T00:41:00Z,Verify Address or Photographed as Required,Defender Association of Philadelphia,Set,Monetary,50000,0
2,120,24.0,"Philadelphia, PA 19142",MC-51-CR-0011743-2020,2020-06-16T00:52:00Z,Criminal Mischief,Defender Association of Philadelphia,Posted,ROR,0,0
3,120,24.0,"Philadelphia, PA 19142",MC-51-CR-0011744-2020,2020-06-16T00:52:00Z,Criminal Mischief,Defender Association of Philadelphia,Posted,ROR,0,0
4,120,24.0,"Philadelphia, PA 19142",MC-51-CR-0011745-2020,2020-06-16T00:52:00Z,Criminal Mischief,Defender Association of Philadelphia,Posted,ROR,0,0


## Explore Charges

In [4]:
data.charge.nunique()

136

In [5]:
data.charge.value_counts()

Manufacture, Delivery, or Possession With Intent to Manufacture or Deliver                    836
Burglary - Not Adapted for Overnight Accommodation, No Person Present                         580
Aggravated Assault - Attempts to cause SBI or causes injury with extreme indifference         537
DUI: Gen Imp/Inc of Driving Safely - 1st Off                                                  344
Simple Assault                                                                                258
Firearms Not To Be Carried W/O License                                                        238
Theft By Unlaw Taking-Movable Prop                                                            236
Contempt For Violation of Order or Agreement                                                  200
Int Poss Contr Subst By Per Not Reg                                                           200
Burglary - Overnight Accommodations Person Present, Bodily Injury Crime                       199
Poss Instrument Of C

### Top Charges

"Manufacture, Delivery, or Possession With Intent to Manufacture or Deliver"
<br> **drug dealing**
<br> http://www.phillybestdefense.com/possession-with-intent-to-deliver-manufacturing

"Burglary - Not Adapted for Overnight Accommodation, No Person Present"
<br> **burglary**
<br>https://www.pittsburghcriminalattorney.com/theft/burglary/ ('*However, if you were caught in a building where no one was present and that was not adapted for overnight accommodation, you can be charged with a felony of the second degree.*')

"Aggravated Assault - Attempts to cause SBI or causes injury with extreme indifference"
<br> **assault**
<br> https://www.legis.state.pa.us/cfdocs/legis/LI/consCheck.cfm?txtType=HTM&ttl=18&div=0&chpt=27&sctn=2&subsctn=0

"DUI: Gen Imp/Inc of Driving Safely - 1st Off"
<br> **DUI 1st offense**

"Simple Assault"
<br> **assault**
<br>https://www.legis.state.pa.us/cfdocs/legis/LI/consCheck.cfm?txtType=HTM&ttl=18&div=0&chpt=27&sctn=1&subsctn=0

## Charges Topic Modeling

potentially use category and/or severity of charge as a feature

https://www.kdnuggets.com/2019/09/overview-topics-extraction-python-latent-dirichlet-allocation.html

Supervised: https://www.freecodecamp.org/news/how-we-changed-unsupervised-lda-to-semi-supervised-guidedlda-e36a95f3a164/

CorEx: https://gist.github.com/patrickvankessel/0d5bd690910edece831dbdf32fb2fb2d

In [6]:
charges = data.charge.unique()

In [7]:
def clean_text(text):
    '''
    remove punctuation and stop words, lowercase
    '''
    no_line_breaks = text.replace('\n','')
    lowercase = no_line_breaks.lower()
    no_punc = sub(r'[^\w\s]', '', lowercase)
    final = no_punc
    
    return final

In [8]:
charges = [clean_text(charge) for charge in charges]

In [17]:
charges_corpus = [word_tokenize(charge) for charge in charges]

charges_corpus

[['dui', 'gen', 'impinc', 'of', 'driving', 'safely', '1st', 'off'],
 ['verify', 'address', 'or', 'photographed', 'as', 'required'],
 ['criminal', 'mischief'],
 ['contempt', 'for', 'violation', 'of', 'order', 'or', 'agreement'],
 ['burglary',
  'overnight',
  'accommodations',
  'person',
  'present',
  'bodily',
  'injury',
  'crime'],
 ['simple', 'assault'],
 ['poss', 'instrument', 'of', 'crime', 'wint'],
 ['firearms', 'not', 'to', 'be', 'carried', 'wo', 'license'],
 ['theft', 'by', 'unlaw', 'takingmovable', 'prop'],
 ['crim', 'tresbreak', 'into', 'structure'],
 ['manufacture',
  'delivery',
  'or',
  'possession',
  'with',
  'intent',
  'to',
  'manufacture',
  'or',
  'deliver'],
 ['rape', 'forcible', 'compulsion'],
 ['indec', 'assltwo', 'cons', 'of', 'other'],
 ['aggravated',
  'assault',
  'attempts',
  'to',
  'cause',
  'sbi',
  'or',
  'causes',
  'injury',
  'with',
  'extreme',
  'indifference'],
 ['receiving', 'stolen', 'property'],
 ['int', 'poss', 'contr', 'subst', 'by', 

In [18]:
for charge in charges_corpus:
    for word in charge:
        if word in STOPWORDS:
            charge.remove(word)

In [19]:
charges_corpus

[['dui', 'gen', 'impinc', 'driving', 'safely', '1st'],
 ['verify', 'address', 'photographed', 'required'],
 ['criminal', 'mischief'],
 ['contempt', 'violation', 'order', 'agreement'],
 ['burglary',
  'overnight',
  'accommodations',
  'person',
  'present',
  'bodily',
  'injury',
  'crime'],
 ['simple', 'assault'],
 ['poss', 'instrument', 'crime', 'wint'],
 ['firearms', 'to', 'carried', 'wo', 'license'],
 ['theft', 'unlaw', 'takingmovable', 'prop'],
 ['crim', 'tresbreak', 'structure'],
 ['manufacture', 'delivery', 'possession', 'intent', 'manufacture', 'deliver'],
 ['rape', 'forcible', 'compulsion'],
 ['indec', 'assltwo', 'cons', 'other'],
 ['aggravated',
  'assault',
  'attempts',
  'cause',
  'sbi',
  'causes',
  'injury',
  'extreme',
  'indifference'],
 ['receiving', 'stolen', 'property'],
 ['int', 'poss', 'contr', 'subst', 'per', 'reg'],
 ['tamper', 'public', 'recordinformation'],
 ['robberyinflict', 'serious', 'bodily', 'injury'],
 ['purcrec', 'cont', 'substby', 'unauth', 'per']

In [20]:
dictionary_LDA = corpora.Dictionary(charges_corpus)
# dictionary_LDA.filter_extremes(no_below=3)
corpus = [dictionary_LDA.doc2bow(charge) for charge in charges_corpus]

num_topics = 4
lda_model = models.LdaModel(corpus, num_topics=num_topics,
                            id2word=dictionary_LDA,
                            passes=4,
                            alpha=[0.01]*num_topics,
                            eta=[0.01]*len(dictionary_LDA.keys()))

In [21]:
# print topics

for i,topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=10):
    print(str(i)+": "+ topic)
    print()

0: 0.031*"manufacture" + 0.028*"assault" + 0.026*"bodily" + 0.024*"possession" + 0.019*"statutory" + 0.019*"sexual" + 0.018*"injury" + 0.017*"inj" + 0.017*"gen" + 0.017*"dui"

1: 0.046*"theft" + 0.045*"person" + 0.037*"attempt" + 0.037*"criminal" + 0.033*"overnight" + 0.033*"present" + 0.033*"burglary" + 0.023*"bodily" + 0.023*"injury" + 0.022*"accommodation"

2: 0.033*"criminal" + 0.025*"structure" + 0.025*"attempt" + 0.025*"crim" + 0.021*"assault" + 0.018*"sexual" + 0.014*"obt" + 0.014*"to" + 0.014*"degree" + 0.014*"wo"

3: 0.069*"criminal" + 0.044*"attempt" + 0.042*"vehicle" + 0.037*"forcible" + 0.037*"compulsion" + 0.029*"assault" + 0.023*"motor" + 0.023*"aggravated" + 0.019*"enforcement" + 0.019*"idsi"



In [22]:
# visualize LDA results

%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary_LDA)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

In [36]:
def charge_sentiment(index):
    charge = charges[index]
    cleaned_charge = charges_corpus[index]
    text = ' '.join(cleaned_charge)
    polarity = SentimentIntensityAnalyzer().polarity_scores(text)
    print(charge)
    return polarity

In [37]:
charge_sentiment(10)

# drug dealing

manufacture delivery or possession with intent to manufacture or deliver


{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

## Wordclouds

pre2000_wordcloud = WordCloud(
    width = 800, height = 800, background_color ='white', stopwords = set(STOPWORDS), min_font_size = 10
).generate(pre2000_words) 

plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(pre2000_wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0)
  
plt.show()