## Supreme court topic modeling

this notebook is the same as the previous but it takes all SC cases rather than just the recent 60 years

In [1]:
import pandas as pd
import numpy as np
import spacy

In [2]:
import pickle
full = pd.read_pickle('SCtext.pickle')

In [3]:
full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23435 entries, 0 to 23434
Data columns (total 5 columns):
case_url     23435 non-null object
docket       23435 non-null object
year         23435 non-null object
fulltext     23435 non-null object
case_name    23435 non-null object
dtypes: object(5)
memory usage: 915.5+ KB


In [4]:
import re
def  stripfootnotes(text):
    if re.search("FOOTNOTES", text):
        textlist = text.split("FOOTNOTES")
        return textlist[0]
    else:
        return text
full["fulltext"] = full.fulltext.apply(stripfootnotes)

In [5]:
full.year = pd.to_numeric(full.year)

In [13]:
'''mask=full['year']>1953
#check names
modern = full.loc[mask]
'''

In [16]:
modern.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10693 entries, 0 to 23434
Data columns (total 5 columns):
case_url     10693 non-null object
docket       10693 non-null object
year         10693 non-null int64
fulltext     10693 non-null object
case_name    10693 non-null object
dtypes: int64(1), object(4)
memory usage: 501.2+ KB


In [6]:
#modern.tail(10)

In [7]:
import re
def clean(text):
    text = text.replace(",", " ")
    text = text.replace(".", " ")
    text = text.replace("Argued:", " ")
    text = text.replace("United States Supreme Court", " ")
    keep = re.sub("[^a-zA-Z. ""]", "", str(text).lower())
    return keep

In [8]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords

nlp = spacy.load('en')


import nltk
spacystop = nlp.Defaults.stop_words
names = nltk.corpus.names
male_names = names.words('male.txt')
female_names = names.words('female.txt')
male_names = [w.lower() for w in male_names]
male_names_plur = [(w.lower() + "s") for w in male_names]
female_names_plur = [(w.lower() + "s") for w in female_names]
female_names = [w.lower() for w in female_names]
months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 
         'august' 'september', 'october', 'november', 'december', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 
         'August' 'September', 'October', 'November', 'December']
more_words = ["court", "justice", "appeals","appeal","united", "may", "argued", "argue", "decide", "rptr", "nervine", "pp","fd" ,"june", "july",
                "august", "september", "october", "november", "states", "ca", "joyce", "certiorari", "december",
                "january", "february", "march", "april", "writ", "supreme", "opinion" ,"cheif", "el", "op", "quotation", 
             "n't", ' ', "'s", "a", "aan", "aba", "aand", "of", "m", "u", "f", "j", "juan", "ca", "u.s.", "u.", 'aa', 'aaa',
              'aab', 'aabd', 'aac',  'aag', 'aai', 'aaii', 'aaiii', 'aalthough', 'zosenjo', 'zschernig']

evenmore_words = ['join', 'seek', 'ginnane', 'kestenbaum', 'hummel', 'loevinger', 'note', 'curiam', 'mosk', 'pd', \
                'paxton', 'rhino', 'buchsbaum', 'hirshowitz', 'misc', 'assistant', 'whereon', 'dismiss', 'sod', \
                'vote', 'present', 'entire', 'frankfurter', 'ante', 'leave', 'concur', 'entire', 'mootness', \
                'track', 'constitution', 'jj', 'blackmun', 'rehnquist', 'amici','sup', 'rep', 'stat', 'messes', \
                'like', 'rev', 'trans', 'bra', 'teller', 'vii', 'erisa', 'usca', 'annas', 'lead', 'cf', 'cca', \
                'fsupp', 'afdc', 'amicus', 'ante', 'orrick', 'kansa', 'pd', 'foth', 'stucky', 'aver',"united", \
                "may", "argued", "argue", "decide", "rptr", "nervine", "pp","fd" ,"june", "july", \
                "august", "september", "october", "november", "state", "joyce", "december",\
                "january", "february", "march", "april", "writ", "supreme court", "court", "dissent", \
                "opinion", "footnote","brief", "decision", "member", "curiam", "dismiss", "note", "affirm", \
                "question", "usc", "file", 'southcarolina', "district", "circuit", "mr", "law", "quoting", "omit", 
                  "amendment","internal", "slip", 'omitted', 'suit' ,'lawsuit', 'marks', "jr", "et" "curiae", "new" ,"york",
                 'se', 'et', 'st', "html", "findlaw ", "hrefhttpscaselaw", "href", "defendant",
                 "judge", "rule", "claim", "comussupremecourt", "petitioner"]
pluswords = ["act", "federal", "statute", "government",
                 "right"]


states = pd.read_csv('states.csv')
states['State'] = states.State.apply(clean)
statelist = []
for index, row in states.iterrows():
    state = row.State
    state = state.split()
    for word in state:
        statelist.append(word)
    

justice_names = pd.read_csv('justices.csv', header = None)
justice_names = justice_names.iloc[0].apply(clean)
justice_names =list(justice_names.values)

In [9]:
STOPLIST = set(stopwords.words('english')  + list(ENGLISH_STOP_WORDS) +justice_names
                + list(female_names) + list(male_names) +  list(spacystop) +
               list(female_names_plur) + list(male_names_plur) + months + more_words + evenmore_words + statelist)

In [13]:
def removestopwords(text):
    txt = nlp(text)
    #print(text)
    #filtered = []
    filtered = [word for word in txt if not (str(word) in STOPLIST)]
    
    return filtered

In [14]:
full.fulltext = full.fulltext.apply(clean)

In [15]:
full.dropna
for index, row in full.iterrows():
    title = row.case_name
    title = str(title).lower()
    title = title.replace('\[|\]|\'|\,', '')
    try:
        row.fulltext = row.fulltext.replace(title, " ")
    except TypeError:
        print()

In [16]:
full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23435 entries, 0 to 23434
Data columns (total 5 columns):
case_url     23435 non-null object
docket       23435 non-null object
year         23435 non-null int64
fulltext     23435 non-null object
case_name    23435 non-null object
dtypes: int64(1), object(4)
memory usage: 915.5+ KB


In [17]:
import string 

punctuations = string.punctuation

def spacy_tokenizer(sentence):
    tokens = nlp(sentence)
    tokens = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
    tokens = [tok for tok in tokens if (tok not in STOPLIST and tok not in punctuations)] 
    tokens = [tok for tok in tokens if (len(tok)>1)]
    return tokens

In [18]:
full['lem'] = full.fulltext.apply(spacy_tokenizer)
full.to_pickle('fulllem.pickle')
#took over 4 hours

In [19]:
def shorten(txt):
    return txt[:350]

full['lemshort'] = full.lem.apply(shorten)

In [39]:
full['lemshort'] = full.lemshort.apply(removestopwords)
#~15 min

In [40]:
full['lemshort'] = full.lemshort.astype(str).str.replace('\[|\]|\'|\,', '')
full.head(5)

Unnamed: 0,case_url,docket,year,fulltext,case_name,lem,lemshort,topicnumber
0,https://caselaw.findlaw.com/us-supreme-court/0...,51101,2007,united states v timothy w omer...,UNITED STATES v. TIMOTHY W. OMER,omer petition statement respect denial petitio...,omer petition statement respect denial petitio...,12
1,https://caselaw.findlaw.com/us-supreme-court/0...,611951,2007,boumediene v bush no ...,BOUMEDIENE v. BUSH,boumediene bush petition statement respect den...,boumediene bush petition statement respect den...,15
2,https://caselaw.findlaw.com/us-supreme-court/0...,6263,2007,rachel haas carol haas and richa...,"RACHEL HAAS, CAROL HAAS, AND RICHARD HAAS v. ...",haas haas haas quest recovery service motion i...,haas haas haas quest recovery service motion i...,15
3,https://caselaw.findlaw.com/us-supreme-court/0...,65590,2007,ronnie joseph v united states ...,RONNIE JOSEPH v. UNITED STATES,petition statement respect denial petition dic...,petition statement respect denial petition dic...,17
4,https://caselaw.findlaw.com/us-supreme-court/0...,71390,2008,patrick marlowe v united states...,PATRICK MARLOWE v. UNITED STATES,petition denial prison guard failure provide n...,petition denial prison guard failure provide n...,17


In [156]:
#modern.to_pickle('modernlem.pickle')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [22]:
full.dropna(inplace = True)

In [23]:
full.drop([970, 21396 ], inplace = True)

In [24]:
full['lemshort'] = full.lemshort.astype(str).str.replace('\[|\]|\'|\,', '')
full['lem'] = full.lem.astype(str).str.replace('\[|\]|\'|\,', '')

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import NMF

def nmf_topics(corpus):
    max_df = 0.8
    topics = 20
    features = 6000
    top_words = 30
    
    tfidf_vectorizer = TfidfVectorizer(max_df = max_df, min_df = 5, max_features = features)
    
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    
    nmf = NMF(n_components = topics,random_state = 42, alpha = 0.1, l1_ratio = 0.5).fit(tfidf_matrix)
    tfidf_features = tfidf_vectorizer.get_feature_names()
    
    topicdict = {}
    for idx, topic in enumerate(nmf.components_):
        topicdict[idx]=" ".join([tfidf_features[i] for i in topic.argsort()[:-top_words -1:-1]])
    return tfidf_matrix, nmf, topicdict

In [41]:
tfidf_matrix, nmf_test, nmfword = nmf_topics(full.lemshort)


In [27]:
pickle.dump(tfidf_matrix, open('tfidf_full1.pickle', 'wb'))
pickle.dump(nmf_test, open( 'nmfmodelfull1.pickle', 'wb'))
pickle.dump(nmfword, open( 'nmfdictfull1.pickle', 'wb'))

In [42]:
max_df = 0.8
topics = 20
features = 6000
top_words = 25

tfidf_vectorizer = TfidfVectorizer(max_df = max_df, min_df = 5,max_features = features)
tfidf_matrix = tfidf_vectorizer.fit_transform(full.lemshort)


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf_test, tfidf_feature_names,25)        


Topics in NMF model:
Topic #0:
error plaintiff ct judgment jurisdiction action decree messrs bring record statute ground county order party citizen allege fact right motion render review appear final complaint
Topic #1:
attorney general solicitor deputy curiae judgment supp appellee motion appellant jame thoma cox lefkowitz pro reverse urge equally divide dougla special charl respondent improvidently iii
Topic #2:
want appellant appellee paper substantial motion treat findlaw jurisdiction petition federal report probable pro dougla supp city corp county consideration ill app lefkowitz pa thoma
Topic #3:
tax income revenue property taxpayer pay assessment taxation assess year commissioner corporation business value sale impose stock taxable ct gross net return estate refund deduction
Topic #4:
commission rate interstate commerce carrier order gas transportation railroad appellant service line tariff intrastate public freight shipment charge shipper motor ct appellee act power cent
Topi

In [31]:
pickle.dump(tfidf_vectorizer , open('tfidf_vect1.pickle', 'wb'))


In [43]:
out =nmf_test.transform(tfidf_matrix)

import operator
topics = []
for item in out:
    max_ix, max_val = max(enumerate(item), key = operator.itemgetter(1))
    topics.append(max_ix)
    
full["topicnumber"] = pd.Series(topics, index=full.index)

In [44]:
full.topicnumber.value_counts()

15    3625
5     2300
12    1788
3     1564
0     1296
4     1247
14    1147
9     1115
7     1069
8      978
18     967
10     896
17     862
6      825
2      722
13     721
1      659
11     593
16     574
19     485
Name: topicnumber, dtype: int64

In [32]:
full.tail()

Unnamed: 0,case_url,docket,year,fulltext,case_name,lem,lemshort,topicnumber
23430,https://caselaw.findlaw.com/us-supreme-court/9...,273,1878,wheeling parkersburg cincinnati ...,"WHEELING, PARKERSBURG & CINCINNATI TRANSP. CO....",wheel parkersburg cincinnati transp city error...,wheel parkersburg cincinnati transp city error...,3
23431,https://caselaw.findlaw.com/us-supreme-court/9...,508,1878,u s v germaine de...,U.S. v. GERMAINE,certificate division fact attorneygeneral thom...,certificate division fact attorneygeneral thom...,0
23432,https://caselaw.findlaw.com/us-supreme-court/9...,635,1878,northern transp co v city of ch...,NORTHERN TRANSP. CO. v. CITY OF CHICAGO,northern transp city chicago error northern ac...,northern transp city chicago error northern ac...,18
23433,https://caselaw.findlaw.com/us-supreme-court/9...,700,1878,in re sinking fund cases no ...,IN RE SINKING FUND CASES,sink fund union pacific railroad company petit...,sink fund union pacific railroad company petit...,8
23434,https://caselaw.findlaw.com/us-supreme-court/9...,90807,2010,harry r jackson et al v distr...,HARRY R. JACKSON ET AL. v. DISTRICT OF COLUM...,board election ethic chief voter subject colum...,board election ethic chief voter subject colum...,15


In [33]:
year_df = full[['year','topicnumber']]

In [34]:
year_df['count']=1
year_df = year_df.groupby(['year', 'topicnumber']).count().reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [35]:
year_df.tail()


Unnamed: 0,year,topicnumber,count
2586,2018,15,16
2587,2018,16,2
2588,2018,17,6
2589,2018,18,2
2590,2018,19,2


In [36]:
year_df.groupby('year')['count'].sum()

year
1760      1
1762      1
1763      2
1764      3
1767      1
1768      2
1773      2
1792      1
1793      2
1795      2
1796      4
1798      3
1799      2
1800      2
1801      1
1802      1
1803      3
1804      2
1805      3
1806      4
1807      2
1808      2
1809      3
1810      4
1812      6
1813      4
1814      1
1815      5
1816      6
1817      1
       ... 
1989    181
1990    168
1991    154
1992    123
1993    115
1994     98
1995     97
1996     90
1997     89
1998     95
1999     71
2000     80
2001     87
2002     87
2003     80
2004     83
2005     80
2006     77
2007     82
2008     82
2009     98
2010    103
2011     92
2012     93
2013     96
2014    101
2015     86
2016    102
2017     85
2018     42
Name: count, Length: 231, dtype: int64

In [157]:
year_df

Unnamed: 0,year,topicnumber,count,yearcount
0,1954,0,22,0
1,1954,2,1,0
2,1954,3,2,0
3,1954,4,2,0
4,1954,5,9,0
5,1954,6,3,0
6,1954,7,8,0
7,1954,8,2,0
8,1954,9,9,0
9,1954,10,1,0


In [107]:
full.head()

Unnamed: 0,case_url,docket,year,fulltext,case_name,lem,lemshort,topicnumber
0,https://caselaw.findlaw.com/us-supreme-court/0...,51101,2007,united states v timothy w omer...,UNITED STATES v. TIMOTHY W. OMER,state omer petition statement respect denial p...,state omer petition statement respect denial p...,11
1,https://caselaw.findlaw.com/us-supreme-court/0...,611951,2007,boumediene v bush no ...,BOUMEDIENE v. BUSH,boumediene bush petition statement respect den...,boumediene bush petition statement respect den...,0
2,https://caselaw.findlaw.com/us-supreme-court/0...,6263,2007,rachel haas carol haas and richa...,"RACHEL HAAS, CAROL HAAS, AND RICHARD HAAS v. ...",haas haas haas quest recovery service et motio...,haas haas haas quest recovery service et motio...,0
3,https://caselaw.findlaw.com/us-supreme-court/0...,65590,2007,ronnie joseph v united states ...,RONNIE JOSEPH v. UNITED STATES,state petition statement respect denial petiti...,state petition statement respect denial petiti...,3
4,https://caselaw.findlaw.com/us-supreme-court/0...,71390,2008,patrick marlowe v united states...,PATRICK MARLOWE v. UNITED STATES,state petition denial prison guard failure pro...,state petition denial prison guard failure pro...,3


<pandas.core.groupby.DataFrameGroupBy object at 0x1a6b2fad68>

In [171]:
year_df.head()

Unnamed: 0,year,topicnumber,count,yearcount
0,1954,0,22,0
1,1954,2,1,0
2,1954,3,2,0
3,1954,4,2,0
4,1954,5,9,0


In [37]:
year_df = year_df.pivot("topicnumber", "year", "count")

In [38]:
year_df.to_pickle('topicsfull.pickle')