## Grouping and finding similarities of justices by year

the beginning of this notebook is similar to the last two but I am looking at each justices writings from each year rather than individual cases

In [35]:
import pandas as pd
import numpy as np
import spacy

In [36]:
import pickle
court = pd.read_pickle('opbyjusticerecent.pickle')

In [37]:
import re
def cleanerup(text):
    text = text.replace(",", " ")
    text = text.replace(".", " ")
    text = text.replace("Argued:", " ")
    text = text.replace("United States Supreme Court", " ")
    keep = re.sub("[^a-zA-Z. ""]", "", str(text).lower())
    return keep


In [38]:
court.text = court.text.apply(cleanerup)

In [39]:
court.head()

Unnamed: 0,justice,year,text
0,ALITO,2006,the supreme court of southcarolina may ...
1,ALITO,2007,the court of appeal of california first appe...
2,ALITO,2008,the supreme court of louisiana march ...
3,ALITO,2009,this is the latest in a line of contes...
4,ALITO,2010,the united states court of appeals for the se...


In [85]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords

nlp = spacy.load('en')


import nltk
spacystop = nlp.Defaults.stop_words
names = nltk.corpus.names
male_names = names.words('male.txt')
female_names = names.words('female.txt')
male_names = [w.lower() for w in male_names]
male_names_plur = [(w.lower() + "s") for w in male_names]
female_names_plur = [(w.lower() + "s") for w in female_names]
female_names = [w.lower() for w in female_names]
months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 
         'august' 'september', 'october', 'november', 'december', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 
         'August' 'September', 'October', 'November', 'December']
more_words = ["court", "justice", "appeals","appeal","united", "may", "argued", "argue", "decide", "rptr", "nervine", "pp","fd" ,"june", "july",
                "august", "september", "october", "november", "states", "ca", "joyce", "certiorari", "december",
                "january", "february", "march", "april", "writ", "supreme", "opinion"]
evenmore_words = ['join', 'seek', 'ginnane', 'kestenbaum', 'hummel', 'loevinger', 'note', 'curiam', 'mosk', 'pd', \
                'paxton', 'rhino', 'buchsbaum', 'hirshowitz', 'misc', 'assistant', 'whereon', 'dismiss', 'sod', \
                'vote', 'present', 'entire', 'frankfurter', 'ante', 'leave', 'concur', 'entire', 'mootness', \
                'track', 'constitution', 'jj', 'blackmun', 'rehnquist', 'amici','sup', 'rep', 'stat', 'messes', \
                'like', 'rev', 'trans', 'bra', 'teller', 'vii', 'erisa', 'usca', 'annas', 'lead', 'cf', 'cca', \
                'fsupp', 'afdc', 'amicus', 'ante', 'orrick', 'kansa', 'pd', 'foth', 'stucky', 'aver',"united", \
                "may", "argued", "argue", "decide", "rptr", "nervine", "pp","fd" ,"june", "july", \
                "august", "september", "october", "november", "states", "joyce", "certiorari", "december",\
                "january", "february", "march", "april", "writ", "supreme court", "court", "dissent", \
                "opinion", "footnote","brief", "decision", "member", "curiam", "dismiss", "note", "affirm", \
                "question", "usc", "file", 'southcarolina', "district", "circuit", "slip", "op"]

In [138]:
states = pd.read_csv('states.csv')
states['State'] = states.State.apply(cleanerup)
states = states.State.tolist()

justice_names = pd.read_csv('justices.csv', header = None)
justice_names = justice_names.iloc[0].apply(cleanerup)
justice_names =list(justice_names.values)


In [12]:
STOPLIST = set(stopwords.words('english')  + list(ENGLISH_STOP_WORDS) +justice_names
                + list(female_names) + list(male_names) +  list(spacystop) +
               list(female_names_plur) + list(male_names_plur) + months + more_words + evenmore_words + states)

In [13]:
nlp = spacy.load('en_core_web_sm')


In [87]:
def stopword(txt):
    
    words = txt.split(' ')
    texts = [word for word in words if word not in STOPLIST]
    
    return ' '.join(texts)

In [88]:
text = stopword(text)

In [89]:
from nltk.stem.wordnet import WordNetLemmatizer


def lemmatize(text):
    words1 = []
    doc = nlp(str(text))
    for token in doc:
        if token.lemma_ != "-PRON-":
             if re.search('[a-zA-Z]{3,}', token.lemma_):
                words1.append(token.lemma_)
    
    return ' '.join(words1)


In [91]:
#lemmatize(text)

In [79]:
court1 = court.copy()


In [81]:
court1['text'] = court1.text.apply(stopword)
court1['text'] = court1.text.apply(lemmatize)


In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [97]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, 
                                 use_idf=True, tokenizer =lemmatize,  ngram_range=(1,3))


tfidf_matrix = tfidf_vectorizer.fit_transform(court1.text) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

(593, 9734)


In [98]:
terms = tfidf_vectorizer.get_feature_names()


In [99]:
terms

['aa',
 'aa footnote',
 'ab',
 'aba',
 'abandonment',
 'abate',
 'abet',
 'abide',
 'abolish',
 'abortion',
 'abridge',
 'abridgment',
 'abroad',
 'abrogate',
 'abrogation',
 'absence clear',
 'absence evidence',
 'absence express',
 'absent clear',
 'absolute right',
 'absolutely',
 'absorb',
 'abstain',
 'abstention',
 'abstract',
 'absurd',
 'abundantly',
 'abundantly clear',
 'abuse discretion',
 'abusive',
 'ac',
 'academic',
 'academy',
 'accede',
 'accept argument',
 'accept court',
 'accept petitioner',
 'acceptable',
 'acceptance',
 'accessible',
 'accident',
 'accidental',
 'accommodate',
 'accommodation',
 'accomplice',
 'accomplish purpose',
 'accomplished',
 'accomplishment',
 'accord government',
 'accord law',
 'accord petitioner',
 'accord respondent',
 'accordance law',
 'accorded',
 'accordingly affirm',
 'accordingly affirm judgment',
 'accordingly conclude',
 'accordingly hold',
 'accordingly judgment',
 'accordingly judgment reverse',
 'accordingly reverse',
 'acco

In [103]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=20, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf_matrix)

In [104]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [105]:
print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names,20)


Topics in NMF model:
Topic #0:
sentencing slip op slip drug op internal quotation quotation omit internal quotation omit speech scalia quotation guideline death penalty circuit justice plurality juror habea murder prison sovereign
Topic #1:
mr justice appellant carrier railroad appellee bargaining taxpayer gas contempt interstate commerce grand antitrust grand jury picket cir competition indictment stock vessel negro
Topic #2:
indian tribe reservation tribal treaty indian tribe nonmember fishing river montana interior fish sovereign sovereignty secretary interior alaska mineral acre taxation timber
Topic #3:
parole speech revocation inmate simmon parent oath ex post prisoner forfeiture prison kansa commitment facto hospital nebraska alcohol mental rail rico
Topic #4:
religious religion establishment clause establishment church student prayer public school free exercise teacher god religious belief display church state neutrality catholic endorsement educational message parent
Topic #5

In [180]:
import pickle
pickle.dump(tfidf_matrix, open('tfidf_scj.pickle', 'wb'))
pickle.dump(nmf, open( 'nmfmodel.pickle', 'wb'))

In [108]:
out =nmf.transform(tfidf_matrix)

import operator
topics = []
for item in out:
    max_ix, max_val = max(enumerate(item), key = operator.itemgetter(1))
    topics.append(max_ix)
    
court1["topicnumber"] = pd.Series(topics, index=court1.index)

In [109]:
court1.topicnumber.value_counts()

1     228
0     206
2      22
4      17
7      13
8      11
6       9
19      9
12      9
13      9
15      9
16      7
17      7
10      6
18      6
14      6
9       6
5       5
3       5
11      3
Name: topicnumber, dtype: int64

In [110]:
topics_prob = []
for item in out:
    max_ix, max_val = max(enumerate(item), key = operator.itemgetter(1))
    topics_prob.append(max_val)

court1["topicstrength"] = pd.Series(topics_prob, index=court1.index)

In [112]:
court1

Unnamed: 0,justice,year,text,topicnumber,topicstrength
0,ALITO,2006,supreme southcarolina justice alito deliver op...,17,0.164106
1,ALITO,2007,appeal california appellate district justice a...,0,0.238028
2,ALITO,2008,supreme louisiana justice alito deliver opinio...,0,0.273983
3,ALITO,2009,late line contest matter come action bring sta...,0,0.291174
4,ALITO,2010,second circuit justice alito deliver opinion g...,0,0.259777
5,ALITO,2011,ninth circuit justice alito chief justice join...,0,0.281552
6,ALITO,2012,ninth circuit justice alito deliver opinion pr...,0,0.325339
7,ALITO,2013,second circuit justice alito justice join conc...,0,0.323958
8,ALITO,2014,seventh circuit justice alito deliver opinion ...,0,0.193576
9,ALITO,2015,fourth circuit justice alito concur judgment o...,0,0.319504


In [111]:
#make a cosine similarity matrix from the tfidf matrix
from sklearn.metrics.pairwise import cosine_similarity
cosinesim = 1- cosine_similarity(tfidf_matrix)
cosinesim

array([[-4.44089210e-16,  8.79890983e-01,  8.30348315e-01, ...,
         9.34656692e-01,  9.14602060e-01,  9.29796256e-01],
       [ 8.79890983e-01,  2.22044605e-16,  7.74595329e-01, ...,
         9.08701642e-01,  9.29076704e-01,  9.30600714e-01],
       [ 8.30348315e-01,  7.74595329e-01,  0.00000000e+00, ...,
         8.86142509e-01,  9.18617862e-01,  9.13572840e-01],
       ...,
       [ 9.34656692e-01,  9.08701642e-01,  8.86142509e-01, ...,
        -4.44089210e-16,  8.64228875e-01,  8.29015440e-01],
       [ 9.14602060e-01,  9.29076704e-01,  9.18617862e-01, ...,
         8.64228875e-01, -4.44089210e-16,  8.26823547e-01],
       [ 9.29796256e-01,  9.30600714e-01,  9.13572840e-01, ...,
         8.29015440e-01,  8.26823547e-01, -2.22044605e-16]])

In [238]:
cosinesim.shape

(593, 593)

In [241]:
justiceyearlist = []
for ix, row in court1.iterrows():
    justice = row.justice
    year = str(row.year)
    
    justiceyearlist.append(str(justice + "_" + year))
    

justiceyearlist
    


['ALITO_2006',
 'ALITO_2007',
 'ALITO_2008',
 'ALITO_2009',
 'ALITO_2010',
 'ALITO_2011',
 'ALITO_2012',
 'ALITO_2013',
 'ALITO_2014',
 'ALITO_2015',
 'ALITO_2016',
 'ALITO_2017',
 'ALITO_2018',
 'BLACK_1955',
 'BLACK_1956',
 'BLACK_1957',
 'BLACK_1958',
 'BLACK_1959',
 'BLACK_1960',
 'BLACK_1961',
 'BLACK_1962',
 'BLACK_1963',
 'BLACK_1964',
 'BLACK_1965',
 'BLACK_1966',
 'BLACK_1967',
 'BLACK_1968',
 'BLACK_1969',
 'BLACK_1970',
 'BLACK_1971',
 'BLACKMUN_1971',
 'BLACKMUN_1972',
 'BLACKMUN_1973',
 'BLACKMUN_1974',
 'BLACKMUN_1975',
 'BLACKMUN_1976',
 'BLACKMUN_1977',
 'BLACKMUN_1978',
 'BLACKMUN_1979',
 'BLACKMUN_1980',
 'BLACKMUN_1981',
 'BLACKMUN_1982',
 'BLACKMUN_1983',
 'BLACKMUN_1984',
 'BLACKMUN_1985',
 'BLACKMUN_1986',
 'BLACKMUN_1987',
 'BLACKMUN_1988',
 'BLACKMUN_1989',
 'BLACKMUN_1990',
 'BLACKMUN_1991',
 'BLACKMUN_1992',
 'BLACKMUN_1993',
 'BLACKMUN_1994',
 'BRENNAN_1956',
 'BRENNAN_1957',
 'BRENNAN_1958',
 'BRENNAN_1959',
 'BRENNAN_1960',
 'BRENNAN_1961',
 'BRENNAN_1962',

In [242]:
cosinesimilarity = pd.DataFrame(data = cosinesim, index = justiceyearlist,  columns = justiceyearlist)

In [249]:
cosinesimilarity.loc['ALITO_2008','ALITO_2009']

0.7744858751687292

In [250]:
cosinesimilarity.loc['ALITO_2008','SOTOMAYOR_2018']

0.8322070419497207

In [101]:
from sklearn.cluster import KMeans
num_clusters = 4
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

In [203]:
clustcount = pd.Series(clusters)
#clustcount.value_counts
court1['clusters_4'] = pd.Series(clusters, index = court1.index)

In [204]:
from sklearn.cluster import KMeans
num_clusters = 6
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

In [205]:
clustcount = pd.Series(clusters)
#clustcount.value_counts
court1['clusters_6'] = pd.Series(clusters, index = court1.index)

In [207]:
court1.tail()

Unnamed: 0,justice,year,text,topicnumber,topicstrength,clusters_4,clusters_6
588,WHITTAKER,1957,question present u c b sole exclusive provisio...,10,0.220837,0,2
589,WHITTAKER,1958,involve legality conviction petitioner alien p...,1,0.25042,0,1
590,WHITTAKER,1959,petitioner convict knowingly conceal transport...,1,0.255812,0,1
591,WHITTAKER,1960,respondent indict murder district district col...,2,0.205506,3,0
592,WHITTAKER,1961,information charge assault murder degree petit...,16,0.255613,0,1


In [197]:
just_list = list(court.justice.unique())
just_list

['ALITO',
 'BLACK',
 'BLACKMUN',
 'BRENNAN',
 'BREYER',
 'BURGER',
 'BURTON',
 'CLARK',
 'CURTIS',
 'DOUGLAS',
 'FORTAS',
 'FRANKFURTER',
 'GINSBURG',
 'GOLDBERG',
 'GORSUCH',
 'HARLAN',
 'HOLMES',
 'KAGAN',
 'KENNEDY',
 'MARSHALL',
 'MINTON',
 "O'CONNOR",
 'POWELL',
 'REED',
 'REHNQUIST',
 'ROBERTS',
 'RUTLEDGE',
 'SCALIA',
 'SOTOMAYOR',
 'SOUTER',
 'STEVENS',
 'STEWART',
 'THOMAS',
 'WARREN',
 'WHITE',
 'WHITTAKER']

In [201]:
len(just_list)

36

In [140]:
just_list[0]

'ALITO'

In [173]:
#make a dataframe for each justice and put it in a dictionary

d = {}
for name in just_list:

    pattern = name
    mask = court.justice == pattern
    newdf = court.loc[mask]
    d[name] = newdf
    


In [198]:
for i in range(len(just_list)):
    print(d[just_list[i]])


   justice  year                                               text  \
0    ALITO  2006   the supreme court of southcarolina may       ...   
1    ALITO  2007   the court of appeal of california  first appe...   
2    ALITO  2008   the supreme court of louisiana march         ...   
3    ALITO  2009          this is the latest in a line of contes...   
4    ALITO  2010   the united states court of appeals for the se...   
5    ALITO  2011   the united states court of appeals for the ni...   
6    ALITO  2012   the united states court of appeals for the ni...   
7    ALITO  2013   the united states court of appeals for the se...   
8    ALITO  2014   the united states court of appeals for the se...   
9    ALITO  2015   the united states court of appeals for the fo...   
10   ALITO  2016   the united states court of appeals for the ni...   
11   ALITO  2017   the united states court of appeals for the fe...   
12   ALITO  2018   the united states court of appeals for the ni...   

    t

    justice  year                                               text  \
113  BURGER  1969   the same court    petitioner in no    a natio...   
114  BURGER  1970      the interstate commerce commission orders ...   
115  BURGER  1971     we granted the writ in this case to conside...   
116  BURGER  1972     we granted the writ of certiorari to review...   
117  BURGER  1973     we granted the writ in this case to conside...   
118  BURGER  1974     we granted certiorari to consider petitione...   
119  BURGER  1975     we granted certiorari to decide whether in ...   
120  BURGER  1976     the question presented in this case is whet...   
121  BURGER  1977     we granted certiorari in these cases     u ...   
122  BURGER  1978     we noted probable jurisdiction in this case...   
123  BURGER  1979     we granted certiorari     u s     to consid...   
124  BURGER  1980     the question in this case is whether the sh...   
125  BURGER  1981     we granted certiorari     u s     to consi

     justice  year                                               text  \
243  KENNEDY  1988     under the medicare program  title xviii of ...   
244  KENNEDY  1989     we consider here the circumstances under wh...   
245  KENNEDY  1990     the central question before us is whether a...   
246  KENNEDY  1991     the principal question before us is whether...   
247  KENNEDY  1992     the interstate commerce commission icc or c...   
248  KENNEDY  1993     we address in this decision the appropriate...   
249  KENNEDY  1994      the power of state and local governments t...   
250  KENNEDY  1995        though recent acts of congress have made...   
251  KENNEDY  1996      we granted certiorari to resolve a divisio...   
252  KENNEDY  1997   with respect to all but a portion of part iia...   
253  KENNEDY  1998     an employee  as part of a termination agree...   
254  KENNEDY  1999   the united states court of appeals for the ni...   
255  KENNEDY  2000   the supreme court of californi

    justice  year                                               text  \
396  SCALIA  1986     the petitioners  united states citizen empl...   
397  SCALIA  1987     in coolidge v  new hampshire     u s      w...   
398  SCALIA  1988     petitioner united savings association of te...   
399  SCALIA  1989     following a jury trial in the pennsylvania ...   
400  SCALIA  1990     in this case we must decide whether the act...   
401  SCALIA  1991     this case requires us to clarify the applic...   
402  SCALIA  1992     the question presented by these consolidate...   
403  SCALIA  1993     this case presents the question whether the...   
404  SCALIA  1994      this case presents the question whether  i...   
405  SCALIA  1995      in this case we consider whether actions o...   
406  SCALIA  1996      this case presents the question whether  i...   
407  SCALIA  1997       title vii of the civil rights act of  app...   
408  SCALIA  1998     under longstanding precedent of the nation

    justice  year                                               text  \
556   WHITE  1962     the respondent company employs at its refin...   
557   WHITE  1963     in the face of petitioners claim that the s...   
558   WHITE  1964     the issue here is whether the kentucky cour...   
559   WHITE  1965     pursuant to a year lease with the city of g...   
560   WHITE  1966     the disputed issue here is whether a bankru...   
561   WHITE  1967     this case presents still another developmen...   
562   WHITE  1968     a jury returned a verdict for respondent in...   
563   WHITE  1969     the roofing contractors association of sout...   
564   WHITE  1970     petitioner was found guilty by a jury on fo...   
565   WHITE  1971     petitioners  coal mine operators in southea...   
566   WHITE  1972     we are asked to determine whether the feder...   
567   WHITE  1973     the question before us is whether in this a...   
568   WHITE  1974     the respondents are  named individuals who

In [200]:

for i in just_list:
    print (d[i])

   justice  year                                               text  \
0    ALITO  2006   the supreme court of southcarolina may       ...   
1    ALITO  2007   the court of appeal of california  first appe...   
2    ALITO  2008   the supreme court of louisiana march         ...   
3    ALITO  2009          this is the latest in a line of contes...   
4    ALITO  2010   the united states court of appeals for the se...   
5    ALITO  2011   the united states court of appeals for the ni...   
6    ALITO  2012   the united states court of appeals for the ni...   
7    ALITO  2013   the united states court of appeals for the se...   
8    ALITO  2014   the united states court of appeals for the se...   
9    ALITO  2015   the united states court of appeals for the fo...   
10   ALITO  2016   the united states court of appeals for the ni...   
11   ALITO  2017   the united states court of appeals for the fe...   
12   ALITO  2018   the united states court of appeals for the ni...   

    t

    justice  year                                               text  \
132  BURTON  1955     the issue here is whether  in determining a...   
133  BURTON  1956     this case presents two questions as to the ...   
134  BURTON  1957     a joint trial in this case resulted in the ...   
135  BURTON  1958     the issue in this case is whether the assim...   

     topicnumber  
132            1  
133            1  
134            1  
135            1  
    justice  year                                               text  \
136   CLARK  1955     the main question presented in this case is...   
137   CLARK  1956     petitioner contends that this action brough...   
138   CLARK  1957     these two consolidated cases present a ques...   
139   CLARK  1958     this case is a sequel to standard oil co  v...   
140   CLARK  1959     this civil sherman act   case was here four...   
141   CLARK  1960     this direct appeal tests the constitutional...   
142   CLARK  1961     the sole question 

      justice  year                                               text  \
274  MARSHALL  1967     these consolidated cases raise the question...   
275  MARSHALL  1968     appellants are an exhibitor and the distrib...   
276  MARSHALL  1969     this case involves the application of  of t...   
277  MARSHALL  1970     appellants are beneficiaries of new york de...   
278  MARSHALL  1971     in this cause we are asked to determine whe...   
279  MARSHALL  1972     the medical committee for human rights acqu...   
280  MARSHALL  1973     in this case we must decide whether a distr...   
281  MARSHALL  1974     section  of the civil rights act of    stat...   
282  MARSHALL  1975     respondent george j  wilson  jr   was tried...   
283  MARSHALL  1976     these companion cases involve two taxpayers...   
284  MARSHALL  1977     this case raises important questions concer...   
285  MARSHALL  1978     the issue in this case is the constitutiona...   
286  MARSHALL  1979     at issue in th

    justice  year                                               text  \
396  SCALIA  1986     the petitioners  united states citizen empl...   
397  SCALIA  1987     in coolidge v  new hampshire     u s      w...   
398  SCALIA  1988     petitioner united savings association of te...   
399  SCALIA  1989     following a jury trial in the pennsylvania ...   
400  SCALIA  1990     in this case we must decide whether the act...   
401  SCALIA  1991     this case requires us to clarify the applic...   
402  SCALIA  1992     the question presented by these consolidate...   
403  SCALIA  1993     this case presents the question whether the...   
404  SCALIA  1994      this case presents the question whether  i...   
405  SCALIA  1995      in this case we consider whether actions o...   
406  SCALIA  1996      this case presents the question whether  i...   
407  SCALIA  1997       title vii of the civil rights act of  app...   
408  SCALIA  1998     under longstanding precedent of the nation

    justice  year                                               text  \
541  WARREN  1955     this is a civil antitrust action brought by...   
542  WARREN  1956     this case raises an issue of coverage under...   
543  WARREN  1957     petitioner is under sentence of death for t...   
544  WARREN  1958     in this  the third of the denationalization...   
545  WARREN  1959     petitioner  the secretary of labor  brought...   
546  WARREN  1960     jesse blackburn was tried in the circuit co...   
547  WARREN  1961     we granted certiorari to review the decisio...   
548  WARREN  1962     the question to be decided in this case is ...   
549  WARREN  1963     this direct appeal from a threejudge distri...   
550  WARREN  1964     the issue presented in this case is whether...   
551  WARREN  1965     at issue in this case is the effect of exec...   
552  WARREN  1966     we granted certiorari in this case to consi...   
553  WARREN  1967     the question involved in this case is whet

In [195]:
d['BRENNAN']

Unnamed: 0,justice,year,text,topicnumber
54,BRENNAN,1956,the petitioner max putnam in december p...,14
55,BRENNAN,1957,a jury in the circuit court of st louis aw...,1
56,BRENNAN,1958,the state of illinois the illinois commerc...,1
57,BRENNAN,1959,as the result of multiemployer multistate ...,1
58,BRENNAN,1960,the national bituminous coal wage agreement...,1
59,BRENNAN,1961,the petitioner became a naturalized citizen...,1
60,BRENNAN,1962,the federal trade commission seeks reversal...,1
61,BRENNAN,1963,this case originated in companion suits by ...,15
62,BRENNAN,1964,the chattanooga building trades council af...,1
63,BRENNAN,1965,georgias senatorial reapportionment act ...,1


In [234]:
#dataframe with each justice as row, all years are together
justicesingle= court1.groupby(['justice'])['text'].apply(' '.join).reset_index()

In [236]:
justicesingle.tail()

Unnamed: 0,justice,text
31,STEWART,evening motor vessel tungus docked bayonne new...
32,THOMAS,require determine scope statutory prohibition ...
33,WARREN,civil antitrust action bring government distri...
34,WHITE,respondent company employ refinery east chicag...
35,WHITTAKER,question present u c b sole exclusive provisio...


## Loading the vectorizer and model from "SC full" notebook
dont fit! just transform

In [206]:
tfidf_vect = pickle.load(open("tfidf_vect.pickle", 'rb'))
nfm_model = pickle.load(open( 'nmfmodelfull.pickle', 'rb'))

In [213]:
tfmatrix = tfidf_vect.transform(court1.text)

In [214]:
nfm_model.transform(tfmatrix)

array([[0.07181366, 0.        , 0.        , ..., 0.        , 0.        ,
        0.02965594],
       [0.09230427, 0.        , 0.        , ..., 0.        , 0.01045282,
        0.01272126],
       [0.07132773, 0.00028533, 0.        , ..., 0.00176191, 0.00026137,
        0.03812208],
       ...,
       [0.04960789, 0.00023369, 0.        , ..., 0.00537678, 0.00372436,
        0.        ],
       [0.06906909, 0.0022811 , 0.        , ..., 0.        , 0.00068229,
        0.00204476],
       [0.06208504, 0.00127958, 0.        , ..., 0.00037104, 0.00064293,
        0.        ]])

In [215]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vect.get_feature_names()
print_top_words(nfm_model, tfidf_feature_names,20)


Topics in NMF model:
Topic #0:
act federal respondent claim action petitioner statute rule congress hold government jurisdiction right regulation provide complaint provision damage require service
Topic #1:
appellant appellee motion supp et judgment probable jr jurisdiction city statute solicitor appelle threejudge general cox dougla shapiro county child
Topic #2:
vacate remand forma pauperis judgment petition proceed light consideration motion petitioner se pro respondent solicitor report chapman reason rosenberg general
Topic #3:
death sentence penalty sentencing jury circumstance punishment capital mitigate murder impose eighth aggravating cruel life factor offense judge instruction unusual
Topic #4:
general attorney solicitor deputy jr et curiae jame thoma respondent urge divide equally pro se judgment improvidently reverse charl special
Topic #5:
tax income property taxpayer revenue pay sale commissioner business refund taxation corporation lien code deduction return commerce ban

In [216]:
out =nfm_model.transform(tfmatrix)

import operator
topics = []
for item in out:
    max_ix, max_val = max(enumerate(item), key = operator.itemgetter(1))
    topics.append(max_ix)
    
court1["toptopic"] = pd.Series(topics, index=court1.index)

In [229]:
likelytopics = pd.DataFrame(data = out, 
                           columns = ["misc.","dismissed", "city jurisdiction", "taxes", "interstate commerce", "citizen rights", 
              "vacate","property rights", "railroads","bankruptcy/mortgages","labor","banks/stocks",
               "jury/trial law","fourth amendment","injuries/liabilities","civil jurisdiction",
              "patents","capital punishment", "municipality/utilities","Native American"])

In [230]:
likelytopics.head()

Unnamed: 0,misc.,dismissed,city jurisdiction,taxes,interstate commerce,citizen rights,vacate,property rights,railroads,bankruptcy/mortgages,labor,banks/stocks,jury/trial law,fourth amendment,injuries/liabilities,civil jurisdiction,patents,capital punishment,municipality/utilities,Native American
0,0.071814,0.0,0.0,0.004175,0.0,0.0,0.0,0.0,0.0,0.025876,0.0,0.057863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029656
1,0.092304,0.0,0.0,0.045602,0.0,0.009585,0.0,0.0,0.01021,0.007963,0.0,0.014923,0.001618,0.0,0.0,0.0,0.0,0.0,0.010453,0.012721
2,0.071328,0.000285,0.0,0.055363,0.000453,0.027274,0.000471,0.0,0.007452,0.005881,0.0,0.020848,0.00118,0.0,0.000591,0.0,0.05064,0.001762,0.000261,0.038122
3,0.090772,0.0,0.000368,0.008525,0.0,0.0,0.034918,0.0,0.044846,0.0,0.012496,0.024291,0.005643,0.0,0.0,0.000451,0.00068,0.0,0.003622,0.010865
4,0.093009,0.000157,0.0,0.012783,0.005151,0.006024,0.000471,0.002885,0.019835,0.003417,0.001925,0.021401,0.012344,0.0,0.0,0.003068,0.012952,0.009597,0.007546,0.022993


In [231]:
likely_topic = pd.concat([court1[['justice', 'year']], likelytopics], axis = 1)

In [232]:
#df for likelihood of topic by justice and year

likely_topic.tail()

Unnamed: 0,justice,year,misc.,dismissed,city jurisdiction,taxes,interstate commerce,citizen rights,vacate,property rights,...,labor,banks/stocks,jury/trial law,fourth amendment,injuries/liabilities,civil jurisdiction,patents,capital punishment,municipality/utilities,Native American
588,WHITTAKER,1957,0.060612,0.0,0.0,0.007022,0.001206,0.0,0.0,0.0,...,0.0,0.000324,0.000894,0.0,0.0,0.0,0.0,0.0,0.0,0.0
589,WHITTAKER,1958,0.071729,0.010475,0.002347,0.0,0.0,0.014925,0.007757,0.032524,...,0.009422,0.058569,0.006398,0.0,0.0,0.000626,0.009887,0.008244,0.0,0.007013
590,WHITTAKER,1959,0.049608,0.000234,0.0,0.010821,0.0,0.08293,0.028201,0.025793,...,0.0,0.009592,0.000904,0.003504,0.0,0.0,0.002267,0.005377,0.003724,0.0
591,WHITTAKER,1960,0.069069,0.002281,0.0,0.002989,0.0,0.032055,0.0,0.0,...,0.003182,0.028202,0.010294,0.001341,0.000205,0.117875,0.0,0.0,0.000682,0.002045
592,WHITTAKER,1961,0.062085,0.00128,0.0,0.004329,0.0,0.0474,0.037103,0.0017,...,0.0,0.040772,0.002919,6.9e-05,0.000827,0.0,0.004115,0.000371,0.000643,0.0
