In [None]:
from pymongo import MongoClient

from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from textblob import Word
import nltk
import sklearn
import pandas as pd
import numpy as np
import re
import pprint

%pylab inline

pd.options.display.max_columns = 999

In [None]:
# Open a MongoDB connection with the driver's default settings, then pick the
# database and the collection that hold the SCOTUS documents.
client = MongoClient()

scotus_db = client["scotus_db"]
scotus = scotus_db["scotus_collection"]

### Text Cleaning and Set Up


In [None]:
#Functions for Text Analysis
def sentiment_analysis(x):
    """Return the TextBlob ``sentiment`` value computed for the text ``x``."""
    blob = TextBlob(x)
    return blob.sentiment
def polarity_analysis(x):
    """Return the TextBlob ``polarity`` score computed for the text ``x``."""
    blob = TextBlob(x)
    return blob.polarity
def subjectivity_analysis(x):
    """Return the TextBlob ``subjectivity`` score computed for the text ``x``."""
    blob = TextBlob(x)
    return blob.subjectivity

In [None]:
#Functions for Topics
from sklearn import decomposition
from nltk.stem import WordNetLemmatizer 


def lemmatize_text(text):
    '''
    Tokenize ``text`` with nltk and lemmatize every token with WordNet.

    Returns the lemmas as a single string in which each lemma is followed
    by one space (so a non-empty result ends with a space).

    **Use like this**
    df['lem_text'] = df['text'].apply(lemmatize_text)
    '''
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    return ''.join(lemmatizer.lemmatize(token) + ' ' for token in tokens)

def get_topics_cv(df, n_topics=10, n_top_words=15, random_state=None):
    """Fit a count-vectorizer + LDA topic model on ``df`` and print its topics.

    Parameters
    ----------
    df : iterable of str
        The documents to model (``topics_by_year`` passes a Series of
        lemmatized text).
    n_topics : int, default 10
        Number of LDA components to fit.
    n_top_words : int, default 15
        Number of top-weighted terms printed per topic.
    random_state : int or None, default None
        Forwarded to ``LatentDirichletAllocation``; pass an int to make the
        printed topics repeatable between runs.

    Returns
    -------
    None
        The matrix shape and the topics are printed, not returned.
    """
    # Imported explicitly so the function does not rely on
    # `sklearn.feature_extraction` having been loaded by some other import.
    from sklearn.feature_extraction.text import CountVectorizer

    count_vectorizer = CountVectorizer(
        ngram_range=(1, 2),
        stop_words='english',
        # only tokens made of three or more lowercase letters
        token_pattern="\\b[a-z][a-z][a-z]+\\b",
        min_df=1
    )
    # Learn the vocabulary and build the [num_documents, num_features]
    # sparse term-document matrix in one pass.
    counts = count_vectorizer.fit_transform(df)
    print(f"Shape: {counts.shape}")

    lda = decomposition.LatentDirichletAllocation(
        n_components=n_topics,
        learning_method="online",
        max_iter=5,
        n_jobs=-1,
        random_state=random_state,
    )
    lda.fit(counts)

    # `get_feature_names` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    get_names = getattr(count_vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names

    print("**Topics**")
    print_top_words(lda, get_names(), n_top_words)
    
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    One line is printed per row of ``model.components_``, numbered from 1,
    with the terms comma-separated in descending weight order. A blank line
    is printed after the last topic.
    """
    for number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it and keep the leading entries
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print("Topic #%d: " % number + ", ".join(top_terms))
    print()
def topics_by_year(year):
    """Print LDA topics for all documents of a single term.

    Queries the ``scotus`` collection for documents whose ``term`` equals
    ``str(year)``, lemmatizes each document's ``text`` and passes the result
    to ``get_topics_cv``.

    Parameters
    ----------
    year : int or str
        The term to analyse; it is converted with ``str`` before querying.

    Returns
    -------
    None
        Output is printed. When no document matches, a notice is printed and
        no model is fitted.
    """
    cursor_func = scotus.find({"term": str(year)})
    df_func = pd.DataFrame(list(cursor_func))

    if df_func.empty:
        # An empty result has no 'text' column, so bail out before indexing it.
        print(f"No SCOTUS documents found for year: {year}")
        return

    lem_text = df_func['text'].apply(lemmatize_text)
    print(f"SCOTUS Topics for Year: {year}")
    # The topic helper defined in this notebook is `get_topics_cv`;
    # there is no `get_topics`.
    get_topics_cv(lem_text)

I will run sentiment analysis on all cases in **df**, the dataframe loaded in the next cell.

In [None]:
# Fetch every document from the 2006-2017 terms; `term` is matched against
# string values.
cursor = scotus.find(
    {"term": {"$in": ['2006', '2007', '2008', '2009',
                      '2010', '2011', '2012', '2013',
                      '2014', '2015', '2016', '2017']}}
)

df = pd.DataFrame(list(cursor))

In [None]:
# TextBlob's `sentiment` carries both polarity and subjectivity, so analyse each
# text once and split the result instead of building a second TextBlob per row.
sentiments = [sentiment_analysis(text) for text in df['text']]  # this takes a while
df['tb_polarity'] = [s.polarity for s in sentiments]
print("polarity done")
df['tb_subjectivity'] = [s.subjectivity for s in sentiments]
print("subjectivity done")

In [None]:
# Add the lemmatized text and a "<case name> <term>" key to every row
df['lem_text'] = df['text'].apply(lemmatize_text)
df['case_term'] = df['case_name'] + ' ' + df['term']

# Roll rows up to one per case: 'sum' concatenates the text column and
# totals the two sentiment columns.
agg_funcs = {"lem_text": 'sum', 'tb_subjectivity': 'sum', "tb_polarity": 'sum'}
df = (
    df.groupby(['case_term'])
      .agg(agg_funcs)
      .reset_index()
)
case_term = df['case_term'].tolist()

Next, I set up a second dataframe, **df2**, that holds the voting result of each justice who spoke during a case. I then join df2 (voting results) with df (text analysis) on the case.

In [None]:
# `list(cursor)` already consumed this cursor when `df` was built, so reading it
# again would yield no rows; rewind it to replay the same query.
cursor.rewind()
df2 = pd.DataFrame(list(cursor))
# The next cell refers to this frame as `df_2`, so bind that name as well.
df_2 = df2

In [None]:
# Add the lemmatized text and the "<case name> <term>" key to every row
df_2['lem_text'] = df_2['text'].apply(lemmatize_text)
df_2['case_term'] = df_2['case_name'] + ' ' + df_2['term']

# Collapse to one row per (case, speaker, voting result) combination
vote_keys = ['case_term', 'speaker', 'voting_result']
df_groupby = df_2.groupby(vote_keys).max().reset_index()[vote_keys]

# One row per case, one column per speaker, cells holding the voting result
df_pivot = df_groupby.pivot(index='case_term', columns='speaker', values='voting_result')

# Attach the votes to the per-case text/sentiment frame and save the result
df_votes_cases = df.merge(df_pivot, left_on='case_term', right_index=True)
pd.to_pickle(df_votes_cases, '2006_2017_votes_AND_cases')

df_votes_cases.sample(2)


![Sample](df_votes_cases_sample.png)

### Topic Analysis - Count Vectorizer into LDA


In [None]:
# Rebind `df` to the merged votes + text frame; the topic-model cells below read `df['lem_text']`.
df = df_votes_cases

In [None]:
# Unigram bag-of-words over the per-case lemmatized text. Tokens must be three
# or more lowercase letters; min_df / max_df trim very rare and very common terms.
df_text = df['lem_text']
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        token_pattern="\\b[a-z][a-z][a-z]+\\b",
        stop_words='english',
        ngram_range=(1, 1),
        max_df=.6,
        min_df=3
    )
# Learn the vocabulary and build the term-document matrix in a single pass;
# the result is a [num_documents, num_features] sparse matrix.
counts = count_vectorizer.fit_transform(df_text)

print(f"Shape: {counts.shape}")
#Shape: (827, 15081) 827 cases, 15,081 words

In [None]:
from sklearn import decomposition

# Online LDA is stochastic: pin the seed so that Restart & Run All reproduces
# the same topics instead of a different set on every run.
lda = decomposition.LatentDirichletAllocation(
    n_components=30,
    learning_method="online",
    verbose=1,
    max_iter=5,
    n_jobs=-1,
    random_state=42,
)

lda.fit(counts)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    One line is printed per row of ``model.components_``, numbered from 0,
    with the terms space-separated in descending weight order. A blank line
    is printed after the last topic.

    Note: this redefines the ``print_top_words`` declared in an earlier cell,
    which numbers topics from 1 and separates terms with ", ".
    """
    for number, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print("Topic #%d: " % number + " ".join(top_terms))
    print()

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when
# the installed release provides it and fall back only on older releases.
_vocabulary_terms = (
    count_vectorizer.get_feature_names_out()
    if hasattr(count_vectorizer, "get_feature_names_out")
    else count_vectorizer.get_feature_names()
)
print_top_words(lda, _vocabulary_terms, 10)

Topic #0: candidate political election speech money amendment race contribution vote public  
Topic #1: officer police amendment fourth search car warrant arrest stop probable  
Topic #2: duty honest mcnally bribe kickback service voir dire disclose intangible  
Topic #3: class agency limitation board review jurisdiction notice percent jurisdictional epa  
Topic #4: plan immunity power sovereign remedy clause official policy military suit  
Topic #5: virginia commerce dormant cost requester staters privileges archive access recoup  
Topic #6: child tribe parent indian tribal father mother reservation jurisdiction marriage  
Topic #7: memorial monument cross display symbol establishment injunction plaque commandments erected  
Topic #8: witness testimony cocaine confrontation clause trial prosecutor crawford prosecution report  
Topic #9: article child standing custody convention country marriage sex iii residence  
Topic #10: information public service amendment officer enforcement force activity speech report  
Topic #11: fda drug trust information duty trustee manufacturer company regulation conflict  
Topic #12: patent contract copyright infringement regulation agency agreement invention interpretation notice  
Topic #13: tax money fraud bank transaction false property loss pay sec  
Topic #14: contract jurisdiction agreement price foreign arbitration market power international treaty  
Topic #15: plaintiff fee damage property money complaint injury attorney award pay  
Topic #16: water compact montana user master river beneficial diversion special wyoming  
Topic #17: employee employer union discrimination pay title employment service policy speech  
Topic #18: jury trial error conviction review proceeding plea instruction defense lawyer  
Topic #19: railroad carrier fcc tax property value transportation regulation truck rail  
Topic #20: dna gene isolated myriad body scientist hansen molecule extracting amazon  
Topic #21: california arbitration arbitrator trustee commissioner labor exemption debtor exempt volt  
Topic #22: jersey income delaware month wharf disposable formula monthly distributor projected  
Topic #23: land property water proximate river easement permit title park vessel  
Topic #24: rico puerto senate recess president power session appointment enterprise member  
Topic #25: commission mark carolina north arm amendment seed militia broadcast corps  
Topic #26: charge notice sorna amendment percent filed race injury water arm  
Topic #27: warning florida fbi miranda constitution interrogation presence attorney questioning convey  
Topic #28: bankruptcy debt school debtor creditor chapter plan attorney student cost  
Topic #29: offense crime sentence sentencing guideline element felony criminal risk conviction  

In [None]:
def dict_topic_words(model, feature_names, n_top_words):
    """Map each topic index to a string of its highest-weighted terms.

    Keys are the 0-based row positions in ``model.components_``; each value
    holds up to ``n_top_words`` terms from ``feature_names``, space-separated
    in descending weight order.
    """
    return {
        topic_idx: " ".join(
            feature_names[i] for i in weights.argsort()[::-1][:n_top_words]
        )
        for topic_idx, weights in enumerate(model.components_)
    }

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when
# the installed release provides it and fall back only on older releases.
_topic_terms = (
    count_vectorizer.get_feature_names_out()
    if hasattr(count_vectorizer, "get_feature_names_out")
    else count_vectorizer.get_feature_names()
)
dict_topwords = dict_topic_words(lda, _topic_terms, 5)

# Project the term-count matrix into the fitted LDA model's topic space,
# printing the matrix shape before and after for comparison.
print(f"shape before transforming to topic space: {counts.shape}")

doc_topics = lda.transform(counts)

print(f"shape after transforming to topic space: {doc_topics.shape}")

# Output recorded from an earlier run, pasted below:
# shape before transforming to topic space: (827, 15081)
# shape after transforming to topic space: (827, 30)