In [1]:
import pandas as pd
import numpy as np
import PyPDF2
import string
import re
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
from nltk import word_tokenize
pd.set_option('display.max_columns', 100)

In [2]:
Cases = pd.read_csv('data/Cases2018.csv', encoding='iso-8859-1')

In [3]:
files = os.listdir('ArgText/')
paths = []
for file in files:
    paths.append('ArgText/'+file)
len(paths)

1114

In [4]:
my_dict = {}
for index, path in enumerate(paths):
    with open(path, "rb") as f:
        dk = []
        read_pdf = PyPDF2.PdfFileReader(f)
        for i in range(0,read_pdf.getNumPages()):
            page = read_pdf.getPage(i)
            page_content = page.extractText()
            dk.append(page_content)
    my_dict[files[index].replace(".pdf", "")] = " ".join(dk)##removes pdf from key and turns value array into string



### pickled above memory expensive dictionary

In [5]:
#my_dict['16-1150']
#my_dict.keys()
import pickle

with open('dictionary.pickle', 'wb') as handle:
    pickle.dump(my_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('dictionary.pickle', 'rb') as handle:
    b = pickle.load(handle)

print(my_dict == b)

True


In [162]:
Labels = Cases[['docket','partyWinning']]
Text = pd.DataFrame.from_dict(my_dict, orient="index")

In [163]:
Text = Text.rename(index=str, columns={0: "Document"})

In [164]:
Text['docket'] = Text.index
Text.reset_index(drop=True)

Unnamed: 0,Document,docket
0,1 IN THE SU...,00-1011
1,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,00-1021
2,1 2 \n ...,00-1045
3,12345678910\n111213141516171819202122232425 IN...,00-10666
4,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,00-1072
5,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,00-1073
6,1234\n5\n678\n910\n11\n12\n131415\n161718\n19\...,00-1089
7,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,00-1167
8,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,00-1187
9,1 IN THE SU...,00-121


In [165]:
Text = pd.merge(Text, Labels, on='docket', how='inner')
Text.drop(Text[Text.partyWinning==2.0].index, inplace = True)
Labels = Text[['partyWinning']]
Text.drop('partyWinning', axis=1, inplace= True)

In [89]:
user_defined_stop_words = ['-x','--','x•','n','w','e','washington','x','v','alderson','petitioner','respondent','reporting','company','supreme','court','united','states','states•','•v','•petitioner'] 

i = stopwords.words('english')
j = list(string.punctuation) + user_defined_stop_words

stoplist = set(i).union(j)


In [71]:
#def preprocess(text):
    #return [word.lower() for word in word_tokenize(text) if word.lower() not in stoplist and not word.isdigit()]

In [70]:
#Text['Document'].apply(preprocess)

In [90]:
def preprocess2(data):
    reviews_tokens = []
    for review in data:
        review = review.lower() #Convert to lower-case words
        raw_word_tokens = re.findall(r'(?:\w+)', review,flags = re.UNICODE) #remove pontuaction
        word_tokens = [w for w in raw_word_tokens if w not in stoplist and not w.isdigit()] # do not add stop words
        reviews_tokens.append(word_tokens)
    return reviews_tokens #return all tokens

In [91]:
Text['Tokenized'] = preprocess2(Text.Document)

In [92]:
Text.head()

Unnamed: 0,Document,docket,Tokenized
0,1 IN THE SU...,00-1011,"[deboris, calcano, martinez, et, al, petitione..."
1,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,00-1021,"[rush, prudential, hmo, inc, deborah, c, moran..."
2,1 2 \n ...,00-1045,"[25in, trw, inc, adelaide, andrews, c, tuesday..."
3,12345678910\n111213141516171819202122232425 IN...,00-10666,"[william, joseph, harris, c, monday, march, en..."
4,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,00-1072,"[leonard, edelman, lynchburg, college, c, tues..."


In [93]:
#Text['Tokenized_LC'] = Text['Document'].apply(preprocess)

In [94]:
def make_word_bag(data):
    corpus = preprocess2(data)
    bag_of_words = {}
    word_count = 0
    for sentence in corpus:
        for word in sentence:
            if word not in bag_of_words: 
                bag_of_words[word] = word_count #set indexes
                word_count+=1
    return bag_of_words #index of letters

bag_of_words = make_word_bag(Text.Document)

In [95]:
def features(sentence_tokens,bag_of_words):
    sentence_features = [0 for x in range(len(bag_of_words))]
 
    for word in sentence_tokens:
        index = bag_of_words[word]
        sentence_features[index] +=1
    return sentence_features

In [96]:
def get_rev_features(data,bag_of_words):
    review_features = []
    reviews_text_tokens = preprocess2(data)
    for review_text in reviews_text_tokens:
        feature_review_text = features(review_text,bag_of_words)
        review_features.append(feature_review_text)
    return review_features
 
rev_features = get_rev_features(Text.Document,bag_of_words)

In [97]:
X_train, X_test, y_train, y_test = train_test_split(rev_features, Labels, test_size=0.2, random_state=23)
y_test = np.ravel(y_test)
y_train = np.ravel(y_train)

In [103]:
clf = SGDClassifier(loss='log', penalty='l2',alpha=.01, random_state=23, n_jobs=-1, n_iter = 1200)

In [104]:
mod2 =  clf.fit(X_train, y_train)
predicted_svm = mod2.predict(X_test)
score2 = metrics.accuracy_score(y_test, predicted_svm)
print("accuracy:   %0.3f" % score2)



accuracy:   0.625


In [117]:
max(X_test[0])

162

In [111]:
predicted_svm

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0.,
       1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0.,
       1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0.,
       1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 1., 0., 1., 1.,
       1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 0., 0., 0., 0., 1., 1., 1.,
       0., 0., 0., 0., 1.

In [100]:
from sklearn.metrics import confusion_matrix

In [105]:
tn, fp, fn, tp = confusion_matrix(y_test, predicted_svm).ravel()

In [106]:
(tn, fp, fn, tp)

(24, 60, 37, 138)

In [110]:
bag_of_words

{'deboris': 0,
 'calcano': 1,
 'martinez': 2,
 'et': 3,
 'al': 4,
 'petitioners': 5,
 'immigration': 6,
 'naturalization': 7,
 'service': 8,
 'c': 9,
 'tuesday': 10,
 'april': 11,
 'entitled': 12,
 'matter': 13,
 'came': 14,
 'oral': 15,
 'argument': 16,
 'appearances': 17,
 'lucas': 18,
 'guttentag': 19,
 'esq': 20,
 'new': 21,
 'york': 22,
 'behalf': 23,
 'edwin': 24,
 'kneedler': 25,
 'deputy': 26,
 'solicitor': 27,
 'general': 28,
 'department': 29,
 'justice': 30,
 'inc': 31,
 'fourteenth': 32,
 'street': 33,
 'suite': 34,
 'depo': 35,
 'page': 36,
 'rebuttal': 37,
 'p': 38,
 'r': 39,
 'g': 40,
 'chief': 41,
 'rehnquist': 42,
 'hear': 43,
 'number': 44,
 'mr': 45,
 'may': 46,
 'please': 47,
 'jurisdictional': 48,
 'issue': 49,
 'presented': 50,
 'case': 51,
 'whether': 52,
 'legal': 53,
 'ruling': 54,
 'attorney': 55,
 'pure': 56,
 'question': 57,
 'law': 58,
 'compelling': 59,
 'deportation': 60,
 'long': 61,
 'time': 62,
 'permanent': 63,
 'residents': 64,
 'reviewable': 65,
 'n

In [118]:
pred_proba_df = pd.DataFrame(clf.predict_proba(X_test))
threshold_list = [0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,.7,.75,.8,.85,.9,.95,.99]
for i in threshold_list:
    print ('\n******** For i = {} ******'.format(i))
    Y_test_pred = pred_proba_df.applymap(lambda x: 1 if x>i else 0)
    test_accuracy = metrics.accuracy_score(y_test.reshape(y_test.size,1),
                                           Y_test_pred.iloc[:,1].values.reshape(Y_test_pred.iloc[:,1].values.size,1))
    print('The testing accuracy is {}'.format(test_accuracy))

    print(confusion_matrix(y_test.reshape(y_test.size,1),
                           Y_test_pred.iloc[:,1].values.reshape(Y_test_pred.iloc[:,1].values.size,1)))



******** For i = 0.05 ******
The testing accuracy is 0.6332046332046332
[[ 15  69]
 [ 26 149]]

******** For i = 0.1 ******
The testing accuracy is 0.640926640926641
[[ 20  64]
 [ 29 146]]

******** For i = 0.15 ******
The testing accuracy is 0.637065637065637
[[ 21  63]
 [ 31 144]]

******** For i = 0.2 ******
The testing accuracy is 0.6332046332046332
[[ 21  63]
 [ 32 143]]

******** For i = 0.25 ******
The testing accuracy is 0.6332046332046332
[[ 23  61]
 [ 34 141]]

******** For i = 0.3 ******
The testing accuracy is 0.6332046332046332
[[ 23  61]
 [ 34 141]]

******** For i = 0.35 ******
The testing accuracy is 0.6293436293436293
[[ 23  61]
 [ 35 140]]

******** For i = 0.4 ******
The testing accuracy is 0.6293436293436293
[[ 23  61]
 [ 35 140]]

******** For i = 0.45 ******
The testing accuracy is 0.6254826254826255
[[ 23  61]
 [ 36 139]]

******** For i = 0.5 ******
The testing accuracy is 0.6254826254826255
[[ 24  60]
 [ 37 138]]

******** For i = 0.55 ******
The testing accur

In [122]:
Text.drop('Tokenized', axis=1, inplace =True)

In [123]:
Text.head()

Unnamed: 0,Document,docket
0,1 IN THE SU...,00-1011
1,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,00-1021
2,1 2 \n ...,00-1045
3,12345678910\n111213141516171819202122232425 IN...,00-10666
4,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,00-1072


In [124]:
Text['word_count'] = Text['Document'].apply(lambda x: len(str(x).split(" ")))

In [127]:
Text[['Document', 'word_count']].head()

Unnamed: 0,Document,word_count
0,1 IN THE SU...,48034
1,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,9787
2,1 2 \n ...,36279
3,12345678910\n111213141516171819202122232425 IN...,9936
4,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,8604


In [128]:
Text['char_count'] = Text['Document'].str.len() ## this also includes spaces
Text[['Document', 'char_count']].head()

Unnamed: 0,Document,char_count
0,1 IN THE SU...,109867
1,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,67549
2,1 2 \n ...,101643
3,12345678910\n111213141516171819202122232425 IN...,65714
4,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,57756


In [145]:
def avg_word(sentence):
    words = sentence.split(" ")
    return (sum(len(word) for word in words)/len(words))

Text['avg_word'] = Text['Document'].apply(lambda x: avg_word(x))
Text[['Document','avg_word']].head()

Unnamed: 0,Document,avg_word
0,1 IN THE SU...,1.287296
1,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,5.902013
2,1 2 \n ...,1.801731
3,12345678910\n111213141516171819202122232425 IN...,5.613829
4,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,5.712808


In [132]:
Text[Text['word_count']==48034]

Unnamed: 0,Document,docket,word_count,char_count
0,1 IN THE SU...,00-1011,48034,109867


In [146]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

Text['stopwords'] = Text['Document'].apply(lambda x: len([x for x in x.split(" ") if x in stop]))
Text[['Document','stopwords']].head()

Unnamed: 0,Document,stopwords
0,1 IN THE SU...,4189
1,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,3632
2,1 2 \n ...,4292
3,12345678910\n111213141516171819202122232425 IN...,3705
4,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,3134


In [147]:
Text['numerics'] = Text['Document'].apply(lambda x: len([x for x in x.split(" ") if x.isdigit()]))
Text[['Document','numerics']].head()

Unnamed: 0,Document,numerics
0,1 IN THE SU...,1387
1,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,180
2,1 2 \n ...,1444
3,12345678910\n111213141516171819202122232425 IN...,200
4,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,164


In [149]:
Text['upper'] = Text['Document'].apply(lambda x: len([x for x in x.split(" ") if x.isupper()]))
Text[['Document','upper']].head()

Unnamed: 0,Document,upper
0,1 IN THE SU...,1218
1,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,617
2,1 2 \n ...,1000
3,12345678910\n111213141516171819202122232425 IN...,562
4,12\n3\n4\n5\n6\n7\n8\n910\n11\n12\n13\n14\n15\...,573


### Translate all to lowercase

In [157]:
Text['Document'] = Text['Document'].apply(lambda x: " ".join(x.lower() for x in x.split()))
Text['Document'].head()

0    1 in the supreme court of the united states 2 ...
1    12 3 4 5 6 7 8 910 11 12 13 14 15 16 17 18 19 ...
2    1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
3    12345678910 111213141516171819202122232425 in ...
4    12 3 4 5 6 7 8 910 11 12 13 14 15 16 17 18 19 ...
Name: Document, dtype: object

In [158]:
Text['Document'] = Text['Document'].str.replace('[^\w\s]','')
Text['Document'].head()

0    1 in the supreme court of the united states 2 ...
1    12 3 4 5 6 7 8 910 11 12 13 14 15 16 17 18 19 ...
2    1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
3    12345678910 111213141516171819202122232425 in ...
4    12 3 4 5 6 7 8 910 11 12 13 14 15 16 17 18 19 ...
Name: Document, dtype: object

1293

1293