In [1]:
import numpy as np
import pandas as pd
import statistics

In [2]:
# English stop words that are skipped when the vocabulary is built.
# Grouped by rough grammatical role; order and content are significant only
# for membership tests (`token in stop_words`).
stop_words = [
    # personal / possessive / reflexive pronouns
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    # interrogatives and demonstratives
    'what', 'which', 'who', 'whom',
    'this', 'that', 'these', 'those',
    # auxiliary verbs
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    # articles and conjunctions
    'a', 'an', 'the',
    'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    # prepositions
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
    'into', 'through', 'during', 'before', 'after', 'above', 'below',
    'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    # adverbs
    'again', 'further', 'then', 'once',
    'here', 'there', 'when', 'where', 'why', 'how',
    # quantifiers and determiners
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    # negation, degree and miscellaneous
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    'can', 'will', 'just', 'don', 'should', 'now',
]

In [3]:
def fit(x_train, y_train) :
    """Count feature-value occurrences per class for a categorical Naive Bayes model.

    Parameters
    ----------
    x_train : 2-D array-like, one row per sample and one column per feature.
    y_train : 1-D array-like of class labels, aligned with the rows of ``x_train``.

    Returns
    -------
    dict
        ``result["total_data"]`` is the number of training samples.
        For every class label ``c``:
        ``result[c]["total_count"]`` is the number of samples of that class and
        ``result[c][j][v]`` is how many samples of class ``c`` have value ``v``
        in feature ``j`` (features are keyed 1..n, not 0..n-1).  Every value
        seen anywhere in feature ``j`` gets an entry, so counts of 0 are stored.
    """
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)  # needed so `y_train == curr` yields a row mask

    result = {}
    # Stored once, outside the class loop, so the key exists even with no classes.
    result["total_data"] = len(y_train)

    class_values = set(y_train)
    if not class_values :
        return result

    feat_count = x_train.shape[1] # (rows, cols)
    # Distinct values of each feature over the whole training set.  They do not
    # depend on the class, so compute them once instead of once per class.
    values_per_feature = [set(x_train[:, col]) for col in range(feat_count)]

    for curr in class_values :
        current_class_rows = (y_train == curr)
        x_train_current = x_train[current_class_rows]

        result[curr] = {"total_count": len(x_train_current)}
        for j in range(1, feat_count+1) :
            column = x_train_current[:, j-1]
            result[curr][j] = {
                curr_value: (column == curr_value).sum()
                for curr_value in values_per_feature[j-1]
            }
    return result

In [4]:
def probability(dictionary, x, curr_class) :
    """Return the unnormalised log-probability of ``curr_class`` for sample ``x``.

    Parameters
    ----------
    dictionary : dict produced by ``fit`` (needs ``"total_data"`` and, per class,
        ``"total_count"`` plus one count table per 1-based feature index).
    x : sequence of feature values; ``x[j-1]`` is looked up in feature table ``j``.
    curr_class : class label to score; must be a key of ``dictionary``.

    Returns
    -------
    float
        log prior of the class plus the sum of Laplace-smoothed log likelihoods
        of each feature value.
    """
    class_stats = dictionary[curr_class]
    # log P(class)
    output = np.log(class_stats["total_count"]) - np.log(dictionary["total_data"])
    num_features = len(class_stats.keys()) - 1 # due to "total_count" key
    for j in range(1, num_features+1) :
        xj = x[j-1]
        value_counts = class_stats[j]
        # A value never seen during training has no entry; treat its count as 0
        # so the Laplace correction applies instead of raising KeyError.
        curr_class_with_value_xj = value_counts.get(xj, 0) + 1 # laplace correction
        count_curr_class = class_stats["total_count"] + len(value_counts.keys())
        curr_xj_prob = np.log(curr_class_with_value_xj) - np.log(count_curr_class)
        output = output + curr_xj_prob
    return output

In [5]:
def predictSinglePoint(dictionary, x) :
    """Return the class label with the highest log-probability for sample ``x``.

    ``dictionary`` is the model built by ``fit``; its ``"total_data"`` entry is
    bookkeeping, not a class, and is skipped.  Ties keep the class encountered
    first.  Returns -1 when the model contains no classes.
    """
    candidates = [label for label in dictionary.keys() if not (label == "total_data")]
    if len(candidates) == 0 :
        return -1

    # Seed the search with the first class, then only replace on a strictly
    # better score.
    winner = candidates[0]
    top_score = probability(dictionary, x, winner)
    for label in candidates[1:] :
        score = probability(dictionary, x, label)
        if score > top_score :
            top_score = score
            winner = label
    return winner

In [6]:
def predict(dictionary, x_test) :
    """Classify every row of ``x_test`` and return the predicted labels as a list.

    Each row is passed to ``predictSinglePoint`` together with the fitted
    ``dictionary``; the output order matches the row order of ``x_test``.
    """
    return [predictSinglePoint(dictionary, row) for row in x_test]

In [7]:
# Load the raw cases and drop the columns that are not used below.
df = pd.read_csv("justice.csv")
df = df.drop(['Unnamed: 0', 'ID', 'name', 'href', 'docket', 'first_party', 'second_party', 'facts_len'], axis=1)

# Fill missing categorical values with each column's most frequent value.
# Series.mode() ignores NaN by default, so NaN itself can never be picked as
# the fill value (statistics.mode counts NaN like any other entry).
for column in ['issue_area', 'disposition', 'decision_type', 'first_party_winner'] :
    modes = df[column].mode(dropna=True)
    if modes.empty :
        # Column is entirely missing; nothing sensible to fill with.
        continue
    fill_val = modes.iloc[0]
    df[column] = df[column].fillna(fill_val)

In [9]:
# Distinct disposition labels present after imputation.
xyz = set(df["disposition"])
xyz

{'affirmed',
 'none',
 'reversed',
 'reversed in-part',
 'reversed in-part/remanded',
 'reversed/remanded',
 'vacated',
 'vacated in-part/remanded',
 'vacated/remanded'}

In [15]:
# Target is the decision type; every remaining column becomes a raw feature.
y_train = np.array(df.decision_type)
x_train = df.drop('decision_type', axis=1)
x_train = np.array(x_train)
# Trim the first 3 and last 5 characters of column 1 in every row.
# NOTE(review): presumably this strips an HTML wrapper such as "<p>…</p>\n"
# from the case text — confirm against the CSV.
# Iterate over the actual number of rows rather than a hard-coded count so
# the cell keeps working if the dataset grows or shrinks.
for i in range(x_train.shape[0]) :
    x_train[i][1] = (x_train[i][1])[3:-5]
x_data = x_train[:, 1]

In [11]:
# --- Vocabulary: frequency of every non-stop-word token over all documents ---
diction = {}
k = 1000  # number of most frequent tokens kept as features
for it in x_data :
    currlist = it.split(" ")
    for itr in currlist :
        if(itr in stop_words) :
            continue
        diction[itr] = diction.get(itr, 0) + 1

# Most frequent first; the sort is stable, so ties keep first-seen order.
diction = dict(sorted(diction.items(), key=lambda item: item[1], reverse=True))
features = dict(list(diction.items())[0: k])
feat_keys = list(features.keys())

# token -> column index in the count matrix
idx = {}
for i in range(0, len(feat_keys)) :
    idx[feat_keys[i]] = i


# --- Count matrix: one row per document, one column per kept token ---
# NOTE: this rebinds x_train from the raw feature array to the count matrix.
x_train = []

for it in x_data :
    currlist = (it.split(" "))
    # Row width follows the vocabulary actually kept, not a hard-coded 1000,
    # so changing k (or having fewer than k distinct tokens) stays consistent.
    curr_x_train = [0]*len(feat_keys)
    # Single pass over the tokens instead of one list.count() per feature.
    for itr in currlist :
        pos = idx.get(itr)
        if pos is not None :
            curr_x_train[pos] += 1
    x_train.append(curr_x_train)

x_train = np.array(x_train)

In [17]:
# Build the per-class value-count tables from the current x_train / y_train.
dictionary = fit(x_train, y_train)

In [13]:
# Predict a label for every row of x_train (the same name passed to fit, so
# these are in-sample predictions); the bare name below displays the full list.
y_pred = predict(dictionary, x_train)
y_pred

['majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'per curiam',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'per curiam',
 'majority opinion',
 'majority opinion',
 'per curiam',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'per curiam',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'per curiam',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 'majority opinion',
 

In [14]:
# NOTE(review): this import sits apart from the other imports at the top of the
# notebook; consider moving it there.
from sklearn.metrics import accuracy_score 
# Fraction of y_pred entries equal to y_train. y_pred was produced from
# x_train, so this is accuracy on the training data, not on held-out data.
accuracy_score (y_train,y_pred)

0.8383287920072662