In [6]:
import pandas as pd
import spacy
import numpy as np
import re
import os
import pickle
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.dummy import DummyClassifier
from sklearn.feature_selection import SelectKBest, chi2
from numpy import random
from scipy import sparse
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
random.randint(1, 79, 10)

array([48, 67, 55, 23,  8, 72, 44, 27, 16, 63])

In [8]:
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bingbo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [16]:
#stopwords = set("et al v. s.ct u.s. opinion dissent dissenting concur concurring alito breyer ginsburg gorsuch kagan kavanaugh kennedy o'Connor per curiam rehnquist roberts scalia sotomayor souter stevens thomas".split())
import nltk
nltk.download('stopwords')
stopwords_ = set(stopwords.words('english'))
spaces = re.compile('[\r\n\t\s]+')
numbers = re.compile('[0-9]')
                
def clean(text):
    return ' '.join([token.lemma_.lower() for token in text
            if token.lemma_.lower() not in stopwords_ 
            and not token.is_stop 
            and not token.is_punct
            and token.pos_ in {'NOUN', 'VERB', 'ADJ', 'ADV', 'PROPN', 'X'}
            and len(token.lemma_.lower()) > 1
           ])

def insights(X_sel, y, selector, N=10):
    classifier_sel = LogisticRegression(class_weight='balanced',
                                        solver='saga',
                                        penalty='elasticnet', 
                                        l1_ratio=0.5, 
                                        n_jobs=-1).fit(X_sel, y)
    # get the names of the features
    features = vectorizer.get_feature_names()
    num_feats = len(features)
    reduced_size = classifier_sel.coef_.shape[1]

    # get the indices of the selection
    top_scores = selector.scores_.argsort()[-num_feats:]

    # sort feature names
    best_indicator_terms = [features[i] for i in sorted(top_scores)] 

    # get class with highest weight for each feature
    top_class = [classifier_sel.classes_[c] for c in classifier_sel.coef_.argmax(axis=0)]

    # make DataFrame
    top_indicator_scores = pd.DataFrame(data={'feature': best_indicator_terms[:reduced_size], 
                                              'class': top_class[:reduced_size],
                                              'coefficient': classifier_sel.coef_.max(axis=0)})

    # sort in descending order
    top_indicator_scores.sort_values(['class', 'coefficient'], ascending=False, inplace=True)
    return top_indicator_scores.groupby('class').head(N)

def evaluate(X, y, selector):
    c_dumb, c_lr = {}, {}
    kf = KFold(n_splits=5)
    
    if not(selector is None):
        X_sel = selector.fit_transform(X, y)
    else:
        X_sel = X

    most_frequent = DummyClassifier(strategy='most_frequent')
    most_frequent.fit(X_sel, y)
    # get the performance on the development set
    dumb_predictions = most_frequent.predict(X_sel)
    print("DUMB predictor")
    print(classification_report(y, dumb_predictions))
    c_dumb['prec'], c_dumb['rec'], c_dumb['f1'], _ = precision_recall_fscore_support(y, dumb_predictions, average='macro')
    c_dumb['acc'] = accuracy_score(y, dumb_predictions)
    
    preds = cross_val_predict(LogisticRegression(class_weight='balanced'), X_sel, y, cv=5)
    print(classification_report(y, preds))
    c_lr['prec'], c_lr['rec'], c_lr['f1'], _ = precision_recall_fscore_support(y, preds, average='macro')
    c_lr['acc'] = accuracy_score(y, preds)

    return c_dumb, c_lr

def evaluate_split(X, y, selector):
    c_dumb = {}

    kf = KFold(n_splits=5)
    
    if not(selector is None):
        X_sel = selector.fit_transform(X, y)
    else:
        X_sel = X

    most_frequent = DummyClassifier(strategy='most_frequent')
    most_frequent.fit(X_sel, y)
    # get the performance on the development set
    dumb_predictions = most_frequent.predict(X_sel)
    print("DUMB predictor")
    print(classification_report(y, dumb_predictions))
    
    
    
    forest = RandomForestClassifier(random_state=1)
    multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        y_preds = multi_target_forest.fit(X_train, y_train).predict(X_test)
        
        print(classification_report(y_test, preds))

    return X_sel

  spaces = re.compile('[\r\n\t\s]+')
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bingbo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load Data

In [17]:
text_columns = ['description','title','publicationname']

taxonomy = "vigna"

if taxonomy=="vigna":
    target_fields = ['micro', 'theory', 'macro', 'labor', 'econom', 'industrial', 'international', 'finance', 'public',
          'healthurban', 'history', 'develop', 'lab', 'other']
elif taxonomy=='os':
    target_fields = ['OS_econom', 'OS_labor_health', 'OS_micro', 'OS_envir_agric',
       'OS_devel_intern_growth', 'OS_macro_finance', 'OS_IO', 'OS_public',
       'OS_history', 'OS_other']
elif taxonomy=="beneito":
    target_fields=['B_ldisc', 'B_wellbeing', 'B_demo',
       'B_general', 'B_labor', 'B_historyeco', 'B_health', 'B_education',
       'B_gamet', 'B_hhbehaviour', 'B_develop', 'B_business', 'B_wages',
       'B_naturalr', 'B_public', 'B_demandlab', 'B_trade', 'B_industrial',
       'B_intern', 'B_welfare', 'B_urbanregional', 'B_ecohistory',
       'B_lawecon', 'B_collectivedec', 'B_financial', 'B_macroconsum',
       'B_macropolicy', 'B_prod', 'B_inf', 'B_macromoney',
       'B_macroprices', 'B_math', 'B_other']



if os.path.exists("data/pub.pkl"):
    filehandler = open("data/pub.pkl", 'rb') 
    df = pickle.load(filehandler)
else:
    df = pd.read_stata("data\pub_all_Nov23 (2).dta")

    # select text column
    df['TEXT'] = df['description']
    df = df[df.TEXT != '']

    # clean text
    no_space = df.TEXT.apply(lambda x: re.sub(spaces, ' ', x)).apply(lambda x: re.sub(numbers, '0', x))
    clean_texts = []
    for text in nlp.pipe(no_space):
        clean_texts.append(clean(text)) 

    df['CLEAN_TEXT'] = clean_texts

    # save
    with open("data/pub.pkl", 'wb') as file:
        pickle.dump(df, file)

  df = pd.read_stata("data\pub_all_Nov23 (2).dta")


In [63]:
for c in df.columns:
    print('*',c,'*')
    print(df[c].iloc[1])

* auth_id *
11240938700.0
* cognome *
ABATEMARCO
* initials *
A
* nome *
Antonio
* affiliation *
UniversitÌåÊ di Salerno
* documents *
12.0
* affiliation_id *
60007061.0
* city *
Salerno
* country *
Italy
* areas *
ECON (11); SOCI (11); DECI (2)
* areas2 *
nan
* dup2 *
0.0
* merge3 *
matched (3)
* eid *
2-s2.0-85077148210
* doi *
10.1007/s12232-019-00342-9
* pii *

* pubmed_id *
nan
* title *
Equality of opportunity in health care: access and equal access revisited
* subtype *
ar
* subtypedescription *
Article
* creator *
Abatemarco A.
* afid *
60017293;60007061
* affilname *
UniversitÌÊ degli Studi di Napoli Federico II;UniversitÌÊ degli Studi di Salerno
* affiliation_city *
Naples;Salerno
* affiliation_country *
Italy;Italy
* author_count *
3.0
* author_names *
Abatemarco, Antonio;Beraldo, Sergio;Stroffolini, Francesca
* author_ids *
11240938700;16551797700;7004595039
* author_afids *
60007061;60017293;60017293
* coverdate *
2020-03-01
* coverdisplaydate *
1 March 2020
* publication

## Split data

In [7]:
train = df[df['JEL'] != "-"]
test = df[df['JEL'] == "-"]

#we do cross validation
#train, val = train_test_split(labeled_data, test_size=0.2, random_state=42)

print("Train Size:",len(train),len(train)/len(df)*100)
print("Test Size:",len(test),len(test)/len(df)*100)
#print("Val Size:",len(val),len(val)/len(df)*100)


y = sparse.csr_matrix(train[target_fields])
#y = train[target_fields]


Train Size: 9087 24.735953832752614
Test Size: 27649 75.26404616724739


In [8]:
len(df)

36736

# Classify data

In [65]:
vectorizer = TfidfVectorizer(ngram_range=(1,3), min_df=0.01, max_df=0.5, sublinear_tf=True)
X_train = vectorizer.fit_transform(train['CLEAN_TEXT'])
X_test = vectorizer.transform(test['CLEAN_TEXT'])
#X_val = vectorizer.transform(val['CLEAN_TEXT'])
print(X_train.shape,X_test.shape)

(9042, 1469) (27488, 1469)


In [66]:
selector = SelectKBest(chi2, k=1000)
#selector = None 

In [67]:
results = {}
results_dumb = {}
insight = {}

for t in target_fields:
    y_sel = train[t].astype(int)

    results_dumb[t], results[t] = evaluate(X_train, y_sel, selector)
    insight[t] = insights(X_train, y_sel, selector, 30)

DUMB predictor
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9003
           1       0.00      0.00      0.00        39

    accuracy                           1.00      9042
   macro avg       0.50      0.50      0.50      9042
weighted avg       0.99      1.00      0.99      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9003
           1       0.42      0.72      0.53        39

    accuracy                           0.99      9042
   macro avg       0.71      0.86      0.76      9042
weighted avg       1.00      0.99      1.00      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      8845
           1       0.00      0.00      0.00       197

    accuracy                           0.98      9042
   macro avg       0.49      0.50      0.49      9042
weighted avg       0.96      0.98      0.97      9042

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      8845
           1       0.34      0.77      0.47       197

    accuracy                           0.96      9042
   macro avg       0.67      0.87      0.72      9042
weighted avg       0.98      0.96      0.97      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      8625
           1       0.00      0.00      0.00       417

    accuracy                           0.95      9042
   macro avg       0.48      0.50      0.49      9042
weighted avg       0.91      0.95      0.93      9042

              precision    recall  f1-score   support

           0       0.99      0.94      0.97      8625
           1       0.41      0.83      0.55       417

    accuracy                           0.94      9042
   macro avg       0.70      0.89      0.76      9042
weighted avg       0.96      0.94      0.95      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      8893
           1       0.00      0.00      0.00       149

    accuracy                           0.98      9042
   macro avg       0.49      0.50      0.50      9042
weighted avg       0.97      0.98      0.98      9042

              precision    recall  f1-score   support

           0       1.00      0.97      0.98      8893
           1       0.33      0.73      0.45       149

    accuracy                           0.97      9042
   macro avg       0.66      0.85      0.72      9042
weighted avg       0.98      0.97      0.98      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      8566
           1       0.00      0.00      0.00       476

    accuracy                           0.95      9042
   macro avg       0.47      0.50      0.49      9042
weighted avg       0.90      0.95      0.92      9042

              precision    recall  f1-score   support

           0       0.99      0.94      0.96      8566
           1       0.42      0.84      0.56       476

    accuracy                           0.93      9042
   macro avg       0.71      0.89      0.76      9042
weighted avg       0.96      0.93      0.94      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      8877
           1       0.00      0.00      0.00       165

    accuracy                           0.98      9042
   macro avg       0.49      0.50      0.50      9042
weighted avg       0.96      0.98      0.97      9042

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      8877
           1       0.31      0.72      0.43       165

    accuracy                           0.97      9042
   macro avg       0.65      0.85      0.71      9042
weighted avg       0.98      0.97      0.97      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      8632
           1       0.00      0.00      0.00       410

    accuracy                           0.95      9042
   macro avg       0.48      0.50      0.49      9042
weighted avg       0.91      0.95      0.93      9042

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      8632
           1       0.58      0.87      0.69       410

    accuracy                           0.97      9042
   macro avg       0.79      0.92      0.84      9042
weighted avg       0.97      0.97      0.97      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      8713
           1       0.00      0.00      0.00       329

    accuracy                           0.96      9042
   macro avg       0.48      0.50      0.49      9042
weighted avg       0.93      0.96      0.95      9042

              precision    recall  f1-score   support

           0       1.00      0.97      0.99      8713
           1       0.57      0.90      0.70       329

    accuracy                           0.97      9042
   macro avg       0.78      0.94      0.84      9042
weighted avg       0.98      0.97      0.97      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      8108
           1       0.00      0.00      0.00       934

    accuracy                           0.90      9042
   macro avg       0.45      0.50      0.47      9042
weighted avg       0.80      0.90      0.85      9042

              precision    recall  f1-score   support

           0       0.98      0.94      0.96      8108
           1       0.62      0.86      0.72       934

    accuracy                           0.93      9042
   macro avg       0.80      0.90      0.84      9042
weighted avg       0.94      0.93      0.93      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      8716
           1       0.00      0.00      0.00       326

    accuracy                           0.96      9042
   macro avg       0.48      0.50      0.49      9042
weighted avg       0.93      0.96      0.95      9042

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      8716
           1       0.37      0.83      0.51       326

    accuracy                           0.94      9042
   macro avg       0.68      0.89      0.74      9042
weighted avg       0.97      0.94      0.95      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.79      1.00      0.89      7178
           1       0.00      0.00      0.00      1864

    accuracy                           0.79      9042
   macro avg       0.40      0.50      0.44      9042
weighted avg       0.63      0.79      0.70      9042

              precision    recall  f1-score   support

           0       0.94      0.88      0.91      7178
           1       0.64      0.79      0.71      1864

    accuracy                           0.86      9042
   macro avg       0.79      0.84      0.81      9042
weighted avg       0.88      0.86      0.87      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      8837
           1       0.00      0.00      0.00       205

    accuracy                           0.98      9042
   macro avg       0.49      0.50      0.49      9042
weighted avg       0.96      0.98      0.97      9042

              precision    recall  f1-score   support

           0       0.99      0.96      0.97      8837
           1       0.27      0.66      0.38       205

    accuracy                           0.95      9042
   macro avg       0.63      0.81      0.68      9042
weighted avg       0.98      0.95      0.96      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      8781
           1       0.00      0.00      0.00       261

    accuracy                           0.97      9042
   macro avg       0.49      0.50      0.49      9042
weighted avg       0.94      0.97      0.96      9042

              precision    recall  f1-score   support

           0       1.00      0.96      0.98      8781
           1       0.42      0.87      0.57       261

    accuracy                           0.96      9042
   macro avg       0.71      0.92      0.77      9042
weighted avg       0.98      0.96      0.97      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      8091
           1       0.00      0.00      0.00       951

    accuracy                           0.89      9042
   macro avg       0.45      0.50      0.47      9042
weighted avg       0.80      0.89      0.85      9042

              precision    recall  f1-score   support

           0       0.98      0.96      0.97      8091
           1       0.73      0.87      0.79       951

    accuracy                           0.95      9042
   macro avg       0.86      0.91      0.88      9042
weighted avg       0.96      0.95      0.95      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.86      1.00      0.92      7736
           1       0.00      0.00      0.00      1306

    accuracy                           0.86      9042
   macro avg       0.43      0.50      0.46      9042
weighted avg       0.73      0.86      0.79      9042

              precision    recall  f1-score   support

           0       0.96      0.90      0.93      7736
           1       0.57      0.79      0.67      1306

    accuracy                           0.89      9042
   macro avg       0.77      0.85      0.80      9042
weighted avg       0.91      0.89      0.89      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      8563
           1       0.00      0.00      0.00       479

    accuracy                           0.95      9042
   macro avg       0.47      0.50      0.49      9042
weighted avg       0.90      0.95      0.92      9042

              precision    recall  f1-score   support

           0       0.99      0.92      0.95      8563
           1       0.37      0.81      0.51       479

    accuracy                           0.92      9042
   macro avg       0.68      0.87      0.73      9042
weighted avg       0.96      0.92      0.93      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      8428
           1       0.00      0.00      0.00       614

    accuracy                           0.93      9042
   macro avg       0.47      0.50      0.48      9042
weighted avg       0.87      0.93      0.90      9042

              precision    recall  f1-score   support

           0       0.99      0.94      0.96      8428
           1       0.51      0.84      0.63       614

    accuracy                           0.93      9042
   macro avg       0.75      0.89      0.80      9042
weighted avg       0.96      0.93      0.94      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.82      1.00      0.90      7380
           1       0.00      0.00      0.00      1662

    accuracy                           0.82      9042
   macro avg       0.41      0.50      0.45      9042
weighted avg       0.67      0.82      0.73      9042

              precision    recall  f1-score   support

           0       0.95      0.87      0.91      7380
           1       0.59      0.80      0.68      1662

    accuracy                           0.86      9042
   macro avg       0.77      0.84      0.80      9042
weighted avg       0.88      0.86      0.87      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      8408
           1       0.00      0.00      0.00       634

    accuracy                           0.93      9042
   macro avg       0.46      0.50      0.48      9042
weighted avg       0.86      0.93      0.90      9042

              precision    recall  f1-score   support

           0       0.98      0.92      0.95      8408
           1       0.42      0.79      0.55       634

    accuracy                           0.91      9042
   macro avg       0.70      0.85      0.75      9042
weighted avg       0.94      0.91      0.92      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      8674
           1       0.00      0.00      0.00       368

    accuracy                           0.96      9042
   macro avg       0.48      0.50      0.49      9042
weighted avg       0.92      0.96      0.94      9042

              precision    recall  f1-score   support

           0       0.99      0.92      0.95      8674
           1       0.29      0.78      0.42       368

    accuracy                           0.91      9042
   macro avg       0.64      0.85      0.69      9042
weighted avg       0.96      0.91      0.93      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      8477
           1       0.00      0.00      0.00       565

    accuracy                           0.94      9042
   macro avg       0.47      0.50      0.48      9042
weighted avg       0.88      0.94      0.91      9042

              precision    recall  f1-score   support

           0       0.99      0.94      0.96      8477
           1       0.46      0.80      0.59       565

    accuracy                           0.93      9042
   macro avg       0.73      0.87      0.77      9042
weighted avg       0.95      0.93      0.94      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      8888
           1       0.00      0.00      0.00       154

    accuracy                           0.98      9042
   macro avg       0.49      0.50      0.50      9042
weighted avg       0.97      0.98      0.97      9042

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      8888
           1       0.27      0.68      0.38       154

    accuracy                           0.96      9042
   macro avg       0.63      0.82      0.68      9042
weighted avg       0.98      0.96      0.97      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      8718
           1       0.00      0.00      0.00       324

    accuracy                           0.96      9042
   macro avg       0.48      0.50      0.49      9042
weighted avg       0.93      0.96      0.95      9042

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      8718
           1       0.35      0.75      0.48       324

    accuracy                           0.94      9042
   macro avg       0.67      0.85      0.72      9042
weighted avg       0.97      0.94      0.95      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      8526
           1       0.00      0.00      0.00       516

    accuracy                           0.94      9042
   macro avg       0.47      0.50      0.49      9042
weighted avg       0.89      0.94      0.92      9042

              precision    recall  f1-score   support

           0       0.99      0.94      0.96      8526
           1       0.43      0.80      0.56       516

    accuracy                           0.93      9042
   macro avg       0.71      0.87      0.76      9042
weighted avg       0.96      0.93      0.94      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.87      1.00      0.93      7875
           1       0.00      0.00      0.00      1167

    accuracy                           0.87      9042
   macro avg       0.44      0.50      0.47      9042
weighted avg       0.76      0.87      0.81      9042

              precision    recall  f1-score   support

           0       0.97      0.93      0.95      7875
           1       0.62      0.83      0.71      1167

    accuracy                           0.91      9042
   macro avg       0.80      0.88      0.83      9042
weighted avg       0.93      0.91      0.92      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      8601
           1       0.00      0.00      0.00       441

    accuracy                           0.95      9042
   macro avg       0.48      0.50      0.49      9042
weighted avg       0.90      0.95      0.93      9042

              precision    recall  f1-score   support

           0       0.99      0.92      0.95      8601
           1       0.32      0.76      0.45       441

    accuracy                           0.91      9042
   macro avg       0.65      0.84      0.70      9042
weighted avg       0.95      0.91      0.93      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      8676
           1       0.00      0.00      0.00       366

    accuracy                           0.96      9042
   macro avg       0.48      0.50      0.49      9042
weighted avg       0.92      0.96      0.94      9042

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      8676
           1       0.43      0.83      0.57       366

    accuracy                           0.95      9042
   macro avg       0.71      0.89      0.77      9042
weighted avg       0.97      0.95      0.96      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      8201
           1       0.00      0.00      0.00       841

    accuracy                           0.91      9042
   macro avg       0.45      0.50      0.48      9042
weighted avg       0.82      0.91      0.86      9042

              precision    recall  f1-score   support

           0       0.98      0.89      0.93      8201
           1       0.42      0.82      0.56       841

    accuracy                           0.88      9042
   macro avg       0.70      0.85      0.74      9042
weighted avg       0.93      0.88      0.90      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      8037
           1       0.00      0.00      0.00      1005

    accuracy                           0.89      9042
   macro avg       0.44      0.50      0.47      9042
weighted avg       0.79      0.89      0.84      9042

              precision    recall  f1-score   support

           0       0.97      0.88      0.92      8037
           1       0.45      0.81      0.58      1005

    accuracy                           0.87      9042
   macro avg       0.71      0.84      0.75      9042
weighted avg       0.92      0.87      0.89      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      8597
           1       0.00      0.00      0.00       445

    accuracy                           0.95      9042
   macro avg       0.48      0.50      0.49      9042
weighted avg       0.90      0.95      0.93      9042

              precision    recall  f1-score   support

           0       0.99      0.96      0.97      8597
           1       0.51      0.87      0.64       445

    accuracy                           0.95      9042
   macro avg       0.75      0.92      0.81      9042
weighted avg       0.97      0.95      0.96      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      8411
           1       0.00      0.00      0.00       631

    accuracy                           0.93      9042
   macro avg       0.47      0.50      0.48      9042
weighted avg       0.87      0.93      0.90      9042

              precision    recall  f1-score   support

           0       0.99      0.92      0.96      8411
           1       0.46      0.85      0.59       631

    accuracy                           0.92      9042
   macro avg       0.72      0.89      0.77      9042
weighted avg       0.95      0.92      0.93      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.74      1.00      0.85      6661
           1       0.00      0.00      0.00      2381

    accuracy                           0.74      9042
   macro avg       0.37      0.50      0.42      9042
weighted avg       0.54      0.74      0.62      9042

              precision    recall  f1-score   support

           0       0.90      0.81      0.85      6661
           1       0.59      0.75      0.66      2381

    accuracy                           0.79      9042
   macro avg       0.74      0.78      0.75      9042
weighted avg       0.82      0.79      0.80      9042



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DUMB predictor
              precision    recall  f1-score   support

           0       0.78      1.00      0.87      7016
           1       0.00      0.00      0.00      2026

    accuracy                           0.78      9042
   macro avg       0.39      0.50      0.44      9042
weighted avg       0.60      0.78      0.68      9042

              precision    recall  f1-score   support

           0       0.90      0.78      0.83      7016
           1       0.47      0.69      0.56      2026

    accuracy                           0.76      9042
   macro avg       0.68      0.73      0.70      9042
weighted avg       0.80      0.76      0.77      9042





In [76]:
pd.set_option("display.max_columns", None)
pd.DataFrame.from_dict(results_dumb)

Unnamed: 0,B_ldisc,B_wellbeing,B_demo,B_general,B_labor,B_historyeco,B_health,B_education,B_gamet,B_hhbehaviour,B_develop,B_business,B_wages,B_naturalr,B_public,B_demandlab,B_trade,B_industrial,B_intern,B_welfare,B_urbanregional,B_ecohistory,B_lawecon,B_collectivedec,B_financial,B_macroconsum,B_macropolicy,B_prod,B_inf,B_macromoney,B_macroprices,B_math,B_other
prec,0.497843,0.489106,0.476941,0.491761,0.473678,0.490876,0.477328,0.481807,0.448352,0.481973,0.396925,0.488664,0.485567,0.447412,0.427781,0.473512,0.466047,0.408096,0.464941,0.479651,0.468757,0.491484,0.482084,0.471466,0.435468,0.475614,0.479761,0.453495,0.444426,0.475393,0.465107,0.368337,0.387967
rec,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
f1,0.498919,0.494493,0.488198,0.495846,0.486483,0.495396,0.488401,0.490735,0.47277,0.490821,0.44254,0.494267,0.492678,0.472247,0.46108,0.486396,0.482427,0.449397,0.481834,0.489614,0.483875,0.495706,0.490878,0.485314,0.465508,0.487502,0.489672,0.475613,0.470578,0.487386,0.481923,0.424186,0.436916
acc,0.995687,0.978213,0.953882,0.983521,0.947357,0.981752,0.954656,0.963614,0.896704,0.963946,0.793851,0.977328,0.971135,0.894824,0.855563,0.947025,0.932095,0.816191,0.929883,0.959301,0.937514,0.982968,0.964167,0.942933,0.870936,0.951228,0.959522,0.90699,0.888852,0.950785,0.930215,0.736673,0.775935


In [77]:
results = pd.DataFrame.from_dict(results)
results

Unnamed: 0,B_ldisc,B_wellbeing,B_demo,B_general,B_labor,B_historyeco,B_health,B_education,B_gamet,B_hhbehaviour,B_develop,B_business,B_wages,B_naturalr,B_public,B_demandlab,B_trade,B_industrial,B_intern,B_welfare,B_urbanregional,B_ecohistory,B_lawecon,B_collectivedec,B_financial,B_macroconsum,B_macropolicy,B_prod,B_inf,B_macromoney,B_macroprices,B_math,B_other
prec,0.708342,0.665475,0.702937,0.66039,0.705686,0.650298,0.785757,0.784331,0.798827,0.681532,0.789265,0.629816,0.707617,0.857875,0.768326,0.679431,0.748282,0.770121,0.700761,0.640046,0.72503,0.631132,0.671766,0.709897,0.798321,0.653193,0.713738,0.701339,0.713472,0.750533,0.72224,0.742686,0.68339
rec,0.856808,0.866403,0.888744,0.853065,0.887072,0.845398,0.919086,0.937166,0.896896,0.89056,0.837728,0.810659,0.918788,0.914565,0.846928,0.8674,0.889896,0.837947,0.851653,0.850666,0.869211,0.821686,0.850907,0.870171,0.878691,0.836393,0.892537,0.852855,0.84351,0.915152,0.88539,0.778882,0.732667
f1,0.76276,0.723883,0.76018,0.71765,0.761409,0.705788,0.837775,0.842514,0.837784,0.740905,0.808648,0.67805,0.773131,0.883326,0.798316,0.731535,0.798621,0.795034,0.747747,0.688138,0.774507,0.682362,0.724957,0.761717,0.830675,0.700121,0.77202,0.744153,0.752229,0.808391,0.774191,0.754821,0.695501
acc,0.99447,0.961955,0.937956,0.970582,0.930768,0.965052,0.965273,0.971909,0.929772,0.942712,0.863747,0.951228,0.961402,0.952555,0.885092,0.916833,0.934085,0.86065,0.908538,0.913183,0.929772,0.963061,0.941716,0.928556,0.913294,0.909754,0.949458,0.87923,0.870383,0.952112,0.919044,0.794293,0.75647


In [78]:
ins_df = pd.DataFrame(columns = target_fields)
for t in target_fields:
    ins_df[t] = insight[t]['feature'].values
ins_df

Unnamed: 0,B_ldisc,B_wellbeing,B_demo,B_general,B_labor,B_historyeco,B_health,B_education,B_gamet,B_hhbehaviour,B_develop,B_business,B_wages,B_naturalr,B_public,B_demandlab,B_trade,B_industrial,B_intern,B_welfare,B_urbanregional,B_ecohistory,B_lawecon,B_collectivedec,B_financial,B_macroconsum,B_macropolicy,B_prod,B_inf,B_macromoney,B_macroprices,B_math,B_other
0,gender,well,gender,social,workers,economics,health,education,game,households,growth,employees,wage,environmental,tax,retirement,export,firms,foreign,externalities,regional,historical,corruption,political,banking,saving,fiscal,firms,information,inflation,fluctuations,agent based,cultural
1,students,social,children,network,worker,theory,00,school,experimental,consumption,innovation,concerns,wages,energy,public,supply,trade,duopoly,migration,individual,regions,gdp,legal,corruption,financial,unemployment,labor,pore,network,monetary,inflation,dynamics,social
2,tend,contributes,women,economics,unemployment,variety,care,students,experimentre,consumers,patent,creation,earnings,climata,fiscal,employment,integration,industry,international,inequality,transport,period,law,elections,banks,inflation,policy,manufacturing,beliefs,nominal,cycles,simulation,inequality
3,networks,living,childre,individuals,union,considered,consumption,university,subjects,preferences,development,effort,employees,oil,redistribution,labor,international,regulatory,country,welfare,spatial,business,enforcement,electoral,bank,consumption,spending,firm,learning,policy,economy,spatial,well
4,women,socio,parents,prova,wage,evolutionary,treatment,educational,games,demand,knowledge,worker,pay,resource,government,labour,country,firm,exchangere,society,local,world,system,enforcement,portfolio,aggregata,fiscal policy,value,expectations,banks,recession,estimation,members
5,negativo impact,indicators,life,building,labour market,complex,female,research,experiments,household,innovativa,corporare,labour,pollution,benefits,human capital,tradere,entry,domestic,income,location,growth,supports,citizens,risk,gdp,macroeconomic,organizations,risk,asset,cycle,var,financial markets
6,scheme,measured,population,samplere,job,system,endogenous,evaluation,perfect,saving,technologies,variety,labor market,green,taxes,work,exports,consumers,linkages,social,specifications,united,lead,groups,asset,investment,tax,frontier,uncertaintyo,central,shocks,system,willingness
7,field,migration,female,life,employment,particulare,likely,application,evolutionary,single,technological,knowledge,workers,emissions,provision,human,global,pricing,direct,individuals,areas,capitare,property,social,credit,sensitivity,green,cournot,project,money,output,results show,static
8,application,countries,demographic,cultural,migration,great,quality,south,participants,event,technology,taxation,students,carbon,security,children,regions,industrial,exchangere rare,organizations,area,population,contract,outcome,stock,rola,real,prices,decision,monetary policy,based,investigate,money
9,explain,households,endogenous,lower,mobility,problems,population,academic,cooperation,consumer,developmentre,workers,skilled,sustainable,debt,article,union,business,stronger,prova,eu,cycles,effect,government,volatility,general equilibrium,economy,institutions,signal,uncertaintyo,monetary,statistical,people


In [24]:
result_no_stopwords = pd.DataFrame.from_dict(results)

In [41]:
print('MEAN F1: ', np.round(results.loc['f1'].mean(),3))
print('STOPWORDS MEAN F1: ', np.round(result_no_stopwords.loc['f1'].mean(),3))

MEAN F1:  0.793
STOPWORDS MEAN F1:  0.789


In [50]:
#pd.DataFrame.from_dict(insight)

In [51]:
#pd.concat([insight[t]['feature'][0:10] for t in target_fields],axis=1,ignore_index=False)

In [None]:
pd.DataFrame.from_dict(results)kf = KFold(n_splits=5)
    
if not(selector is None):
    X_sel = selector.fit_transform(X, y)
else:
    X_sel = X_train

most_frequent = DummyClassifier(strategy='most_frequent')
most_frequent.fit(X_sel, y)
# get the performance on the development set
dumb_predictions = most_frequent.predict(X_sel)
print("DUMB predictor")
#print(classification_report(y, dumb_predictions))

print("Multi-output Random Forest")
forest = RandomForestClassifier(random_state=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)

for train_index, test_index in kf.split(X_train):
    print("SPLIT")
    X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
    y_train_cv, y_test_cv = y[train_index], y[test_index]

    y_preds = multi_target_forest.fit(X_train_cv, y_train_cv).predict(X_test_cv)

    print(classification_report(y_test_cv, preds))

In [None]:
kf = KFold(n_splits=5)

print("Multi-output LR")
classifier = LogisticRegression(class_weight='balanced',
                                        solver='saga',
                                        penalty='elasticnet', 
                                        l1_ratio=0.5, 
                                        n_jobs=-1)
multi_target_forest = MultiOutputClassifier(classifier, n_jobs=-1)

for train_index, test_index in kf.split(X_train):
    print("SPLIT")
    X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
    y_train_cv, y_test_cv = y[train_index], y[test_index]

    y_preds = multi_target_forest.fit(X_train_cv, y_train_cv).predict(X_test_cv)

    print(classification_report(y_test_cv, preds))

Multi-output LR
SPLIT


In [25]:
train['micro'].value_counts()

0.0    5791
1.0    3223
Name: micro, dtype: int64

In [178]:
I[I['class'] == 'Kavanaugh']#.to_excel('data/indicators_judges.xlsx', index=None)

Unnamed: 0,feature,class,coefficient
3869,court declare,Kavanaugh,4.754086
4352,dangerousness,Kavanaugh,4.546067
2323,case reasonable,Kavanaugh,3.775475
2322,case reason,Kavanaugh,3.003807
3547,contrast court,Kavanaugh,2.716101
4250,criminal offense,Kavanaugh,2.685944
4757,deterrent,Kavanaugh,2.595863
3865,court decision,Kavanaugh,2.508349
67,00 stat,Kavanaugh,2.436689
3619,corporate,Kavanaugh,2.358015


In [179]:
y = df.TEXT_TYPE

X_sel = evaluate(X, y, selector)
insights(X_sel, y, selector)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          NA       0.00      0.00      0.00         1
  concurring       0.00      0.00      0.00       901
  dissenting       0.00      0.00      0.00      1192
        main       0.43      1.00      0.60      1598

    accuracy                           0.43      3692
   macro avg       0.11      0.25      0.15      3692
weighted avg       0.19      0.43      0.26      3692

              precision    recall  f1-score   support

          NA       0.00      0.00      0.00         1
  concurring       0.79      0.82      0.81       901
  dissenting       0.87      0.82      0.84      1192
        main       0.98      0.99      0.98      1598

    accuracy                           0.90      3692
   macro avg       0.66      0.66      0.66      3692
weighted avg       0.90      0.90      0.89      3692





Unnamed: 0,feature,class,coefficient
5539,eng rep,main,5.420973
1875,bench,main,5.310017
1874,belong,main,5.303003
2060,brown,main,4.423952
3463,constitutionally protect,main,4.176779
3462,constitutionally permissible,main,4.166164
2891,co united states,main,3.871987
2059,brothers,main,3.584215
1115,an,main,3.297807
2702,claim allow,main,3.00062


In [154]:
y = df.CAT1

X_sel = evaluate(X, y, selector)
insights(X_sel, y, selector)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                             precision    recall  f1-score   support

    ADMINISTRATIVE PRACTICE       0.00      0.00      0.00        18
                AGRICULTURE       0.00      0.00      0.00        19
                  ANTITRUST       0.00      0.00      0.00        27
                 BANKRUPTCY       0.00      0.00      0.00        71
     BUSINESS ORGANIZATIONS       0.00      0.00      0.00         2
               CIVIL RIGHTS       0.00      0.00      0.00       443
             COMMERCIAL LAW       0.00      0.00      0.00        59
                 COPYRIGHTS       0.00      0.00      0.00        23
           CRIMINAL JUSTICE       0.32      1.00      0.48      1174
                  ECOMMERCE       0.00      0.00      0.00         9
                  EDUCATION       0.00      0.00      0.00        85
       ENERGY AND UTILITIES       0.00      0.00      0.00        56
          ENVIRONMENTAL LAW       0.00      0.00      0.00        80
ESTATE PLANNING AND PROBATE      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                             precision    recall  f1-score   support

    ADMINISTRATIVE PRACTICE       0.26      0.67      0.38        18
                AGRICULTURE       0.29      0.21      0.24        19
                  ANTITRUST       0.79      0.85      0.82        27
                 BANKRUPTCY       0.84      0.90      0.87        71
     BUSINESS ORGANIZATIONS       0.00      0.00      0.00         2
               CIVIL RIGHTS       0.74      0.48      0.58       443
             COMMERCIAL LAW       0.33      0.49      0.39        59
                 COPYRIGHTS       0.76      0.83      0.79        23
           CRIMINAL JUSTICE       0.89      0.90      0.90      1174
                  ECOMMERCE       0.30      0.89      0.44         9
                  EDUCATION       0.50      0.76      0.61        85
       ENERGY AND UTILITIES       0.67      0.50      0.57        56
          ENVIRONMENTAL LAW       0.76      0.79      0.77        80
ESTATE PLANNING AND PROBATE      

Unnamed: 0,feature,class,coefficient
5804,excess,VETERANS,2.671189
5805,excessive,VETERANS,2.560751
5777,exacting,VETERANS,2.063851
4857,disadvantage,VETERANS,1.870035
2169,capital murder,VETERANS,1.459619
...,...,...,...
256,0d citation,ADMINISTRATIVE PRACTICE,1.627738
263,0d colo,ADMINISTRATIVE PRACTICE,1.546033
4290,ct,ADMINISTRATIVE PRACTICE,1.525705
262,0d collect case,ADMINISTRATIVE PRACTICE,1.450465


In [155]:
y = df.CAT2

X_sel = evaluate(X, y, selector)
insights(X_sel, y, selector)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                precision    recall  f1-score   support

                                     0.00      0.00      0.00         5
                      Abortion       0.00      0.00      0.00        26
                    Abstention       0.00      0.00      0.00         3
              Additions To Tax       0.00      0.00      0.00         1
       Administrative Practice       0.00      0.00      0.00         3
                     Admission       0.00      0.00      0.00        13
                      Adoption       0.00      0.00      0.00         1
                   Advertising       0.00      0.00      0.00         6
                   Agriculture       0.00      0.00      0.00         1
                        Aliens       0.00      0.00      0.00         2
Alternative Dispute Resolution       0.00      0.00      0.00        25
                     Antitrust       0.00      0.00      0.00         1
                       Appeals       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                precision    recall  f1-score   support

                                     0.26      1.00      0.42         5
                      Abortion       0.81      0.96      0.88        26
                    Abstention       0.38      1.00      0.55         3
              Additions To Tax       0.00      0.00      0.00         1
       Administrative Practice       0.60      1.00      0.75         3
                     Admission       0.35      0.69      0.46        13
                      Adoption       0.00      0.00      0.00         1
                   Advertising       0.00      0.00      0.00         6
                   Agriculture       0.00      0.00      0.00         1
                        Aliens       0.29      1.00      0.44         2
Alternative Dispute Resolution       0.44      0.76      0.56        25
                     Antitrust       0.00      0.00      0.00         1
                       Appeals       0.00      0.00      0.00  

Unnamed: 0,feature,class,coefficient
3356,consequence court,Zoning And Planning,1.436060
5365,eeoc ed,Zoning And Planning,1.039526
5974,fact intensive,Zoning And Planning,0.718291
3557,control law,Zoning And Planning,0.699847
229,0d alteration,Zoning And Planning,0.671328
...,...,...,...
1122,analysis case,,0.675761
776,admin news,,0.646892
250,0d case court,,0.607528
5482,employment practice,,0.549907


In [162]:
len(df.CAT2.unique())

332