In [85]:
import pandas as pd
import numpy as np
import json
import nltk
import re 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn import metrics

#### Functions for data

In [9]:
# separating words from tags
def words(l):
    words, tags = zip(*l)
    return [word for word in words]

def tags(l):
    words, tags = zip(*l)
    return [tag.lower() for tag in tags]


### Features Dataframe
Vectorizing all the words and tags, labelling positive/negative and validating with supervised classification models.

In [86]:
# importing
with open('tokens.json') as json_data:
    data = json.load(json_data)
    
features = pd.DataFrame()
features['main'] = data.values()
features['words'] = features['main'].apply(words).apply(' '.join)
features['tags'] = features['main'].apply(tags).apply(' '.join)

#### Polarity Checking for Positive/Negative Labels

In [70]:
# averaging words more common in negative rullings
lengths = features['words'].apply(lambda x: len(x.split()))

polar = np.diag(features['main'].apply(lambda x: (len(re.findall('nem', str(x)))+\
                                              len(re.findall('não', str(x)))+\
                                              len(re.findall('inexiste', str(x)))+\
                                              len(re.findall('negado', str(x)))+\
                                              len(re.findall('deveria', str(x)))+\
                                              len(re.findall('falta', str(x)))+\
                                              len(re.findall('demonstre', str(x)))+\
                                              len(re.findall('improced.+', str(x)))*1.5+\
                                              len(re.findall('insucesso', str(x))))/lengths))

In [71]:
features['polarity'] = [1 if p <= 0.03 else 0 for p in polar]
features['polarity'].sum()

9822

#### Supervised Learning 
Using classification to validate the labels

In [9]:
X = np.array(features['words'])
y = np.array(features['polarity'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=17)

In [10]:
#LogisticRegression
pipeline = Pipeline([
                    ('tfidf', TfidfVectorizer()),
                        ('clf', OneVsRestClassifier(
                            LogisticRegression(), n_jobs=1))])

parameters = {"clf__estimator__C": [0.01, 0.1, 1],
                "clf__estimator__class_weight": ['balanced', None],
                     "clf__estimator__solver": ['saga', 'lbfgs'],
                          "clf__estimator__max_iter": [100, 500]
             }

grid_search_tune = GridSearchCV(pipeline, parameters, cv=3, n_jobs=3, verbose=10)
grid_search_tune.fit(X_train,y_train)
best_log = grid_search_tune.best_estimator_

y_pred_log = best_log.predict(X_test)
print(classification_report(y_test, y_pred_log))

cross_val_score(best_log, X, y, cv=5)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:   20.0s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   30.9s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   35.6s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:   51.7s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  1.0min
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  1.2min
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:  1.9min
[Parallel(n_jobs=3)]: Done  72 out of  72 | elapsed:  2.0min finished


              precision    recall  f1-score   support

           0       0.66      0.64      0.65       478
           1       0.67      0.68      0.68       507

    accuracy                           0.66       985
   macro avg       0.66      0.66      0.66       985
weighted avg       0.66      0.66      0.66       985



array([0.63451777, 0.65228426, 0.65126904, 0.64213198, 0.65396341])

In [12]:
#LinearSVC
pipeline = Pipeline([
                    ('tfidf', TfidfVectorizer()),
                        ('clf', OneVsRestClassifier(
                            LinearSVC(), n_jobs=1))])

parameters = {"clf__estimator__C": [0.01, 0.1, 1],
                "clf__estimator__class_weight": ['balanced', None]
             }

grid_search_tune = GridSearchCV(pipeline, parameters, cv=3, n_jobs=3, verbose=10)
grid_search_tune.fit(X_train,y_train)
best_svc = grid_search_tune.best_estimator_

y_pred_svc = best_svc.predict(X_test)
print(classification_report(y_test, y_pred_svc))

cross_val_score(best_svc, X, y, cv=5)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    8.2s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   16.3s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   20.5s
[Parallel(n_jobs=3)]: Done  15 out of  18 | elapsed:   25.5s remaining:    5.0s
[Parallel(n_jobs=3)]: Done  18 out of  18 | elapsed:   30.1s finished


              precision    recall  f1-score   support

           0       0.66      0.63      0.64       478
           1       0.67      0.69      0.68       507

    accuracy                           0.66       985
   macro avg       0.66      0.66      0.66       985
weighted avg       0.66      0.66      0.66       985



array([0.63451777, 0.64873096, 0.65228426, 0.64263959, 0.65142276])

In [13]:
grid_search_tune.best_params_

{'clf__estimator__C': 0.1, 'clf__estimator__class_weight': 'balanced'}

#### For better results on finding negative rullings

In [30]:
polar = np.diag(features['main'].apply(lambda x: (len(re.findall('nem', x))*1.2+\
                                              len(re.findall('não', x))+\
                                              len(re.findall('inexiste', x))*1.2+\
                                              len(re.findall('negado', x))*1.2+\
                                              len(re.findall('deveria', x))+\
                                              len(re.findall('falta', x))*1.2+\
                                              len(re.findall('demonstre', x))*+\
                                              len(re.findall('improced.+', x))*1.5+\
                                              len(re.findall('insucesso', x))*1.2)/lengths))

In [31]:
features['neg_polarity'] = [1 if p <= 0.025 else 0 for p in polar]

In [16]:
X = np.array(features['words'])
y = np.array(features['neg_polarity'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15)

pipeline = Pipeline([
                    ('tfidf', TfidfVectorizer()),
                        ('clf', OneVsRestClassifier(
                            LinearSVC(), n_jobs=1))])

parameters = {"clf__estimator__C": [0.01, 0.1, 1],
                "clf__estimator__class_weight": ['balanced', None]
             }

grid_search_tune = GridSearchCV(pipeline, parameters, cv=3, n_jobs=3, verbose=10)
grid_search_tune.fit(X_train,y_train)
best_svc = grid_search_tune.best_estimator_

y_pred_svc = best_svc.predict(X_test)
print(classification_report(y_test, y_pred_svc))

cross_val_score(best_svc, X, y, cv=5)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:   12.4s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   20.0s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   24.4s
[Parallel(n_jobs=3)]: Done  15 out of  18 | elapsed:   28.5s remaining:    5.6s
[Parallel(n_jobs=3)]: Done  18 out of  18 | elapsed:   33.4s finished


              precision    recall  f1-score   support

           0       0.65      0.83      0.73       874
           1       0.59      0.36      0.45       604

    accuracy                           0.64      1478
   macro avg       0.62      0.59      0.59      1478
weighted avg       0.63      0.64      0.62      1478



array([0.65076142, 0.64923858, 0.65482234, 0.65718639, 0.65109192])

In [None]:
features['neg_polarity'].sum()

#### Balancing Data

In [17]:
X1 = features[features['neg_polarity'] == 1]['words'][:3000]
y1 = features[features['neg_polarity'] == 1]['polarity'][:3000]
X2 = features[features['neg_polarity'] == 0]['words'][:3000]
y2 = features[features['neg_polarity'] == 0]['polarity'][:3000]

X = pd.concat([X1,X2])
y = pd.concat([y1,y2])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=17)

pipeline = Pipeline([
                    ('tfidf', TfidfVectorizer()),
                        ('clf', OneVsRestClassifier(
                            LinearSVC(), n_jobs=1))])

parameters = {"clf__estimator__C": [0.01, 0.1, 1],
                "clf__estimator__class_weight": ['balanced', None]
             }

grid_search_tune = GridSearchCV(pipeline, parameters, cv=3, n_jobs=3, verbose=10)
grid_search_tune.fit(X_train,y_train)
best_nsvc = grid_search_tune.best_estimator_

y_pred_nsvc = best_nsvc.predict(X_test)
print(classification_report(y_test, y_pred_nsvc))

cross_val_score(best_nsvc, X, y, cv=5)
# good model for predicting positive sentences

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    9.9s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   13.9s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   16.6s
[Parallel(n_jobs=3)]: Done  15 out of  18 | elapsed:   18.8s remaining:    3.7s
[Parallel(n_jobs=3)]: Done  18 out of  18 | elapsed:   20.9s finished


              precision    recall  f1-score   support

           0       0.64      0.37      0.47       243
           1       0.67      0.86      0.75       357

    accuracy                           0.66       600
   macro avg       0.66      0.61      0.61       600
weighted avg       0.66      0.66      0.64       600



array([0.66527893, 0.67110741, 0.6675    , 0.63886572, 0.58298582])

#### Exporting Labelled Data

In [32]:
features['main'] = uniq['main']

with open("labelled.json", "w") as f:
    json.dump(features.to_dict(), f)

ValueError: cannot reindex from a duplicate axis

In [27]:
#LinearSVC
with open('tag_stems_topics.json') as json_data:
    data_stem = json.load(json_data)
    
stemmed = {k:' '.join(v) for k,v in data_stem.items()}
df_stem = pd.DataFrame.from_dict(stemmed, orient='index')
words_stem = pd.Series(df_stem[0].apply(lambda x: x.split()))

In [42]:
X = np.array(features['new'])
y = np.array(features['polarity'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15)
X_train

array([nan, nan, nan, ..., nan, nan, nan], dtype=object)

In [40]:
pipeline = Pipeline([
                    ('tfidf', TfidfVectorizer()),
                        ('clf', OneVsRestClassifier(
                            LinearSVC(), n_jobs=1))])

parameters = {"clf__estimator__C": [0.01, 0.1, 1],
                "clf__estimator__class_weight": ['balanced', None]
             }

grid_search_tune = GridSearchCV(pipeline, parameters, cv=3, n_jobs=3, verbose=10)
grid_search_tune.fit(X_train,y_train)
best_svc = grid_search_tune.best_estimator_

y_pred_svc = best_svc.predict(X_test)
print(classification_report(y_test, y_pred_svc))

cross_val_score(best_svc, X, y, cv=5)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    4.1s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    7.4s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    9.2s
[Parallel(n_jobs=3)]: Done  15 out of  18 | elapsed:   11.0s remaining:    2.1s
[Parallel(n_jobs=3)]: Done  18 out of  18 | elapsed:   12.5s finished


              precision    recall  f1-score   support

           0       0.33      0.33      0.33         3
           1       1.00      1.00      1.00      1475

    accuracy                           1.00      1478
   macro avg       0.67      0.67      0.67      1478
weighted avg       1.00      1.00      1.00      1478



array([0.99543379, 0.99796954, 0.99796851, 0.99847638, 0.99746064])

In [22]:
features['words']

0       acord despach profer aut pres cas decis conden...
1       def pres reclam confer admit recurs circunscri...
2       estabelec hospit contribu trat assist vítim ti...
3       form admit revist excepc fundament art cpc rel...
4       result artig ccivil val bem do mesm dat abert ...
                              ...                        
9843    matér fact pod ser alter stj verif algum funda...
9844    sed sane sentenç juiz dev term art cpc começ c...
9845    decid atribu relev caus exclus culp aleg ré ag...
9846    stj pod censur mau uso tribun relaç event feit...
9847    constitu associ particip situ alguém exerc act...
Name: words, Length: 9848, dtype: object

In [38]:
words_stem.apply(' '.join)

0       acord despach prof aut pre ca decil conden lit...
1       def pre conf admit recur circunscrit aleg viol...
2       estabelec hospit contribu trat assist vítim ti...
3       form admit revist excepc fundament art cpc rel...
4       result artig ccivil val mesm dat abert suces v...
                              ...                        
9843    matér fact pod stj verif fundament previst par...
9844    sed san sentenç juiz dev term art cpc começ co...
9845    decid atribu relev cau exclu culp aleg convicç...
9846    stj pod censur mau uso tribun relaç event feit...
9847    constitu assoc particip situ exerc activ econó...
Name: 0, Length: 9848, dtype: object

In [29]:
features['main']

0       [[acord, NOUN], [despach, NOUN], [profer, VERB...
1       [[def, VERB], [pres, ADJ], [reclam, NOUN], [co...
2       [[estabelec, NOUN], [hospit, NOUN], [contribu,...
3       [[form, NOUN], [admit, VERB], [revist, NOUN], ...
4       [[result, VERB], [artig, NOUN], [ccivil, NOUN]...
                              ...                        
9843    [[matér, NOUN], [fact, NOUN], [pod, VERB], [se...
9844    [[sed, NOUN], [sane, NOUN], [sentenç, NOUN], [...
9845    [[decid, VERB], [atribu, VERB], [relev, NOUN],...
9846    [[stj, NOUN], [pod, VERB], [censur, VERB], [ma...
9847    [[constitu, VERB], [associ, NOUN], [particip, ...
Name: main, Length: 9848, dtype: object

In [92]:
features['new'] = words_stem.apply(' '.join).reindex(range(len(words_stem))).to_list()

In [93]:
features

Unnamed: 0,main,words,tags,new
0,"[[acord, NOUN], [despach, NOUN], [profer, VERB...",acord despach profer aut pres cas decis conden...,noun noun verb noun adj noun noun noun noun ve...,
1,"[[def, VERB], [pres, ADJ], [reclam, NOUN], [co...",def pres reclam confer admit recurs circunscri...,verb adj noun noun verb noun verb noun verb no...,
2,"[[estabelec, NOUN], [hospit, NOUN], [contribu,...",estabelec hospit contribu trat assist vítim ti...,noun noun verb noun noun noun adj noun noun no...,
3,"[[form, NOUN], [admit, VERB], [revist, NOUN], ...",form admit revist excepc fundament art cpc rel...,noun verb noun adj noun noun noun adv noun ver...,
4,"[[result, VERB], [artig, NOUN], [ccivil, NOUN]...",result artig ccivil val bem do mesm dat abert ...,verb noun noun noun noun verb pron noun noun n...,
...,...,...,...,...
9843,"[[matér, NOUN], [fact, NOUN], [pod, VERB], [se...",matér fact pod ser alter stj verif algum funda...,noun noun verb verb verb noun verb pron noun v...,
9844,"[[sed, NOUN], [sane, NOUN], [sentenç, NOUN], [...",sed sane sentenç juiz dev term art cpc começ c...,noun noun noun noun verb noun noun noun verb v...,
9845,"[[decid, VERB], [atribu, VERB], [relev, NOUN],...",decid atribu relev caus exclus culp aleg ré ag...,verb verb noun verb noun noun verb noun verb n...,
9846,"[[stj, NOUN], [pod, VERB], [censur, VERB], [ma...",stj pod censur mau uso tribun relaç event feit...,noun verb verb adj noun noun noun adv verb nou...,


In [87]:
features.index, words_stem.index

(RangeIndex(start=0, stop=9848, step=1),
 RangeIndex(start=0, stop=9848, step=1))

In [94]:
len(words_stem.apply(' '.join).reindex(range(len(words_stem))).to_list())

9848

In [95]:
len(features)

9848

In [97]:
words_stem = words_stem.apply(' '.join).reindex(range(len(words_stem)))