In [5]:
import sys
sys.path.insert(1, '/scratch/cinthiasouza/mv-text-summarizer')

import glob, os
import pandas as pd
import json
import spacy
import nltk
import numpy as np
import json
#import smogn
import seaborn as sns
import pickle


from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,classification_report
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from src import utils_classification as utils
from sklearn.metrics import matthews_corrcoef
from sklearn.covariance import EllipticEnvelope

from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt

from multiprocessing import Process, Queue

In [2]:
# Load the pre-built dataset from disk. The functions below index it as
# dataset[section][0] / [1] for the train / test features and
# dataset[section][2] / [3] for the train / test labels.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('dataset2.p', 'rb') as fp:
    dataset = pickle.load(fp)

In [3]:
def oneclass_svm(X_train, nu=0.67, kernel='rbf', gamma='scale'):
    """Fit a one-class SVM on ``X_train`` and return the fitted estimator.

    Parameters
    ----------
    X_train : array-like
        Training samples passed straight to ``OneClassSVM.fit``.
    nu : float, default=0.67
        Forwarded to ``OneClassSVM``; scikit-learn documents it as an upper
        bound on the fraction of training errors and requires it to lie in
        (0, 1]. The default keeps the value this notebook was run with.
    kernel : str, default='rbf'
        Kernel forwarded to ``OneClassSVM``.
    gamma : str or float, default='scale'
        Kernel coefficient forwarded to ``OneClassSVM``.

    Returns
    -------
    OneClassSVM
        The fitted estimator.
    """
    svm = OneClassSVM(gamma=gamma, kernel=kernel, nu=nu)
    svm.fit(X_train)

    return svm

In [4]:
def oneclass_rf(X_train, contamination=0.67, n_estimators=1000, random_state=None):
    """Fit an Isolation Forest on ``X_train`` and return the fitted estimator.

    Parameters
    ----------
    X_train : array-like
        Training samples passed straight to ``IsolationForest.fit``.
    contamination : float or 'auto', default=0.67
        Forwarded to ``IsolationForest``. The default keeps the value this
        notebook was run with.
        NOTE(review): scikit-learn documents the valid float range as
        (0, 0.5]; releases that validate constructor parameters will reject
        0.67 -- confirm against the installed version before re-running.
    n_estimators : int, default=1000
        Number of trees in the forest.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to ``IsolationForest``. Pass an integer to make the
        fitted forest (and therefore the reported metrics) reproducible.

    Returns
    -------
    IsolationForest
        The fitted estimator.
    """
    rf = IsolationForest(
        n_estimators=n_estimators,
        contamination=contamination,
        # Kept from the original call; it has no effect on this first fit of a
        # freshly constructed estimator.
        warm_start=True,
        n_jobs=-1,
        random_state=random_state,
    )
    rf.fit(X_train)

    return rf

In [24]:
def train(dataset, name_models, sections, train_class=1):
    """Fit the requested one-class models for every section.

    Parameters
    ----------
    dataset : mapping
        ``dataset[section][0]`` is used as the training features and
        ``dataset[section][2]`` as the training labels.
    name_models : iterable of str
        Models to fit; each entry must be ``"svm"`` or ``"rf"``.
    sections : iterable
        Keys of ``dataset`` to train on.
    train_class : label or 'all', default=1
        Only rows whose training label equals this value are used for
        fitting. Pass ``'all'`` to fit on every row.

    Returns
    -------
    dict
        ``{section: {model_name: fitted_estimator}}``.

    Raises
    ------
    ValueError
        If ``name_models`` contains a name other than ``"svm"`` or ``"rf"``.
    """
    name_models = list(name_models)

    # Fail fast, before any (potentially slow) fitting: an unrecognised name
    # used to be skipped silently and only surfaced later as a KeyError when
    # the missing model was looked up for prediction.
    supported = ("svm", "rf")
    unknown = [name for name in name_models if name not in supported]
    if unknown:
        raise ValueError(
            "Unknown model name(s) %r; supported names are %r" % (unknown, supported)
        )

    models = {}
    for section in sections:

        X_train = dataset[section][0].copy()
        y_train = dataset[section][2].copy()

        # Keep only the rows of the class the one-class models should learn.
        if train_class != 'all':
            X_train = X_train[y_train == train_class]

        fitted = {}
        for name_model in name_models:

            if name_model == "svm":
                fitted['svm'] = oneclass_svm(X_train)

            elif name_model == "rf":
                fitted['rf'] = oneclass_rf(X_train)

        models[section] = fitted

    return models
        
def predict(dataset, models, name_models, sections):
    """Evaluate fitted models on each section's test split.

    Parameters
    ----------
    dataset : mapping
        ``dataset[section][1]`` is used as the test features and
        ``dataset[section][3]`` as the test labels. Labels equal to 0 are
        remapped to -1 (on a copy) so they can be compared with the -1 / 1
        values the models' ``predict`` output is scored against.
    models : mapping
        ``models[section][model_name]`` must expose ``predict(X)``.
    name_models : iterable of str
        Names of the models to evaluate for every section.
    sections : iterable
        Keys of ``dataset`` / ``models`` to evaluate.

    Returns
    -------
    tuple of dict
        ``(predictions, results, cm)``, each keyed by section and then by
        model name: the raw predictions, the ``classification_report``
        dictionary, and a one-element list holding the raveled confusion
        matrix in ``tn, fp, fn, tp`` order (negative class -1, positive 1).
    """
    results = {}
    predictions = {}
    cm = {}

    # Fixed label order shared by the report and the confusion matrix, so the
    # matrix is always 2x2 even if a class is missing from a split.
    labels = [-1, 1]

    for section in sections:

        X_test = dataset[section][1].copy()
        y_test = dataset[section][3].copy()

        # Remap the negative class from 0 to -1; label 1 is left as is.
        y_test[y_test == 0] = -1

        section_predictions = {}
        section_results = {}
        section_cm = {}

        for name_model in name_models:

            y_pred = models[section][name_model].predict(X_test)
            section_predictions[name_model] = y_pred

            section_results[name_model] = classification_report(
                y_test, y_pred, labels=labels, output_dict=True
            )

            # Wrapped in a list to keep the structure callers already receive.
            section_cm[name_model] = [
                confusion_matrix(y_test, y_pred, labels=labels).ravel()
            ]

        predictions[section] = section_predictions
        results[section] = section_results
        cm[section] = section_cm

    return predictions, results, cm

In [25]:
# Models to fit and the dataset sections to fit them on.
name_models = ['svm', 'rf']
sections=['introduction', 'materials', 'conclusion']

# Fit each model per section using only the training rows labelled 1.
models = train(dataset, name_models, sections, train_class=1) 

In [26]:
# Score every fitted model on each section's test split: raw predictions,
# classification-report dicts, and raveled confusion matrices.
predictions, results, cm = predict(dataset, models, name_models, sections)

In [30]:
# Classification report for the one-class SVM on the 'introduction' test split.
pd.DataFrame(results['introduction']['svm']).transpose()

Unnamed: 0,precision,recall,f1-score,support
-1,0.822662,0.584784,0.68362,4482.0
1,0.131186,0.332151,0.188086,846.0
accuracy,0.54467,0.54467,0.54467,0.54467
macro avg,0.476924,0.458467,0.435853,5328.0
weighted avg,0.712866,0.54467,0.604937,5328.0


In [31]:
# Classification report for the Isolation Forest on the 'introduction' test split.
pd.DataFrame(results['introduction']['rf']).transpose()

Unnamed: 0,precision,recall,f1-score,support
-1,0.823326,0.584337,0.683544,4482.0
1,0.132278,0.335697,0.189776,846.0
accuracy,0.544857,0.544857,0.544857,0.544857
macro avg,0.477802,0.460017,0.43666,5328.0
weighted avg,0.713599,0.544857,0.605142,5328.0


In [32]:
# Classification report for the one-class SVM on the 'materials' test split.
pd.DataFrame(results['materials']['svm']).transpose()

Unnamed: 0,precision,recall,f1-score,support
-1,0.881448,0.637603,0.739954,5574.0
1,0.121739,0.369393,0.183126,758.0
accuracy,0.605496,0.605496,0.605496,0.605496
macro avg,0.501594,0.503498,0.46154,6332.0
weighted avg,0.790504,0.605496,0.673297,6332.0


In [34]:
# Classification report for the Isolation Forest on the 'materials' test split.
pd.DataFrame(results['materials']['rf']).transpose()

Unnamed: 0,precision,recall,f1-score,support
-1,0.872978,0.590599,0.704548,5574.0
1,0.108942,0.368074,0.168123,758.0
accuracy,0.563961,0.563961,0.563961,0.563961
macro avg,0.49096,0.479337,0.436335,6332.0
weighted avg,0.781516,0.563961,0.640333,6332.0


In [33]:
# Classification report for the one-class SVM on the 'conclusion' test split.
pd.DataFrame(results['conclusion']['svm']).transpose()

Unnamed: 0,precision,recall,f1-score,support
-1,0.930259,0.619105,0.743438,10751.0
1,0.074785,0.398795,0.125951,830.0
accuracy,0.603316,0.603316,0.603316,0.603316
macro avg,0.502522,0.50895,0.434695,11581.0
weighted avg,0.868948,0.603316,0.699183,11581.0


In [35]:
# Classification report for the Isolation Forest on the 'conclusion' test split.
pd.DataFrame(results['conclusion']['rf']).transpose()

Unnamed: 0,precision,recall,f1-score,support
-1,0.926865,0.58469,0.717048,10751.0
1,0.069598,0.40241,0.118671,830.0
accuracy,0.571626,0.571626,0.571626,0.571626
macro avg,0.498232,0.49355,0.41786,11581.0
weighted avg,0.865426,0.571626,0.674163,11581.0


# Each confusion-matrix array below is ordered tn, fp, fn, tp (negative class = -1, positive class = 1)

In [11]:
# Raveled confusion matrices, keyed by section and then by model name.
cm

{'introduction': {'svm': [array([2621, 1861,  565,  281])],
  'rf': [array([2570, 1912,  549,  297])]},
 'materials': {'svm': [array([3554, 2020,  478,  280])],
  'rf': [array([3358, 2216,  486,  272])]},
 'conclusion': {'svm': [array([6656, 4095,  499,  331])],
  'rf': [array([6284, 4467,  502,  328])]}}

In [1]:
# Manual check of tn / (tn + fp) using the first two entries of
# cm['introduction']['svm']; it reproduces the recall reported for class -1.
(2621/ (2621 + 1861))

0.5847835787594824