In [1]:
import pandas as pd
import numpy as np

from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

## Data Cleaning

### Pre processing methods

In [2]:
from typing import List

import ssl
import unicodedata

import neattext as nt

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

import nltk
from nltk.tokenize import wordpunct_tokenize

# nltk.download('stopwords')
# nltk.download('punkt')

import spacy
nlp = spacy.load('pt_core_news_lg')


def to_ascii(text: str) -> str:
    return unicodedata.normalize('NFKD', text).encode('ASCII','ignore').decode('ASCII')

stopwords = ['a',
 'a',
 'ao',
 'aos',
 'aquela',
 'aquelas',
 'aquele',
 'aqueles',
 'aquilo',
 'as',
 'as',
 'ate',
 'com',
 'da',
 'das',
 'de',
 'dela',
 'delas',
 'dele',
 'deles',
 'depois',
 'do',
 'dos',
 'e',
 'e',
 'ela',
 'elas',
 'ele',
 'eles',
 'em',
 'era',
 'eram',
 'eramos',
 'essa',
 'essas',
 'esse',
 'esses',
 'esta',
 'esta',
 'estamos',
 'estao',
 'estar',
 'estas',
 'estava',
 'estavam',
 'estavamos',
 'este',
 'esteja',
 'estejam',
 'estejamos',
 'estes',
 'esteve',
 'estive',
 'estivemos',
 'estiver',
 'estivera',
 'estiveram',
 'estiveramos',
 'estiverem',
 'estivermos',
 'estivesse',
 'estivessem',
 'estivessemos',
 'estou',
 'eu',
 'foi',
 'fomos',
 'for',
 'fora',
 'foram',
 'foramos',
 'forem',
 'formos',
 'fosse',
 'fossem',
 'fossemos',
 'fui',
 'ha',
 'haja',
 'hajam',
 'hajamos',
 'hao',
 'havemos',
 'haver',
 'hei',
 'houve',
 'houvemos',
 'houver',
 'houvera',
 'houvera',
 'houveram',
 'houveramos',
 'houverao',
 'houverei',
 'houverem',
 'houveremos',
 'houveria',
 'houveriam',
 'houveriamos',
 'houvermos',
 'houvesse',
 'houvessem',
 'houvessemos',
 'isso',
 'isto',
 'ja',
 'lhe',
 'lhes',
 'mais',
 'mas',
 'me',
 'mesmo',
 'meu',
 'meus',
 'minha',
 'minhas',
 'muito',
 'na',
 'nao',
 'nas',
 'nem',
 'no',
 'nos',
 'nos',
 'nossa',
 'nossas',
 'nosso',
 'nossos',
 'num',
 'numa',
 'o',
 'os',
 'ou',
 'para',
 'pela',
 'pelas',
 'pelo',
 'pelos',
 'por',
 'que',
 'quem',
 'sao',
 'se',
 'seja',
 'sejam',
 'sejamos',
 'sem',
 'ser',
 'sera',
 'serao',
 'serei',
 'seremos',
 'seria',
 'seriam',
 'seriamos',
 'so',
 'somos',
 'sou',
 'tambem',
 'te',
 'tem',
 'tem',
 'temos',
 'tenha',
 'tenham',
 'tenhamos',
 'tenho',
 'tera',
 'terao',
 'terei',
 'teremos',
 'teria',
 'teriam',
 'teriamos',
 'teu',
 'teus',
 'teve',
 'tinha',
 'tinham',
 'tinhamos',
 'tive',
 'tivemos',
 'tiver',
 'tivera',
 'tiveram',
 'tiveramos',
 'tiverem',
 'tivermos',
 'tivesse',
 'tivessem',
 'tivessemos',
 'tu',
 'tua',
 'tuas',
 'um',
 'uma',
 'voces',
 'vos'
 ]

def clean_text(text: str)->List[str]:
    # Lower Case
    text = text.lower()

    # Lemmatization
    text = " ".join([x.lemma_.lower() for x in nlp(text)])

    # Remover stopwords
    text = " ".join([ x for x in text.split(' ') if x not in stopwords])

    # Remover Acentuaçāo
    text = to_ascii(text)

    # Remover Pontuaçāo, Números, Urls, emails e caracteres especiais
    docx = nt.TextFrame(text=text)
    text = docx.remove_puncts().remove_numbers().remove_urls().remove_emails().remove_special_characters().text

    return text




### Dataset

In [4]:
df = pd.read_csv('./politica_set.csv')
df['text'] = df['text'].apply(clean_text)

In [5]:
df.head()

Unnamed: 0,company,text,class
0,amazon,notificacao privacidade amazon,0
1,amazon,ultimo atualizacao junho,0
2,amazon,controlador informacao pessoal,0
3,amazon,qual informacao pessoal sobre cliente amazon c...,1
4,amazon,qual finalidade amazon tratar seu informacao p...,2


In [6]:
df.groupby(['class'])['class'].count()

class
0     204
1      20
2      15
3      10
4      19
5      14
6       1
7       7
8      13
9       5
10      6
11     10
13     13
Name: class, dtype: int64

## Model Performance

In [76]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
Y = df['class']
    

## Model Tunning

In [77]:
def get_best_model_tunning(model, params):
    gd=GridSearchCV(estimator=model,param_grid=params,verbose=True)
    gd.fit(X,Y)
    print(gd.best_score_)
    print(gd.best_estimator_)

### Tunning SVC model

In [78]:
params = {'C':[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1],
         'gamma':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
         'kernel':['rbf','linear']
        }
get_best_model_tunning(SVC(), params)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits




0.8871817383669885
SVC(C=1, gamma=0.1, kernel='linear')


### Tunning Naive Bayes model

In [79]:
params = dict(alpha=np.logspace(0,-9, num=100),fit_prior=[True, False])
get_best_model_tunning(MultinomialNB(), params)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits




0.7444249341527656
MultinomialNB(alpha=0.1)


### Tunning Logistic Regression

In [80]:
params = dict(penalty = ['l2'], C = [0.001,0.01,0.1,1,10,100,1000])
get_best_model_tunning(LogisticRegression(), params)

Fitting 5 folds for each of 7 candidates, totalling 35 fits




0.8842405618964004
LogisticRegression(C=100)


### Tunning Decision Tree Classifier

In [None]:
params = dict(
    max_features=list(range(20, 200)), 
    max_leaf_nodes = range(20, 60, 1),
    criterion=['gini', 'entropy'], 
    max_depth=range(10, 100, 1) 
)
get_best_model_tunning(DecisionTreeClassifier(), params)

Fitting 5 folds for each of 1296000 candidates, totalling 6480000 fits




### Tunning Random Forest

In [None]:
params = dict( 
    n_estimators=[200, 500],
    max_features=['sqrt','log2'],
    criterion=['gini', 'entropy']
)
get_best_model_tunning(RandomForestClassifier(), params)

### Evaluate Model performance

In [68]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
Y = df['class']

def get_performance(model):
    cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0) 
    score = cross_val_score(model, X, Y, cv=cv)
    print("Accuracy: %0.2f (+/-%0.2f)"%(score.mean(), score.std()*2))
    

In [69]:
get_performance(MultinomialNB(alpha=0.1))
get_performance(RandomForestClassifier())
get_performance(DecisionTreeClassifier(max_depth=38, max_features=180, max_leaf_nodes=54))
get_performance(LogisticRegression(C=100))
get_performance(SVC(C=1, gamma=0.1, kernel='linear'))

Accuracy: 0.67 (+/-0.05)
Accuracy: 0.82 (+/-0.08)
Accuracy: 0.85 (+/-0.08)
Accuracy: 0.86 (+/-0.05)
Accuracy: 0.87 (+/-0.03)


In [70]:
chosen_model = SVC(C=1, gamma=0.1, kernel='linear')

## Saving and importing classification model

### Saving model

In [46]:
#Saving model in hard disk
import pickle
filename = 'model.sav'
pickle.dump(chosen_model, open(filename, 'wb'))

#Saving Vectorizer
pickle.dump(vectorizer, open('vectorizer.pickle', 'wb')),

(None,)

### Importing model

In [48]:
# Import Model from hard disk
model = pickle.load(open(filename, 'rb'))
#Import Vectorizer
vectorizer = pickle.load(open('vectorizer.pickle', 'rb'))

# Testando mais parametros no random forest

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
# bootstrap = [True, False]
# Create the random grid
params = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap
         }
get_best_model_tunning(RandomForestClassifier(), params)

## Evaluating False Negatives and Positives

In [84]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.3)

vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(train['text'])
test_features = vectorizer.transform(test['text'])
train_prediction = train['class']
test_prediction = test['class']

from sklearn.metrics import accuracy_score, confusion_matrix


model = chosen_model.fit(train_features, train_prediction)

predictions = model.predict(test_features)

confusion_matrix(test_prediction, predictions)

array([[63,  0,  1,  0,  0,  0,  0,  1,  0,  1,  1],
       [ 3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 2,  0,  3,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  0,  0,  0,  3,  0,  0,  0,  0,  0,  0],
       [ 1,  0,  0,  0,  0,  5,  0,  0,  0,  0,  0],
       [ 1,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0],
       [ 1,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0],
       [ 1,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5]])