In [1]:
#importando bibliotecas de análise inicial
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Puxando a base de dados
df = pd.read_excel('dados_excel.xlsx')
categorize_label = lambda x: x.astype('category')
df[['Subcategoria']] = df[['Subcategoria']].apply(categorize_label)
valor = df['Montante']

In [3]:
# Importando as bibliotecas para normalização do texto
from unicodedata import normalize
import re

In [4]:
# Fazendo download de palavras que queremos retirar
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to C:\Users\Daniel
[nltk_data]     Beigelman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Criando lista com palavras indesejadas
stop = stopwords.words("portuguese")
lista_stop = [normalize('NFKD', x).encode('ASCII', 'ignore').decode('ASCII') for x in stop] 

In [6]:
for i in ['brasilia','sao paulo','br','osasco','lago norte','lago nort']:
    lista_stop.append(i)

In [7]:
def correct_text(raw_review):
    # Função limpa e normaliza o texto para aprendizagem
    letters_only=re.sub("[^a-zA-Z]"," ",normalize('NFKD', raw_review).encode('ASCII', 'ignore').decode('ASCII'))
    words = letters_only.lower().split()
    meaningful_words=[w for w in words if not w in lista_stop]
    return(' '.join(meaningful_words))

In [8]:
# Define o o tamanho do vetor
num_desc = df['Nota'].size

In [9]:
# Cria vetor com todas as descrições
clean_desc=[]
for i in range(num_desc):
    clean_desc.append(correct_text(df['Nota'][i]))

In [10]:
from sklearn.model_selection import train_test_split
# Split dos dados em treino e validação
X_train, X_test, y_train, y_test = train_test_split(clean_desc, df["Subcategoria"], test_size=0.2, random_state=3)

In [11]:
# Importando todas as bibliotecas para tratamento e treinamento dos dados
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import AdaBoostClassifier

In [12]:
get_text_data = FunctionTransformer(lambda x: x['desc'].values, validate=False)
get_numeric_data = FunctionTransformer(lambda x: x['valor'].values, validate=False)

In [13]:
lgr = OneVsRestClassifier(LogisticRegression(random_state=3, multi_class="multinomial",solver="lbfgs", C=10))
rnd_clf = RandomForestClassifier(n_estimators=100,max_leaf_nodes=10, n_jobs=-1)

pl = Pipeline([
        ('vec', CountVectorizer(analyzer='word',ngram_range=(1,1), stop_words = None)),
        ('voting_clf', VotingClassifier(estimators=[('lr', lgr), ('rf', rnd_clf)],voting='soft'))
    ])
pl.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...   warm_start=False))],
         flatten_transform=None, n_jobs=None, voting='soft', weights=None))])

In [14]:
accuracy = pl.score(X_test, y_test)
accuracy

0.9176470588235294

# Implementando com descrição e valor

In [15]:
class ArrayCaster(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, data):
        print(data.shape)
        print(np.transpose(np.matrix(data)).shape)
        return np.transpose(np.matrix(data))

In [16]:
# Juntando os dados e embaralhando
clean_desc_2 = pd.DataFrame(clean_desc)
df_cor = pd.concat([clean_desc_2,valor,df["Subcategoria"]],axis=1)
df_cor.columns = ['desc','valor','subcat']
df_cor = df_cor.sample(frac=1,random_state=42).reset_index(drop=True)

X = df_cor[['desc','valor']]
y = df_cor['subcat']

In [17]:
from sklearn.model_selection import train_test_split
# Split dos dados em treino e validação
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y, test_size=0.2, random_state=3)

In [18]:
# pl2 = Pipeline([
#         ('union', FeatureUnion(
#             transformer_list = [
#                 ('text_fetuares', Pipeline([
#                     ('selector', get_text_data),
#                     ('vectorizer', CountVectorizer(analyzer='word',ngram_range=(1,1), stop_words = None))
#                 ])),
#                     ('numeric_features', Pipeline([
#                     ('selector', get_numeric_data),
#                     ('caster', ArrayCaster())
#                 ]))
#              ]
#         )),
#         ('voting_clf', VotingClassifier(estimators=[('lr', lgr), ('rf', rnd_clf)],voting='soft'))
#     ])
# pl.fit(X_train_2, y_train_2)

In [19]:
# accuracy = pl2.score(X_test_2, y_test_2)
# accuracy