# Desafio Mercado Livre - Cleyton Turin

In [1]:
import numpy as np 
import pandas as pd 
from sklearn import preprocessing 
import time
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

import unicodedata

import pickle

from sklearn.feature_extraction.text import CountVectorizer 

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import train_test_split 

from sklearn.metrics import classification_report

import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK data packages used by this notebook: the 'punkt' tokenizer
# models and the 'stopwords' word lists. When a package is already installed
# NLTK only reports that it is up to date.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cleytonturin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cleytonturin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Connect to Google Drive (the google.colab module is only available inside a
# Google Colab runtime).

from google.colab import drive

# Mount the Drive under /content/drive.
drive.mount('/content/drive')

## Formulação do problema:
    1 - Construir um classificador de produto de acordo com o título do produto;
   

## Carregar os dados em um dataframe e preparar os dados

In [4]:
# Load the full training set; the following cells use its 'title' and
# 'category' columns.
df_inicial = pd.read_csv('../train.csv')

In [7]:
# Work on a random 200,000-row sample of the training set, keeping only the
# product title (input text) and its category (label).
# random_state pins the sample so that re-running the notebook selects the
# same rows; 42 matches the seed passed to train_test_split further down.
df = pd.DataFrame(df_inicial[['title','category']].sample(200000, random_state=42))
# A bare `df.shape` in the middle of a cell is never displayed, so print it.
print(df.shape)
# Keep the raw sample on disk. The index is written too, so each row keeps
# the index label it had in df_inicial.
df.to_csv('sample_train.csv')

### Criando funções de limpeza

In [5]:
# Punctuation that is deleted outright (not replaced by a space), so that
# "p/" becomes "p" and "(preta)" becomes "preta".
_CARACTERES_REMOVIDOS = str.maketrans('', '', ':;?}){(/!')


def limpezaStrings(texto):
    """Normalise a product title into lowercase ASCII words.

    Steps, in order:
      1. Convert to ``str``, lowercase, and strip accents (NFKD decomposition
         followed by dropping every non-ASCII code point).
      2. Remove URLs (``www.…`` and ``http(s)://…``).
      3. Delete the punctuation listed in ``_CARACTERES_REMOVIDOS``.
      4. Replace any remaining non-word character with a space.
      5. Remove isolated single letters, then all digits, then words of one
         to three letters that have whitespace on both sides.
      6. Collapse runs of whitespace into a single space.

    Args:
        texto: The raw title. Any object is accepted because it is passed
            through ``str()`` first (so ``None`` becomes ``"none"``).

    Returns:
        The cleaned title. It is not stripped, so it may start or end with a
        single space.
    """
    texto = unicodedata.normalize('NFKD', str(texto).lower()).encode('ASCII', 'ignore').decode('utf8')
    # URLs must be removed BEFORE the punctuation steps: once ':', '/' and '.'
    # are gone the URL patterns can no longer match anything. The "www"
    # branch consumes non-whitespace characters ([^\s]+) up to the next space.
    texto = re.sub(r'(www\.[^\s]+)|(https?://[^\s]+)', ' ', texto)
    texto = texto.translate(_CARACTERES_REMOVIDOS)
    texto = re.sub(r'[^\w \xC0-\xFF]', ' ', texto)
    # Isolated single letters (surrounded by whitespace or at end of string).
    texto = re.sub(r'(\s[a-zA-Z]\s)|(\s[a-zA-Z]$)', ' ', texto)
    texto = re.sub(r'[0-9]', ' ', texto)
    # Short words (1-3 letters) with whitespace on both sides; a short word at
    # the very start or end of the string is left untouched.
    texto = re.sub(r'\s([a-zA-Z]{1,3})\s', ' ', texto)
    texto = re.sub(r'\s+', ' ', texto)
    return texto

# Lazily-built union of NLTK's Portuguese and Spanish stop words. Loading the
# lists once avoids re-reading the corpus for every title.
_STOPWORDS_PT_ES = None


def _stopwords_pt_es():
    """Return the cached set of Portuguese + Spanish NLTK stop words."""
    global _STOPWORDS_PT_ES
    if _STOPWORDS_PT_ES is None:
        _STOPWORDS_PT_ES = (
            frozenset(stopwords.words('portuguese'))
            | frozenset(stopwords.words('spanish'))
        )
    return _STOPWORDS_PT_ES


def removeStopWords(texto, *, stop_words=None, max_palavras=15):
    """Drop stop words from a title and keep only its first words.

    The text is split on whitespace, every token found in ``stop_words`` is
    discarded, and the first ``max_palavras`` remaining tokens are joined
    back with single spaces.

    Args:
        texto: Whitespace-separated text (typically the output of
            ``limpezaStrings``).
        stop_words: Optional container of tokens to drop. When ``None``, the
            union of NLTK's Portuguese and Spanish stop-word lists is used.
        max_palavras: Maximum number of tokens kept in the result.

    Returns:
        The filtered, truncated text; an empty string when nothing is left.
    """
    # NOTE(review): the NLTK lists contain accented words, while titles
    # cleaned by limpezaStrings have their accents stripped, so accented stop
    # words presumably never match -- confirm whether the lists should be
    # normalised the same way.
    if stop_words is None:
        stop_words = _stopwords_pt_es()
    # A single pass against the combined set removes stop words of BOTH
    # languages; filtering twice from texto.split() would keep only the
    # result of the last filter.
    palavras = [palavra for palavra in texto.split() if palavra not in stop_words]
    return " ".join(palavras[:max_palavras])

### Aplicando funções de limpeza e tokenizando coluna title

In [None]:
# Step 1: normalise every title with limpezaStrings, printing wall-clock
# timestamps before and after so the duration can be read from the output.
print ('iniciou limpa str: ',time.strftime("%H:%M:%S"))
df['title']=df['title'].apply(limpezaStrings)
print ('concluiu limpa str: ',time.strftime("%H:%M:%S"))

# Step 2: filter the cleaned titles through removeStopWords, timed the same way.
print ('iniciou remove stopword: ',time.strftime("%H:%M:%S"))
df['title']=df['title'].apply(removeStopWords)
print ('concluiu remove stopword: ',time.strftime("%H:%M:%S"))
# Save the cleaned sample (without the index column) so later cells can
# reload it instead of repeating the cleaning.
df.to_csv('base_geral_limpa.csv', index=False)
# Disabled shell command: copy the cleaned file to Google Drive.
#!cp base_geral_limpa.csv drive/My\ Drive/python/desafioML/

iniciou limpa str:  10:57:25


### Cria o bag of words

In [None]:
# Turn the cleaned titles into a sparse TF-IDF matrix over word unigrams and
# bigrams. `vectorizer` and `tf_transformer` stay in the session because the
# test-set cell reuses them to transform unseen titles.
vectorizer = CountVectorizer(analyzer="word",tokenizer=word_tokenize,ngram_range=(1,2))
print ('iniciou bag of words: ',time.strftime("%H:%M:%S"))
bagofWord = vectorizer.fit_transform(df['title'])
tf_transformer=TfidfTransformer().fit(bagofWord)
bagofWord=tf_transformer.transform(bagofWord)
print ('concluiu bag of words: ',time.strftime("%H:%M:%S"))
# Persist the matrix. The context manager closes the file even if
# pickle.dump raises part-way through.
filename = 'bagofword'
with open(filename,'wb') as outfile:
    pickle.dump(bagofWord,outfile)
# Disabled shell command: copy the pickled matrix to Google Drive.
#!cp bagofword drive/My\ Drive/python/desafioML/

# A abordagem de classificação

In [14]:
# Reload the cleaned sample and its TF-IDF matrix from disk.
df=pd.read_csv('base_geral_limpa.csv')
# The matrix is pickled under the all-lowercase name 'bagofword'; opening
# "bagofWord" only works on case-insensitive filesystems. The context manager
# also makes sure the handle is closed after loading.
# NOTE: pickle.load executes code embedded in the file -- only load pickles
# produced by this notebook.
with open("bagofword","rb") as pickle_in:
    bagofWord = pickle.load(pickle_in)
# NOTE: only the matrix is restored here. The fitted `vectorizer` and
# `tf_transformer` that the test-set cell calls are not saved by this
# notebook, so that cell still needs them from the live session.

print(df.shape)
print(bagofWord.shape)



(2000000, 2)
(2000000, 274122)


 ### Separando o dataframe em teste e treinamento

In [None]:
# Hold out 10% of the rows for evaluation. The split is stratified on the
# category column and seeded, so it is identical on every run.
print ('iniciou separacao de teste e treino: ',time.strftime("%H:%M:%S"))
X_train, X_test, y_train, y_test = train_test_split(
    bagofWord,
    df['category'],
    train_size=0.9,
    test_size=0.1,
    stratify=df['category'],
    random_state=42,
)
print ('concluiu separacao de teste e treino: ',time.strftime("%H:%M:%S"))

In [None]:
# Instantiate a one-vs-rest linear SVM, train it on the training split and
# check its performance on the held-out split. Timestamps are printed around
# each phase so durations can be read from the output.
model = OneVsRestClassifier(LinearSVC(dual=False,max_iter=1500))
print ('iniciou treinamento: ',time.strftime("%H:%M:%S"))
model.fit(X_train,y_train)
print ('concluiu treinamento: ',time.strftime("%H:%M:%S"))
print ('iniciou predict: ',time.strftime("%H:%M:%S"))
y_test_hat=model.predict(X_test)
print ('concluiu predict: ',time.strftime("%H:%M:%S"))
# model.score() would run predict(X_test) a second time; the accuracy (share
# of exact label matches) is computed from the predictions already at hand.
print('Precisao do model classifier na base de teste: {:.5f}\n'.format(np.mean(y_test_hat == np.asarray(y_test))))
#print(classification_report(y_test, y_test_hat))

In [None]:
# Classify the competition test set. The titles go through the same cleaning
# functions and the already-fitted vectorizer / TF-IDF transformer that were
# used for training. The numbered prints are progress markers.
test = pd.read_csv('test.csv')

test['title'] = test['title'].apply(limpezaStrings)
print(1)

test['title'] = test['title'].apply(removeStopWords)
print(2)

# Raw n-gram counts fed straight into the TF-IDF weighting.
bwTest = tf_transformer.transform(vectorizer.transform(test['title']))
print(3)
print(4)

test['category'] = model.predict(bwTest)

In [None]:
# Write the submission file: one row per test item with its id and the
# predicted category, without the DataFrame index.
test[['id','category']].to_csv('resposta10kk.csv', index=False)
# Progress marker: the export finished.
print(1)

In [None]:
#!cp resposta1k.csv drive/My\ Drive/python/desafioML/

In [None]:
# Predict the held-out split again (so this cell can be run on its own) and
# print the classification report for those predictions.
y_test_hat=model.predict(X_test)
print(classification_report(y_test, y_test_hat))

In [None]:
# Persist the trained classifier. The context manager closes the file even
# if pickle.dump raises part-way through.
filename = 'modelo5k'
with open(filename,'wb') as outfile:
    pickle.dump(model,outfile)
# Disabled shell command: copy the model to Google Drive.
# NOTE(review): it refers to 'modelo.pk', but the file written above is
# 'modelo5k' -- confirm the intended name before enabling it.
#!cp modelo.pk drive/My\ Drive/python/desafioML/