In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Number of documents per slice: the first SIZE shuffled rows are used for
# fitting, and the following SIZE rows are used as held-out data.
SIZE = 1_000

# Análise de dados

In [3]:
# Load the pickled frame and shuffle all of its rows (frac=1) with a fixed
# seed, so the positional slices taken in later cells are reproducible.
# NOTE(review): unpickling can execute arbitrary code — only load a trusted file.
dataset = pd.read_pickle('clean_train').sample(frac=1, random_state=42)
dataset

Unnamed: 0_level_0,pub_title,dataset_title,dataset_label,cleaned_label,text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5c3600bb-69d6-4091-8f54-98cf5f542436,Evaluating the Effects of a Coastal Spine on ...,"Sea, Lake, and Overland Surges from Hurricanes",SLOSH model,slosh model,"[{""section_title"": """", ""text"": ""\u2212 Residen..."
2656f4a5-c64d-440f-bf2e-d6146b9da21c,Demographic shift in COVID-19 patients in Sing...,Our World in Data COVID-19 dataset,Our World in Data,our world in data,"[{""section_title"": ""Introduction"", ""text"": ""Si..."
5a7feb99-c6ba-417d-b9d6-4a789bb12305,Response: Impact On Knowledge Of Farm Operators,Agricultural Resource Management Survey,Agricultural Resource Management Survey,agricultural resource management survey,"[{""section_title"": """", ""text"": ""I will discuss..."
f0014462-68dd-4d17-8a07-dbc9beba0a28,Immigration and Innovation: Chinese Graduate S...,Survey of Earned Doctorates,Survey of Earned Doctorates,survey of earned doctorates,"[{""section_title"": ""Abstract"", ""text"": ""ISBN 9..."
1cee4871-fdc0-462e-b29b-c3fabf13a7b5,Framework for understanding the patterns of st...,Beginning Postsecondary Student,Beginning Postsecondary Students,beginning postsecondary students,"[{""section_title"": ""Abstract"", ""text"": ""Abstra..."
...,...,...,...,...,...
e1c78694-d96b-487f-b445-fd692c5fb84e,Advances in longitudinal studies of amnestic m...,Alzheimer's Disease Neuroimaging Initiative (A...,ADNI,adni,"[{""section_title"": ""Abstract"", ""text"": ""Amnest..."
10a7d47c-cd38-4763-bb4b-e5804a670b90,Evaluation of national responses to COVID-19 p...,Our World in Data COVID-19 dataset,Our World in Data,our world in data,"[{""section_title"": ""Introduction"", ""text"": ""Th..."
622123b8-bed9-4f4f-b026-158e552f0839,Connectopathy in ageing and dementia,Alzheimer's Disease Neuroimaging Initiative (A...,ADNI,adni,"[{""section_title"": """", ""text"": ""Connectivity i..."
90dad306-ae3b-4016-9f60-cf45d76bc0f2,Serum 25(OH)D and Cognition: A Narrative Revie...,Baltimore Longitudinal Study of Aging (BLSA),Baltimore Longitudinal Study of Aging (BLSA),baltimore longitudinal study of aging blsa,"[{""section_title"": ""Abstract"", ""text"": ""The ef..."


### Pipeline do tutorial do SKLearn: CountVectorizer, TfidfTransformer, e MultinomialNB

### Começo criando e ajustando o algoritmo que classifica cada texto de acordo com o dataset que ele estaria citando

### CountVectorizer produz uma matriz esparsa com a frequência absoluta dos termos (palavras) em cada documento. Essa matriz esparsa se chama "document term matrix" (DTM)

In [4]:
# Learn the vocabulary from the first SIZE shuffled documents and count how
# often each term occurs in each of them (sparse document-term matrix).
CV = CountVectorizer()
document_term_matrix = CV.fit_transform(dataset["text"].head(SIZE))
document_term_matrix

<1000x134910 sparse matrix of type '<class 'numpy.int64'>'
	with 1311158 stored elements in Compressed Sparse Row format>

### TfidfTransformer usa a DTM obtida acima para produzir outra matriz esparsa, desta vez contendo a frequência relativa dos termos em cada documento, ponderada inversamente pela frequência desses termos nos outros documentos. Esta matriz se chama TF-IDF

In [5]:
# Fit the IDF weights on the training counts, then re-weight those same counts.
# The fitted TF object is kept so later cells can apply identical weights.
TF = TfidfTransformer().fit(document_term_matrix)
tfidf_doc_term_mtx = TF.transform(document_term_matrix)
tfidf_doc_term_mtx

<1000x134910 sparse matrix of type '<class 'numpy.float64'>'
	with 1311158 stored elements in Compressed Sparse Row format>

### MultinomialNB é um classificador do tipo *naive bayes* que é treinado com a matriz TF-IDF e com os rótulos (respostas) fornecidos no dataset

In [6]:
# Train naive Bayes on the TF-IDF rows, using the cleaned label of the same
# first SIZE documents as the target. fit() returns the estimator itself.
classifier = MultinomialNB().fit(
    X=tfidf_doc_term_mtx,
    y=dataset["cleaned_label"].iloc[:SIZE],
)
classifier

MultinomialNB()

### Testaremos agora nosso classificador

In [7]:
# Count terms for the SIZE rows that follow the training slice. transform()
# (not fit_transform) reuses the vocabulary already learned by CV.
DTM_new = CV.transform(dataset.iloc[SIZE:2 * SIZE]["text"])
DTM_new

<1000x134910 sparse matrix of type '<class 'numpy.int64'>'
	with 1220873 stored elements in Compressed Sparse Row format>

In [8]:
# Re-weight the held-out counts with the already-fitted TF transformer
# (transform only, no refit on the held-out rows).
TFIDF_new = TF.transform(X=DTM_new)
TFIDF_new

<1000x134910 sparse matrix of type '<class 'numpy.float64'>'
	with 1220873 stored elements in Compressed Sparse Row format>

In [9]:
# Predict one cleaned label per held-out document.
# NOTE(review): every prediction visible in the recorded output is 'adni' —
# worth checking the label distribution of the training slice.
predictions = classifier.predict(X=TFIDF_new)
predictions

array(['adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni',
       'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni',
       'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni',
       'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni',
       'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni',
       'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni',
       'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni',
       'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni',
       'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni',
       'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni',
       'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni',
       'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni',
       'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni',
       'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni', 'adni',
      