In [1]:
import pymongo
import openml
import json
import pandas as pd
from sklearn import datasets

# Criando uma database e um collection para o mongo

In [2]:
myClient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myClient["desafio"]
myCol = mydb["diabete"]

# Capturando todos os dados e inserindo em uma dataset

In [3]:
dataset = openml.datasets.get_dataset(37)

In [4]:
type(dataset)

openml.datasets.dataset.OpenMLDataset

In [5]:
print(dataset)

OpenML Dataset
Name..........: diabetes
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:22:13
Licence.......: Public
Download URL..: https://www.openml.org/data/v1/download/37/diabetes.arff
OpenML URL....: https://www.openml.org/d/37
# of features.: 9
# of instances: 768


In [6]:
print(dataset.get_data(dataset_format = "dataframe", target = dataset.default_target_attribute))

(     preg  plas  pres  skin   insu  mass   pedi  age
0       6   148    72    35    0.0  33.6  0.627   50
1       1    85    66    29    0.0  26.6  0.351   31
2       8   183    64     0    0.0  23.3  0.672   32
3       1    89    66    23   94.0  28.1  0.167   21
4       0   137    40    35  168.0  43.1  2.288   33
..    ...   ...   ...   ...    ...   ...    ...  ...
763    10   101    76    48  180.0  32.9  0.171   63
764     2   122    70    27    0.0  36.8  0.340   27
765     5   121    72    23  112.0  26.2  0.245   30
766     1   126    60     0    0.0  30.1  0.349   47
767     1    93    70    31    0.0  30.4  0.315   23

[768 rows x 8 columns], 0      tested_positive
1      tested_negative
2      tested_positive
3      tested_negative
4      tested_positive
            ...       
763    tested_negative
764    tested_negative
765    tested_negative
766    tested_positive
767    tested_negative
Name: class, Length: 768, dtype: category
Categories (2, object): ['tested_negative' 

In [7]:
info = dataset.get_data(dataset_format = "dataframe", target = dataset.default_target_attribute)

In [8]:
type(info)

tuple

# Analisando os dados capturados do dataset

In [9]:
info[0]

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age
0,6,148,72,35,0.0,33.6,0.627,50
1,1,85,66,29,0.0,26.6,0.351,31
2,8,183,64,0,0.0,23.3,0.672,32
3,1,89,66,23,94.0,28.1,0.167,21
4,0,137,40,35,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180.0,32.9,0.171,63
764,2,122,70,27,0.0,36.8,0.340,27
765,5,121,72,23,112.0,26.2,0.245,30
766,1,126,60,0,0.0,30.1,0.349,47


In [10]:
info[0]['preg'][0]

6

In [11]:
info[1][0]

'tested_positive'

# Transformando o dataset em DataFrame

In [12]:
df = pd.DataFrame(info[0])

In [13]:
type(df)

pandas.core.frame.DataFrame

In [14]:
a_renomear = {
    'preg': 'num_vezes_gestacao',
    'plas': 'plasma',
    'pres': 'pressao_distolica',
    'skin': 'dobra_triceps',
    'insu': 'insulina',
    'mass': 'massa',
    'pedi': 'funcao_pedigree',
    'age': 'idade'
}

In [15]:
df = df.rename(columns = a_renomear)

In [16]:
df.head(10)

Unnamed: 0,num_vezes_gestacao,plasma,pressao_distolica,dobra_triceps,insulina,massa,funcao_pedigree,idade
0,6,148,72,35,0.0,33.6,0.627,50
1,1,85,66,29,0.0,26.6,0.351,31
2,8,183,64,0,0.0,23.3,0.672,32
3,1,89,66,23,94.0,28.1,0.167,21
4,0,137,40,35,168.0,43.1,2.288,33
5,5,116,74,0,0.0,25.6,0.201,30
6,3,78,50,32,88.0,31.0,0.248,26
7,10,115,0,0,0.0,35.3,0.134,29
8,2,197,70,45,543.0,30.5,0.158,53
9,8,125,96,0,0.0,0.0,0.232,54


In [17]:
for i in range(len(df)):
    if info[1][i] == 'tested_negative':
        df.loc[i, "resultado"] = 0
    if info[1][i] == 'tested_positive':
        df.loc[i, "resultado"] = 1

In [18]:
df.head(10)

Unnamed: 0,num_vezes_gestacao,plasma,pressao_distolica,dobra_triceps,insulina,massa,funcao_pedigree,idade,resultado
0,6,148,72,35,0.0,33.6,0.627,50,1.0
1,1,85,66,29,0.0,26.6,0.351,31,0.0
2,8,183,64,0,0.0,23.3,0.672,32,1.0
3,1,89,66,23,94.0,28.1,0.167,21,0.0
4,0,137,40,35,168.0,43.1,2.288,33,1.0
5,5,116,74,0,0.0,25.6,0.201,30,0.0
6,3,78,50,32,88.0,31.0,0.248,26,1.0
7,10,115,0,0,0.0,35.3,0.134,29,0.0
8,2,197,70,45,543.0,30.5,0.158,53,1.0
9,8,125,96,0,0.0,0.0,0.232,54,1.0


# Transformando os dados do DataFrame em JSON

In [19]:
resultado_em_json = df.to_json()

In [20]:
aux = json.loads(resultado_em_json)

# Inserindo os dados da DataFrame no mongoDB

In [21]:
insercao_db = myCol.insert_one(aux)