# Importação das bibliotecas

In [135]:
#Importações de bibliotecas

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder



# Configuração das bibliotecas

In [136]:
# Put the pandas display option for maximum column text width back to its
# default value.
pd.reset_option("display.max_colwidth")

# Importação dos CSVs e conversão em DataFrames pandas


In [137]:
# Column labels applied to grupos.csv in place of the file's own header row.
# NOTE(review): 'frequencial_total' looks like a typo for 'frequencia_total'
# (the spelling used in the nomes.csv column list); it is left unchanged
# because renaming the column could break code that refers to it.
gruposDFnomeDasColunas = [
    'nome',
    'classificacao',
    'frequencia_feminina',
    'frequencia_masculina',
    'frequencial_total',
    'proporcao',
    'nomes_alternativos',
]

# header=0 tells pandas that the first line of the file is a header row; it is
# replaced by the labels passed through `names`.
gruposDF = pd.read_csv(
    'grupos.csv',
    header=0,
    names=gruposDFnomeDasColunas,
)

# Preview the first rows of the loaded frame.
gruposDF.head()

Unnamed: 0,nome,classificacao,frequencia_feminina,frequencia_masculina,frequencial_total,proporcao,nomes_alternativos
0,ALINE,F,528515,2035,530550,0.996164,|AALINE|AILINE|ALEINE|ALIINE|ALINE|ALINER|ALIN...
1,ARAO,M,0,3526,3526,1.0,|AARAO|ARAAO|ARAO|
2,ARON,M,0,3442,3442,1.0,|AARON|AHARON|AROM|ARON|ARYON|HARON|
3,ADA,F,5294,289,5583,0.948236,|ABA|ADA|ADAH|ADAR|ADHA|HADA|
4,ABADE,M,0,57,57,1.0,|ABADE|


In [138]:
# Column labels applied to nomes.csv in place of the file's own header row.
nomesDSnomeDasColunas = [
    'nomes_alternativos',
    'classificacao',
    'primeiro_nome',
    'frequencia_feminina',
    'frequencia_masculina',
    'frequencia_total',
    'frequencia_grupo',
    'nome_grupo',
    'proporcao',
]

# header=0 tells pandas that the first line of the file is a header row; it is
# replaced by the labels passed through `names`.
nomesDF = pd.read_csv(
    'nomes.csv',
    header=0,
    names=nomesDSnomeDasColunas,
)

# Preview the first rows of the loaded frame.
nomesDF.head()

Unnamed: 0,nomes_alternativos,classificacao,primeiro_nome,frequencia_feminina,frequencia_masculina,frequencia_total,frequencia_grupo,nome_grupo,proporcao
0,AILINE|ALEINE|ALIINE|ALINE|ALINER|ALINHE|ALINN...,F,AALINE,66.0,,66,530550,ALINE,1.0
1,ARAAO|ARAO,M,AARAO,,281.0,281,3526,ARAO,1.0
2,AHARON|AROM|ARON|ARYON|HARON,M,AARON,,676.0,676,3442,ARON,1.0
3,ADA|ADAH|ADAR|ADHA|HADA,F,ABA,82.0,,82,5583,ADA,1.0
4,,M,ABADE,,57.0,57,57,ABADE,1.0


# Limpeza de nulos, n/a e duplicatas

In [139]:
# Replace every missing value with 0, then discard rows that are exact
# duplicates of an earlier row. fillna(0) is applied to all columns, so missing
# entries in text columns also become 0.
gruposDF = gruposDF.fillna(0).drop_duplicates()
nomesDF = nomesDF.fillna(0).drop_duplicates()

# Unir dataframes

In [140]:
# Both frames keep the same three data columns plus their own name column
# ('nome' in gruposDF, 'primeiro_nome' in nomesDF).
_colunasComuns = ['frequencia_feminina', 'frequencia_masculina', 'classificacao']

gruposSelecionado = gruposDF[['nome'] + _colunasComuns]
nomesSelecionado = nomesDF[['primeiro_nome'] + _colunasComuns]

In [141]:
# Give the name column the same label as in gruposSelecionado so the two
# frames can be merged on 'nome'.
_renomear = {'primeiro_nome': 'nome'}
nomesSelecionado = nomesSelecionado.rename(columns=_renomear)

In [142]:
# Preview the first five rows of the per-name selection.
nomesSelecionado.head(n=5)

Unnamed: 0,nome,frequencia_feminina,frequencia_masculina,classificacao
0,AALINE,66.0,0.0,F
1,AARAO,0.0,281.0,M
2,AARON,0.0,676.0,M
3,ABA,82.0,0.0,F
4,ABADE,0.0,57.0,M


In [143]:
# Preview the first five rows of the per-group selection.
gruposSelecionado.head(n=5)

Unnamed: 0,nome,frequencia_feminina,frequencia_masculina,classificacao
0,ALINE,528515,2035,F
1,ARAO,0,3526,M
2,ARON,0,3442,M
3,ADA,5294,289,F
4,ABADE,0,57,M


In [144]:
# Outer join on 'nome': names present in only one of the two frames are kept.
# Non-key columns that exist in both frames receive the default "_x" (left,
# gruposSelecionado) and "_y" (right, nomesSelecionado) suffixes.
data = gruposSelecionado.merge(nomesSelecionado, how='outer', on='nome')

In [145]:
# Keep a single row per name; when a name occurs more than once the first
# occurrence is the one retained.
data = data.drop_duplicates(subset='nome', keep='first')

# Formatação e encoding

In [146]:
# Total frequency per gender = value from the left frame ("_x") plus value from
# the right frame ("_y").
#
# The outer merge leaves NaN in the "_x" columns for names that exist only in
# the right frame, and in the "_y" columns for names that exist only in the
# left frame. NaN + number is NaN, so without the fillna(0) below those names
# would end up with a missing total instead of the frequency that is known.
data['frequenciaFeminina'] = (
    data['frequencia_feminina_x'].fillna(0)
    + data['frequencia_feminina_y'].fillna(0)
)
data['frequenciaMasculina'] = (
    data['frequencia_masculina_x'].fillna(0)
    + data['frequencia_masculina_y'].fillna(0)
)

# Preview the result.
data.head()

Unnamed: 0,nome,frequencia_feminina_x,frequencia_masculina_x,classificacao_x,frequencia_feminina_y,frequencia_masculina_y,classificacao_y,frequenciaFeminina,frequenciaMasculina
0,ALINE,528515.0,2035.0,F,509869.0,1868.0,F,1038384.0,3903.0
1,ARAO,0.0,3526.0,M,0.0,3078.0,M,0.0,6604.0
2,ARON,0.0,3442.0,M,0.0,2269.0,M,0.0,5711.0
3,ADA,5294.0,289.0,F,5029.0,266.0,F,10323.0,555.0
4,ABADE,0.0,57.0,M,0.0,57.0,M,0.0,114.0


In [147]:
# Keep only the columns of interest.
#
# 'classificacao_x' comes from the left side of the outer merge and is NaN for
# names that exist only in the right frame, so it is completed with
# 'classificacao_y' before the suffixed columns are discarded. Otherwise those
# names would carry a missing label into the encoding step.
#
# The result is built as a new frame (assign + column selection + copy) rather
# than renaming a column subset in place, so later column assignments operate
# on an independent DataFrame.
data = data.assign(
    classificacao=data['classificacao_x'].fillna(data['classificacao_y'])
)[['nome', 'frequenciaFeminina', 'frequenciaMasculina', 'classificacao']].copy()

# Preview the result.
data.head()

Unnamed: 0,nome,frequenciaFeminina,frequenciaMasculina,classificacao
0,ALINE,1038384.0,3903.0,F
1,ARAO,0.0,6604.0,M
2,ARON,0.0,5711.0,M
3,ADA,10323.0,555.0,F
4,ABADE,0.0,114.0,M


In [148]:
# Learn the distinct labels found in 'classificacao', then store their integer
# codes in a new column. The fitted encoder stays available as `labelEncoder`.
labelEncoder = LabelEncoder().fit(data['classificacao'])
data['classificacaoCodificada'] = labelEncoder.transform(data['classificacao'])

# Preview the result.
data.head()

Unnamed: 0,nome,frequenciaFeminina,frequenciaMasculina,classificacao,classificacaoCodificada
0,ALINE,1038384.0,3903.0,F,0
1,ARAO,0.0,6604.0,M,1
2,ARON,0.0,5711.0,M,1
3,ADA,10323.0,555.0,F,0
4,ABADE,0.0,114.0,M,1


In [149]:
# Encoding shown in the preceding output: F -> 0, M -> 1.
# The text label is no longer needed once the encoded column exists.
data = data.drop(columns=["classificacao"])

# Salvar CSV

In [150]:
# Write the final frame to disk; index=False leaves the row index out of the
# file.
data.to_csv(path_or_buf='data.csv', index=False)