# Importação das bibliotecas 

In [124]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler



# Configuração das bibliotecas

In [125]:
# Put pandas' 'display.max_colwidth' option back to its default value, so long cell
# contents are shown with the library's standard truncation.
pd.reset_option('display.max_colwidth')

# Importação dos CSVs e conversão em DataFrames do pandas

In [126]:
# Column names to apply to grupos.csv, in the order the columns appear in the file.
gruposDFnomeDasColunas = [
    "nome",
    "classificacao",
    "frequencia_feminina",
    "frequencia_masculina",
    "frequencia_total",
    "proporcao",
    "nomes_alternativos"    
]
# header=0 treats the file's first row as a header and replaces it with the names above.
gruposDF = pd.read_csv('grupos.csv', names=gruposDFnomeDasColunas, header=0)
# Preview the first rows of the freshly loaded frame.
gruposDF.head()

Unnamed: 0,nome,classificacao,frequencia_feminina,frequencia_masculina,frequencia_total,proporcao,nomes_alternativos
0,ALINE,F,528515,2035,530550,0.996164,|AALINE|AILINE|ALEINE|ALIINE|ALINE|ALINER|ALIN...
1,ARAO,M,0,3526,3526,1.0,|AARAO|ARAAO|ARAO|
2,ARON,M,0,3442,3442,1.0,|AARON|AHARON|AROM|ARON|ARYON|HARON|
3,ADA,F,5294,289,5583,0.948236,|ABA|ADA|ADAH|ADAR|ADHA|HADA|
4,ABADE,M,0,57,57,1.0,|ABADE|


In [127]:
# Column names to apply to nomes.csv, in the order the columns appear in the file.
nomesDSnomeDasColunas = [
    "nomes_alternativos",
    "classificacao",
    "primeiro_nome",
    "frequencia_feminina",
    "frequencia_masculina",
    "frequencia_total",
    "frequencia_grupo",
    "nome_grupo",
    "proporcao"
]
# header=0 treats the file's first row as a header and replaces it with the names above.
nomesDF= pd.read_csv("nomes.csv",names=nomesDSnomeDasColunas,header=0)
# Preview the first rows of the freshly loaded frame.
nomesDF.head()

Unnamed: 0,nomes_alternativos,classificacao,primeiro_nome,frequencia_feminina,frequencia_masculina,frequencia_total,frequencia_grupo,nome_grupo,proporcao
0,AILINE|ALEINE|ALIINE|ALINE|ALINER|ALINHE|ALINN...,F,AALINE,66.0,,66,530550,ALINE,1.0
1,ARAAO|ARAO,M,AARAO,,281.0,281,3526,ARAO,1.0
2,AHARON|AROM|ARON|ARYON|HARON,M,AARON,,676.0,676,3442,ARON,1.0
3,ADA|ADAH|ADAR|ADHA|HADA,F,ABA,82.0,,82,5583,ADA,1.0
4,,M,ABADE,,57.0,57,57,ABADE,1.0


# Limpeza de valores nulos (NaN) e de linhas duplicadas

In [128]:
# Replace every missing value with 0 and then discard fully duplicated rows.
# Note that fillna(0) is applied to all columns, text columns included.
gruposDF = gruposDF.fillna(0).drop_duplicates()
nomesDF = nomesDF.fillna(0).drop_duplicates()

# Criando novos dados 

## Porcentagem de cada classe

In [129]:
# Preview gruposDF after cleaning, before the percentage columns are derived.
gruposDF.head()

Unnamed: 0,nome,classificacao,frequencia_feminina,frequencia_masculina,frequencia_total,proporcao,nomes_alternativos
0,ALINE,F,528515,2035,530550,0.996164,|AALINE|AILINE|ALEINE|ALIINE|ALINE|ALINER|ALIN...
1,ARAO,M,0,3526,3526,1.0,|AARAO|ARAAO|ARAO|
2,ARON,M,0,3442,3442,1.0,|AARON|AHARON|AROM|ARON|ARYON|HARON|
3,ADA,F,5294,289,5583,0.948236,|ABA|ADA|ADAH|ADAR|ADHA|HADA|
4,ABADE,M,0,57,57,1.0,|ABADE|


In [130]:
# Remove the columns that are not used from here on.
# The frames are rebound instead of mutated in place, and errors="ignore" lets the cell
# be re-run safely: the original in-place drop raised KeyError on a second execution
# because the columns were already gone.
gruposDF = gruposDF.drop(columns=["nomes_alternativos"], errors="ignore")
nomesDF = nomesDF.drop(
    columns=["nomes_alternativos", "frequencia_grupo", "nome_grupo"],
    errors="ignore",
)

In [131]:
# Create the two percentage columns (feminina first, then masculina) with a placeholder 0
# in both frames; the real values are computed in later cells.
gruposDF = gruposDF.assign(porcentagem_feminina=0, porcentagem_masculina=0)
nomesDF = nomesDF.assign(porcentagem_feminina=0, porcentagem_masculina=0)

In [132]:
# Give nomesDF the same key column name ("nome") that gruposDF uses.
nomesDF = nomesDF.rename(columns={"primeiro_nome": "nome"})

In [133]:
# Share of male and of female records within each group's total, rounded to 7 decimals.
# Computed column-wise on the whole frame at once.
gruposDF["porcentagem_masculina"] = (
    gruposDF["frequencia_masculina"].div(gruposDF["frequencia_total"]).round(7)
)
gruposDF["porcentagem_feminina"] = (
    gruposDF["frequencia_feminina"].div(gruposDF["frequencia_total"]).round(7)
)
    

In [134]:
# Preview gruposDF to check the newly computed porcentagem_* columns.
gruposDF.head()

Unnamed: 0,nome,classificacao,frequencia_feminina,frequencia_masculina,frequencia_total,proporcao,porcentagem_feminina,porcentagem_masculina
0,ALINE,F,528515,2035,530550,0.996164,0.996164,0.003836
1,ARAO,M,0,3526,3526,1.0,0.0,1.0
2,ARON,M,0,3442,3442,1.0,0.0,1.0
3,ADA,F,5294,289,5583,0.948236,0.948236,0.051764
4,ABADE,M,0,57,57,1.0,0.0,1.0


In [135]:
# Share of male and of female records within each name's total, rounded to 7 decimals.
nomesDF["porcentagem_masculina"] = (
    nomesDF["frequencia_masculina"].div(nomesDF["frequencia_total"]).round(7)
)
nomesDF["porcentagem_feminina"] = (
    nomesDF["frequencia_feminina"].div(nomesDF["frequencia_total"]).round(7)
)
    

In [136]:
# Sanity check: show any nomesDF rows where both computed shares are exactly 0.
nomesDF[(nomesDF["porcentagem_feminina"] == 0) & (nomesDF["porcentagem_masculina"] == 0)]

Unnamed: 0,classificacao,nome,frequencia_feminina,frequencia_masculina,frequencia_total,proporcao,porcentagem_feminina,porcentagem_masculina


In [137]:
# Sanity check: show any gruposDF rows where both computed shares are exactly 0.
gruposDF[(gruposDF["porcentagem_feminina"] == 0) & (gruposDF["porcentagem_masculina"] == 0)]

Unnamed: 0,nome,classificacao,frequencia_feminina,frequencia_masculina,frequencia_total,proporcao,porcentagem_feminina,porcentagem_masculina


## Colunas de cada letra

In [138]:
# Length, in characters, of the longest name in gruposDF.
max(map(len, gruposDF['nome']))

14

In [139]:
# Spot-check the gruposDF row for one name (ALICE).
gruposDF[gruposDF["nome"] == "ALICE"]

Unnamed: 0,nome,classificacao,frequencia_feminina,frequencia_masculina,frequencia_total,proporcao,porcentagem_feminina,porcentagem_masculina
2789,ALICE,F,194940,1318,196258,0.993284,0.993284,0.006716


In [140]:
# Spot-check the nomesDF row for the same name (ALICE) to compare with gruposDF.
nomesDF[nomesDF["nome"] == "ALICE"]

Unnamed: 0,classificacao,nome,frequencia_feminina,frequencia_masculina,frequencia_total,proporcao,porcentagem_feminina,porcentagem_masculina
4149,F,ALICE,191430.0,830.0,192260,0.995683,0.995683,0.004317


In [141]:
# Preview nomesDF with its final set of columns.
nomesDF.head()

Unnamed: 0,classificacao,nome,frequencia_feminina,frequencia_masculina,frequencia_total,proporcao,porcentagem_feminina,porcentagem_masculina
0,F,AALINE,66.0,0.0,66,1.0,1.0,0.0
1,M,AARAO,0.0,281.0,281,1.0,0.0,1.0
2,M,AARON,0.0,676.0,676,1.0,0.0,1.0
3,F,ABA,82.0,0.0,82,1.0,1.0,0.0
4,M,ABADE,0.0,57.0,57,1.0,0.0,1.0


In [142]:
# Preview gruposDF with its final set of columns.
gruposDF.head()

Unnamed: 0,nome,classificacao,frequencia_feminina,frequencia_masculina,frequencia_total,proporcao,porcentagem_feminina,porcentagem_masculina
0,ALINE,F,528515,2035,530550,0.996164,0.996164,0.003836
1,ARAO,M,0,3526,3526,1.0,0.0,1.0
2,ARON,M,0,3442,3442,1.0,0.0,1.0
3,ADA,F,5294,289,5583,0.948236,0.948236,0.051764
4,ABADE,M,0,57,57,1.0,0.0,1.0


In [143]:
# Index both frames by name so that they can be aligned row-by-row on it.
# Each step is guarded: once 'nome' is already the index it is no longer a column, and
# calling set_index('nome') again (e.g. when the cell is re-run) would raise KeyError.
if nomesDF.index.name != 'nome':
    nomesDF = nomesDF.set_index('nome')
if gruposDF.index.name != 'nome':
    gruposDF = gruposDF.set_index('nome')

In [144]:
# Combine the two tables on their index: values from gruposDF take priority, and nomesDF
# only fills cells that are missing in gruposDF or contributes index labels gruposDF lacks.
# reset_index() then turns the index back into a regular column.
data = gruposDF.combine_first(nomesDF).reset_index()

In [145]:
# Preview the combined table.
data.head()

Unnamed: 0,nome,classificacao,frequencia_feminina,frequencia_masculina,frequencia_total,proporcao,porcentagem_feminina,porcentagem_masculina
0,AABRAO,M,0.0,26.0,26,1.0,0.0,1.0
1,AADRIANA,F,94.0,0.0,94,1.0,1.0,0.0
2,AADRIANO,M,0.0,53.0,53,1.0,0.0,1.0
3,AAILTON,M,0.0,23.0,23,1.0,0.0,1.0
4,AALAN,M,0.0,27.0,27,1.0,0.0,1.0


In [146]:
def extrair_letras(nome, n):
    """Return the character at index ``n`` of ``nome``.

    An empty string is returned when ``nome`` has no character at that index
    (i.e. its length is not greater than ``n``).
    """
    if n < len(nome):
        return nome[n]
    return ''

# Number of per-position letter columns to create: the length of the longest name that is
# actually present in `data`. The previous hard-coded 14 was measured on gruposDF alone,
# so any longer name coming from nomesDF would have been silently truncated.
max_letras = max(map(len, data['nome']))

# letra_1 holds each name's first character, letra_2 the second, and so on; names shorter
# than a given position get an empty string there.
for i in range(max_letras):
    coluna = f'letra_{i+1}'
    data[coluna] = data['nome'].apply(extrair_letras, args=(i,))

data.head()

Unnamed: 0,nome,classificacao,frequencia_feminina,frequencia_masculina,frequencia_total,proporcao,porcentagem_feminina,porcentagem_masculina,letra_1,letra_2,...,letra_5,letra_6,letra_7,letra_8,letra_9,letra_10,letra_11,letra_12,letra_13,letra_14
0,AABRAO,M,0.0,26.0,26,1.0,0.0,1.0,A,A,...,A,O,,,,,,,,
1,AADRIANA,F,94.0,0.0,94,1.0,1.0,0.0,A,A,...,I,A,N,A,,,,,,
2,AADRIANO,M,0.0,53.0,53,1.0,0.0,1.0,A,A,...,I,A,N,O,,,,,,
3,AAILTON,M,0.0,23.0,23,1.0,0.0,1.0,A,A,...,T,O,N,,,,,,,
4,AALAN,M,0.0,27.0,27,1.0,0.0,1.0,A,A,...,N,,,,,,,,,


In [147]:
# Select only the raw per-position columns (letra_1, letra_2, ...). A plain
# startswith('letra_') test would also match the one-hot columns (letra_1_A, ...) once
# they have been appended to `data`, which breaks this cell when it is re-run.
colunas_letras = data.filter(regex=r'^letra_\d+$').columns.tolist()

# One-hot encode each (position, letter) pair into its own indicator column.
dummies_letras = pd.get_dummies(data[colunas_letras])
dummies_letras.head()

Unnamed: 0,letra_1_A,letra_1_B,letra_1_C,letra_1_D,letra_1_E,letra_1_F,letra_1_G,letra_1_H,letra_1_I,letra_1_J,...,letra_13_A,letra_13_D,letra_13_E,letra_13_I,letra_13_N,letra_13_O,letra_13_R,letra_13_S,letra_14_,letra_14_N
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [148]:
# Append the one-hot columns to the main table.
# Any indicator columns already present are dropped first, so re-running the cell does not
# leave `data` with duplicated column names.
data = pd.concat(
    [data.drop(columns=dummies_letras.columns, errors='ignore'), dummies_letras],
    axis=1,
)

data.head()

Unnamed: 0,nome,classificacao,frequencia_feminina,frequencia_masculina,frequencia_total,proporcao,porcentagem_feminina,porcentagem_masculina,letra_1,letra_2,...,letra_13_A,letra_13_D,letra_13_E,letra_13_I,letra_13_N,letra_13_O,letra_13_R,letra_13_S,letra_14_,letra_14_N
0,AABRAO,M,0.0,26.0,26,1.0,0.0,1.0,A,A,...,0,0,0,0,0,0,0,0,1,0
1,AADRIANA,F,94.0,0.0,94,1.0,1.0,0.0,A,A,...,0,0,0,0,0,0,0,0,1,0
2,AADRIANO,M,0.0,53.0,53,1.0,0.0,1.0,A,A,...,0,0,0,0,0,0,0,0,1,0
3,AAILTON,M,0.0,23.0,23,1.0,0.0,1.0,A,A,...,0,0,0,0,0,0,0,0,1,0
4,AALAN,M,0.0,27.0,27,1.0,0.0,1.0,A,A,...,0,0,0,0,0,0,0,0,1,0


In [149]:
# Persist the engineered table. index=False keeps the positional row index out of the
# file; otherwise it comes back as an extra 'Unnamed: 0' column when the CSV is read again.
data.to_csv("data.csv", index=False)

In [None]:
# ---------------------------------------------------------------------------
# Data preparation, hyper-parameter search and final model, in execution order.
# The grid search used to sit in a cell *before* the one that created X_train / y_train,
# while that cell in turn needed `best_knn` from the grid search, so neither could run
# on a fresh kernel. The sklearn names used here come from the notebook's import cell.
# ---------------------------------------------------------------------------

# Load the engineered table written by the feature-engineering section.
data = pd.read_csv('data.csv')

# Raw per-position letter columns only (letra_1, letra_2, ...). The CSV also contains the
# previously generated one-hot columns (letra_1_A, ...); selecting those as well would
# duplicate every feature.
colunas_letras = data.filter(regex=r'^letra_\d+$').columns.tolist()

# One-hot encode the letters. Positions beyond the end of a name are read back from the
# CSV as missing values and therefore get no indicator column (all zeros for that slot),
# which matches how a new input name is encoded at prediction time.
dummies_letras = pd.get_dummies(data[colunas_letras], dtype=int)

# Features are the letter indicators ONLY. The frequency / percentage / proportion columns
# are derived from the per-sex counts and would leak the target into the model; they are
# also unavailable (zero-filled) when classifying a new name.
X = dummies_letras
y = data['classificacao']  # Target

# Encode the target labels as integers.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Standardise the features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Search for the best number of neighbours with 5-fold cross-validation.
param_grid = {'n_neighbors': [3, 5, 7, 9, 11, 13, 15]}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best estimator (already refitted on the whole training split) and its n_neighbors.
best_knn = grid_search.best_estimator_
melhor_n_vizinhos = grid_search.best_params_['n_neighbors']
print('Melhor número de vizinhos:', melhor_n_vizinhos)

# Accuracy of the grid-search winner on the held-out split.
y_pred = best_knn.predict(X_test)
print('Acurácia com GridSearchCV:', accuracy_score(y_test, y_pred))

# Final model used by the prediction cells below. n_neighbors must be the integer found
# by the search; passing the estimator object `best_knn` here was a bug.
knn = KNeighborsClassifier(n_neighbors=melhor_n_vizinhos)
knn.fit(X_train, y_train)

# Accuracy of the final model on the held-out split.
y_pred = knn.predict(X_test)
print('Acurácia:', accuracy_score(y_test, y_pred))

In [None]:
# Example name to classify.
input_nome = "EXEMPLO"

# Single-row frame with one column per character position (letra_1, letra_2, ...),
# then one-hot encoded the same way as the training letters.
input_letras = pd.DataFrame(
    {f'letra_{posicao}': [letra] for posicao, letra in enumerate(input_nome, start=1)}
)
input_dummies = pd.get_dummies(input_letras)
input_dummies

In [None]:
# Feature columns present in X but not produced by this input name.
missing_cols = list(set(X.columns) - set(input_dummies.columns))
# Zero-filled frame for those columns, sharing the input's row index.
missing_dummies = pd.DataFrame(0, index=input_dummies.index, columns=missing_cols)

# Append the zero-filled columns to the input's own indicator columns.
input_dummies = pd.concat([input_dummies, missing_dummies], axis=1)

In [None]:

# Reorder (and restrict) the columns so they match the layout of X exactly.
input_dummies = input_dummies[X.columns]

In [None]:
# Apply the already-fitted scaler to the encoded input.
input_normalizado = scaler.transform(input_dummies)

In [None]:
# Predict the (integer-encoded) class for the input with the trained KNN model.
predicao = knn.predict(input_normalizado)

In [None]:
# Map the integer prediction back to its original class label.
# label_encoder was already fitted on y in the training cell, so it is not re-fitted here.
classe_predita = label_encoder.inverse_transform(predicao)

In [None]:
# Show the decoded label of the single predicted row.
print('Classe predita:', classe_predita[0])

In [None]:
def testaModelo(input_nome):
    """Classify a single name with the trained model and print the result.

    Uses the notebook-level objects created by the training cell: ``X`` (only its
    column layout is read), the fitted ``scaler``, the fitted ``knn`` model and the
    fitted ``label_encoder``.

    Parameters
    ----------
    input_nome : str
        Name to classify. Letters are matched as-is against the training columns
        (``letra_<position>_<letter>``), so the comparison is case-sensitive.

    Returns
    -------
    The decoded class label predicted for ``input_nome`` (also printed).
    """
    # One column per character position, then one-hot encoded like the training data.
    input_letras = pd.DataFrame([list(input_nome)], columns=[f'letra_{i+1}' for i in range(len(input_nome))])
    input_dummies = pd.get_dummies(input_letras)

    # Align with the training layout in one step: columns the input did not produce are
    # added as 0, columns unknown to the model are dropped, and the order follows X.
    input_dummies = input_dummies.reindex(columns=X.columns, fill_value=0)

    input_normalizado = scaler.transform(input_dummies)
    predicao = knn.predict(input_normalizado)

    # label_encoder is already fitted by the training cell; no re-fit is needed to decode.
    classe_predita = label_encoder.inverse_transform(predicao)

    # The original f-string was missing its closing quote (SyntaxError). The scalar label
    # is printed, as the standalone prediction cell does with classe_predita[0].
    print(f"input classificado como {classe_predita[0]}")
    return classe_predita[0]