In [None]:
import json, os
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split

def concatenar_listas_json(pasta):
    """Concatenate the top-level lists of every ``.json`` file in a directory.

    Files are visited in sorted filename order. ``os.listdir`` gives no
    ordering guarantee, and the row order produced here feeds a seeded
    train/test split, so a deterministic order is needed for reproducible
    results.

    Args:
        pasta: Path of the directory to scan (not recursive).

    Returns:
        list: The items of every file whose top-level JSON value is a list,
        in filename order. Files that fail to parse, or whose top-level value
        is not a list, are reported on stdout and skipped.

    Raises:
        OSError: If ``pasta`` cannot be listed or a file cannot be opened.
    """
    lista_concatenada = []
    for nome_arquivo in sorted(os.listdir(pasta)):
        if not nome_arquivo.endswith('.json'):
            continue
        caminho_arquivo = os.path.join(pasta, nome_arquivo)
        try:
            # Keep the file open only for the duration of the parse.
            with open(caminho_arquivo, 'r', encoding='utf-8') as f:
                dados = json.load(f)
        except json.JSONDecodeError as e:
            print(f"Erro ao ler {nome_arquivo}: {e}")
            continue
        if isinstance(dados, list):
            lista_concatenada.extend(dados)
        else:
            # Previously dropped silently; report it so missing rows are traceable.
            print(f"Ignorando {nome_arquivo}: o conteúdo não é uma lista")

    return lista_concatenada

# Directory holding the JSON records, relative to the current working directory.
# Built with os.path.join so it also resolves on POSIX systems; on Windows this
# still evaluates to '.\aceitos'.
pasta = os.path.join('.', 'aceitos')
data = concatenar_listas_json(pasta)

# One row per record; the "texto", "tecnologia" and "score" columns are read below.
df = pd.DataFrame(data)

# The same sentence-embedding model encodes both text fields.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
text_embeddings = model.encode(df["texto"].tolist())
tech_embeddings = model.encode(df["tecnologia"].tolist())

# Regression target.
y = df["score"].values

# Feature matrix: [text embedding | technology embedding], joined column-wise.
X = np.concatenate([text_embeddings, tech_embeddings], axis=1)

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# With max_iter=300 the lbfgs solver stopped at the iteration limit on this data
# (scikit-learn ConvergenceWarning), so the optimizer gets a larger budget.
mlp = MLPRegressor(max_iter=1000, random_state=42)

# MLP hyperparameter grid.
# 'learning_rate' is deliberately not searched: scikit-learn only uses it when
# solver='sgd', so with adam/lbfgs it would double the number of fits while
# producing identical models.
param_grid_mlp = {
    'hidden_layer_sizes': [(32,), (64,), (128,), (256,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'lbfgs'],
    'alpha': [0.01, 0.1, 1, 10],
}

# 5-fold cross-validated search, selecting by (negated) mean squared error.
grid_mlp = GridSearchCV(mlp, param_grid_mlp, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_mlp.fit(X_train, y_train)
best_mlp = grid_mlp.best_estimator_

print(f"Melhores parâmetros MLP: {grid_mlp.best_params_}")

# MSE of the selected model on the held-out test split.
y_pred_mlp = best_mlp.predict(X_test)
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
print(f"MSE MLP: {mse_mlp:.2f}")

Melhores parâmetros MLP: {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (128,), 'learning_rate': 'constant', 'solver': 'lbfgs'}
MSE MLP: 115.23


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [None]:
# Score one unseen (description, technology) pair with the selected MLP.
novo_texto = "Criei data lakes e supri dados para uma aplicação utilizando python, mysql e bigquery que envolve aprendizado de bigdata."
nova_tecnologia = "python"

# Encode each field as its own single-item batch, text first.
embedding_texto, embedding_tecnologia = (
    model.encode([campo]) for campo in (novo_texto, nova_tecnologia)
)

# Join column-wise in the same [text | technology] order used to build X.
embedding_novo = np.hstack((embedding_texto, embedding_tecnologia))

# Exactly one row goes in, so exactly one prediction comes out.
(score_mlp,) = best_mlp.predict(embedding_novo)

print(f"MLP: {score_mlp:.2f}")

MLP: 73.74


In [None]:
import joblib

# Persist the selected estimator to disk in the current working directory.
joblib.dump(value=best_mlp, filename='modelo_mlp_treinado.pkl')

In [None]:
# Reload the persisted estimator. joblib.load unpickles the file, so only use it
# on files from a trusted source (here, the file written by this notebook).
modelo_carregado = joblib.load(filename='modelo_mlp_treinado.pkl')