In [46]:
import json
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split

with open("exemplos_detalhados_tecnologias_mvp.json", "r", encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame(data)
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
text_embeddings = model.encode(df["texto"].tolist())
tech_embeddings = model.encode(df["tecnologia"].tolist())
y = df["score"].values
X = np.concatenate([text_embeddings, tech_embeddings], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

ridge = Ridge()
mlp = MLPRegressor(max_iter=300, random_state=42)
rf = RandomForestRegressor(random_state=42)
gb = GradientBoostingRegressor(random_state=42)

# Ridge
param_grid_ridge = {
    'alpha': [0.01, 0.1, 1, 10, 100]
}
grid_ridge = GridSearchCV(ridge, param_grid_ridge, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_ridge.fit(X_train, y_train)
best_ridge = grid_ridge.best_estimator_

# MLP
param_grid_mlp = {
    'hidden_layer_sizes': [(32,), (64,), (128,), (256,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'lbfgs'],
    'alpha': [0.01, 0.1, 1, 10],
    'learning_rate': ['constant', 'adaptive']
}
grid_mlp = GridSearchCV(mlp, param_grid_mlp, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_mlp.fit(X_train, y_train)
best_mlp = grid_mlp.best_estimator_

# Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_

# Gradient Boosting
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'min_samples_split': [2, 5]
}
grid_gb = GridSearchCV(gb, param_grid_gb, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_gb.fit(X_train, y_train)
best_gb = grid_gb.best_estimator_

print("\nMelhores parâmetros encontrados para cada modelo:")
print(f"Melhores parâmetros Ridge: {grid_ridge.best_params_}")
print(f"Melhores parâmetros MLP: {grid_mlp.best_params_}")
print(f"Melhores parâmetros Random Forest: {grid_rf.best_params_}")
print(f"Melhores parâmetros Gradient Boosting: {grid_gb.best_params_}")

# MSE
y_pred_ridge = best_ridge.predict(X_test)
y_pred_mlp = best_mlp.predict(X_test)
y_pred_rf = best_rf.predict(X_test)
y_pred_gb = best_gb.predict(X_test)

mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
mse_rf = mean_squared_error(y_test, y_pred_rf)
mse_gb = mean_squared_error(y_test, y_pred_gb)

print("\nMSE de cada modelo:")
print(f"MSE Ridge: {mse_ridge:.2f}")
print(f"MSE MLP: {mse_mlp:.2f}")
print(f"MSE Random Forest: {mse_rf:.2f}")
print(f"MSE Gradient Boosting: {mse_gb:.2f}")


Melhores parâmetros encontrados para cada modelo:
Melhores parâmetros Ridge: {'alpha': 0.01}
Melhores parâmetros MLP: {'activation': 'tanh', 'alpha': 1, 'hidden_layer_sizes': (128,), 'learning_rate': 'constant', 'solver': 'lbfgs'}
Melhores parâmetros Random Forest: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
Melhores parâmetros Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 7, 'min_samples_split': 2, 'n_estimators': 200, 'subsample': 0.9}

MSE de cada modelo:
MSE Ridge: 143.72
MSE MLP: 150.27
MSE Random Forest: 116.81
MSE Gradient Boosting: 79.05


In [None]:
novo_texto = "Atuei com manutenção de sistemas legados escritos em C#, fazendo correções e pequenas melhorias em módulos existentes. E esenvolvi aplicações web com ASP.NET MVC, implementando controle de autenticação, integração com banco de dados via Entity Framework e padrões de projeto."
nova_tecnologia = "c#"

embedding_texto = model.encode([novo_texto])
embedding_tecnologia = model.encode([nova_tecnologia])

embedding_novo = np.concatenate([embedding_texto, embedding_tecnologia], axis=1)

score_ridge = best_ridge.predict(embedding_novo)[0]
score_mlp = best_mlp.predict(embedding_novo)[0]
score_rf = best_rf.predict(embedding_novo)[0]
score_gb = best_gb.predict(embedding_novo)[0]

print("Pontuações previstas para o novo texto:")
print(f"Ridge:             {score_ridge:.2f}")
print(f"MLP:               {score_mlp:.2f}")
print(f"Random Forest:     {score_rf:.2f}")
print(f"Gradient Boosting: {score_gb:.2f}")

Pontuações previstas para o novo texto:
Ridge:             71.44
MLP:               74.33
Random Forest:     74.67
Gradient Boosting: 81.35


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Pontuações previstas para o novo texto:
Ridge:             75.31
MLP:               75.44
Random Forest:     75.34
Gradient Boosting: 78.59

Erro Quadrático Médio (MSE) de cada modelo:
MSE Ridge: 206.11
MSE MLP: 207.98
MSE Random Forest: 163.72
MSE Gradient Boosting: 133.70

Melhores parâmetros encontrados para cada modelo:
Melhores parâmetros Ridge: {'alpha': 0.1}
Melhores parâmetros MLP: {'activation': 'relu', 'alpha': 0.1, 'hidden_layer_sizes': (64,), 'learning_rate': 'constant', 'solver': 'lbfgs'}
Melhores parâmetros Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Melhores parâmetros Gradient Boosting: {'learning_rate': 0.2, 'max_depth': 3, 'min_samples_split': 5, 'n_estimators': 100, 'subsample': 0.9}
