In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Carregar os dados (substitua 'SuaÚltimaColuna' pelo nome real da última coluna)
file_path = './dados_funcionais_patenteadores.xlsx'
df = pd.read_excel(file_path)

# Selecione a coluna alvo e as variáveis de entrada
target_column = 'Classification regarding professional orientation'  # Altere para o nome da última coluna
X = df.drop(columns=[target_column])
y = df[target_column]

# Pré-processamento: normalização de variáveis numéricas e codificação de variáveis categóricas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(include=['float64', 'int64']).columns),  # Normaliza dados numéricos
        ('cat', OneHotEncoder(handle_unknown='ignore'), X.select_dtypes(include=['object']).columns)  # Codifica dados categóricos
    ])

# Divisão dos dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Criar o pipeline que aplica o pré-processamento e o modelo MLP
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('mlp', MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=42))
])

# Treinar o modelo
pipeline.fit(X_train, y_train)

# Fazer previsões no conjunto de teste
y_pred = pipeline.predict(X_test)

# Avaliar a acurácia do modelo
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy}')


Acurácia: 0.6296296296296297


In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score

# Carregar os dados
file_path = './dados_funcionais_patenteadores.xlsx'
df = pd.read_excel(file_path)

# Selecione a coluna alvo e as variáveis de entrada
target_column = 'Classification regarding professional orientation'  # Altere para o nome da última coluna
X = df.drop(columns=[target_column])
y = df[target_column]

# Pré-processamento: normalização de variáveis numéricas e codificação de variáveis categóricas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(include=['float64', 'int64']).columns),  # Normaliza dados numéricos
        ('cat', OneHotEncoder(handle_unknown='ignore'), X.select_dtypes(include=['object']).columns)  # Codifica dados categóricos
    ])

# Divisão dos dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Criar o pipeline que aplica o pré-processamento e o modelo MLP
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('mlp', MLPClassifier(hidden_layer_sizes=(128, 64, 32), max_iter=2000, random_state=42, solver='adam', activation='relu'))
])

# Treinar o modelo
pipeline.fit(X_train, y_train)

# Fazer previsões no conjunto de teste
y_pred = pipeline.predict(X_test)

# Avaliar a acurácia do modelo
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy}')


# Validar o modelo com cross-validation
cv_scores = cross_val_score(pipeline, X, y, cv=3, scoring='accuracy')
print(f"Acurácia média com validação cruzada: {cv_scores.mean()}")

# --- Exibir os resultados das previsões e perfis reais ---
# Criando uma tabela com as classes reais e previstas
results_df = pd.DataFrame({
    'Real': y_test,
    'Previsto': y_pred
})

# Exibir as primeiras 10 linhas
print("\nResultados das Previsões (Classes Reais vs Previsões):")
print(results_df.head(10))  # Exibe as 10 primeiras linhas para ver o resultado

# Exibir o número total de acertos para cada classe
print("\nNúmero de acertos por classe:")
print(results_df[results_df['Real'] == results_df['Previsto']].groupby('Real').size())

# Exibir as previsões erradas
print("\nPrevisões Erradas (Real vs Previsto):")
print(results_df[results_df['Real'] != results_df['Previsto']])


Acurácia: 0.5555555555555556
Acurácia média com validação cruzada: 0.5496828752642706

Resultados das Previsões (Classes Reais vs Previsões):
                      Real         Previsto
55   Pragmatic traditional  Entrepreneurial
40   Pragmatic traditional  Entrepreneurial
19         Entrepreneurial           Hybrid
31   Pragmatic traditional           Hybrid
115                 Hybrid           Hybrid
56                  Hybrid           Hybrid
69                  Hybrid  Entrepreneurial
105        Entrepreneurial  Entrepreneurial
81         Entrepreneurial  Entrepreneurial
26         Entrepreneurial           Hybrid

Número de acertos por classe:
Real
Entrepreneurial     5
Hybrid             10
dtype: int64

Previsões Erradas (Real vs Previsto):
                      Real               Previsto
55   Pragmatic traditional        Entrepreneurial
40   Pragmatic traditional        Entrepreneurial
19         Entrepreneurial                 Hybrid
31   Pragmatic traditional                

In [21]:
import tensorflow as tf
import numpy as np
import random

# Definir a semente para garantir reprodutibilidade
seed_value = 42

# Definir semente do Python
random.seed(seed_value)

# Definir semente do NumPy
np.random.seed(seed_value)

# Definir semente do TensorFlow
tf.random.set_seed(seed_value)

# Importações do código original
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Carregar os dados
df = pd.read_excel('./dados_funcionais_patenteadores.xlsx')

# Selecione a coluna alvo e as variáveis de entrada
X = df.drop(columns=['Nature of motivation'])
y = df['Nature of motivation']

# Selecionar apenas as colunas numéricas para normalização
X_numeric = X.select_dtypes(include=['float64', 'int64'])

# Normalização dos dados numéricos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)

# Selecionar as colunas categóricas para codificação
X_categorical = X.select_dtypes(include=['object'])

# Codificação das variáveis categóricas com OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)  # Corrigido para 'sparse_output=False'
X_encoded = encoder.fit_transform(X_categorical)

# Concatenar as variáveis numéricas normalizadas com as variáveis categóricas codificadas
X_final = np.concatenate((X_scaled, X_encoded), axis=1)

# Convertendo as variáveis alvo (y_train e y_test) para inteiros
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Dividir os dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X_final, y_encoded, test_size=0.2, random_state=42)

# --- Construção do Autoencoder ---
input_dim = X_train.shape[1]  # Número total de características de entrada
encoding_dim = 64  # Dimensão da camada comprimida (representação compacta)

# Definir o Autoencoder
input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(64, activation='relu')(encoded)
encoded = Dense(encoding_dim, activation='relu')(encoded)

decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(decoded)
decoded = Dense(input_dim, activation='sigmoid')(decoded)

# Modelo Autoencoder
autoencoder = Model(input_layer, decoded)

# Modelo Encoder (para obter a codificação comprimida)
encoder_model = Model(input_layer, encoded)

# Compilação do Autoencoder
autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')

# Treinar o Autoencoder
autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, validation_data=(X_test, X_test))

# --- Obter as Representações Comprimidas com o Encoder ---
X_train_encoded = encoder_model.predict(X_train)
X_test_encoded = encoder_model.predict(X_test)

# --- Construção do MLP ---
# O MLP agora vai usar as representações comprimidas como entrada
mlp_model = Sequential([
    Dense(128, activation='relu', input_dim=encoding_dim),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(len(np.unique(y_encoded)), activation='softmax')  # Para classificação multi-classe
])

# Compilar o modelo MLP
mlp_model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Treinar o MLP
mlp_model.fit(X_train_encoded, y_train, epochs=50, batch_size=32, validation_data=(X_test_encoded, y_test))

# Avaliar a acurácia do MLP
y_pred = mlp_model.predict(X_test_encoded)
y_pred_classes = y_pred.argmax(axis=-1)  # Converter as probabilidades para classes

# Calcular a acurácia
accuracy = accuracy_score(y_test, y_pred_classes)
print(f'Acurácia do modelo com Autoencoder + MLP: {accuracy}')


# --- Exibir os resultados das previsões ---
# Criando uma tabela com as classes reais e previstas
results_df = pd.DataFrame({
    'Real': label_encoder.inverse_transform(y_test),
    'Previsto': label_encoder.inverse_transform(y_pred_classes)
})

print("\nResultados das Previsões (Classes Reais vs Previsões):")
print(results_df.head(10))  # Exibe as 10 primeiras linhas para ver o resultado

# Exibir o número total de acertos para cada classe
print("\nNúmero de acertos por classe:")
print(results_df[results_df['Real'] == results_df['Previsto']].groupby('Real').size())

# Exibir as previsões erradas
print("\nPrevisões Erradas (Real vs Previsto):")
print(results_df[results_df['Real'] != results_df['Previsto']])


Epoch 1/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - loss: 0.2673 - val_loss: 0.2585
Epoch 2/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 0.2630 - val_loss: 0.2505
Epoch 3/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.2532 - val_loss: 0.2306
Epoch 4/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 0.2294 - val_loss: 0.1865
Epoch 5/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 0.1792 - val_loss: 0.1141
Epoch 6/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.1063 - val_loss: 0.0583
Epoch 7/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.0613 - val_loss: 0.0472
Epoch 8/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.0537 - val_loss: 0.0465
Epoch 9/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - accuracy: 0.3141 - loss: 1.2979 - val_accuracy: 0.2222 - val_loss: 1.1622
Epoch 2/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.4063 - loss: 1.0965 - val_accuracy: 0.2963 - val_loss: 1.1552
Epoch 3/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.4391 - loss: 1.0413 - val_accuracy: 0.3704 - val_loss: 1.0977
Epoch 4/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.4635 - loss: 1.0238 - val_accuracy: 0.4815 - val_loss: 1.0718
Epoch 5/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5115 - loss: 1.0182 - val_accuracy: 0.5926 - val_loss: 1.0630
Epoch 6/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5352 - loss: 0.9860 - val_accuracy: 0.5556 - val_loss: 1.0541
Epoch 7/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m