# Dataset:  [Social Media Usage and Emotional Well-Being](https://www.kaggle.com/datasets/emirhanai/social-media-usage-and-emotional-well-being)
- Tópicos Especiais - Sistemas para Internet - IFPB
- Data: 11/03/2025
- Análise por Allan Alves Amâncio (no Google Colab)

Target: *Dominant_Emotion*

In [None]:
import kagglehub
import os
import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn import metrics, tree
from sklearn.compose import make_column_transformer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, confusion_matrix
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
import time

# ---------------------------------------------------------------------------
# Data loading and preprocessing
#
# Produces the module-level names used by every later section:
#   df            - fully numeric frame (one-hot Gender + scaled features + target)
#   X, y          - feature matrix and integer-encoded Dominant_Emotion target
#   X_train, ...  - a stratified 80/20 hold-out split of X / y
# ---------------------------------------------------------------------------

# Directory where this script expects kagglehub to have cached the dataset.
path = "/root/.cache/kagglehub/datasets/emirhanai/social-media-usage-and-emotional-well-being/versions/1"
file_path = os.path.join(path, "train.csv")

if not os.path.exists(file_path):
    print("Fazendo download do dataset...")
    path = kagglehub.dataset_download("emirhanai/social-media-usage-and-emotional-well-being")
    # The download directory returned by kagglehub is not guaranteed to equal
    # the hard-coded path above, so the CSV path must be rebuilt from it;
    # otherwise read_csv below would still look at the stale location.
    file_path = os.path.join(path, "train.csv")
else:
    print("Dataset já baixado. Continuando...")

df = pd.read_csv(file_path)

# Identifier and platform columns are not used as features.
df = df.drop(columns=['User_ID', 'Platform'])

# Keep only rows whose Gender holds one of the three expected categories.
# .copy() makes the filtered frame independent so the column assignments
# below do not raise pandas' SettingWithCopyWarning.
df = df[df["Gender"].isin(["Male", "Female", "Non-binary"])].copy()

# Encode the target classes as consecutive integers.
label_encoder = LabelEncoder()
df["Dominant_Emotion"] = label_encoder.fit_transform(df["Dominant_Emotion"])

# One-hot encode Gender; every other column is passed through unchanged.
column_transformer_one_hot = make_column_transformer(
    (OneHotEncoder(), ["Gender"]),
    remainder='passthrough'
)

# Scale every feature column (everything except the target and Gender).
# NOTE(review): the scaler is fitted on the full dataset before any
# train/test or cross-validation split, so fold statistics leak into the
# scaling. Consider moving it into a per-fold Pipeline.
numeric_cols = df.columns.difference(["Dominant_Emotion", "Gender"])
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Rebuild a DataFrame from the transformer output, using the generated
# feature names (passthrough columns get a "remainder__" prefix).
df_transformed = column_transformer_one_hot.fit_transform(df)
new_column_names = column_transformer_one_hot.get_feature_names_out()
df = pd.DataFrame(df_transformed, columns=new_column_names)

# Locate the passthrough target column and restore its integer dtype.
dominant_emotion_col = [col for col in df.columns if "remainder__Dominant_Emotion" in col][0]
df[dominant_emotion_col] = df[dominant_emotion_col].astype(int)

# Class name -> integer code mapping. Originally a bare expression whose value
# was discarded when run as a script; now kept and printed for reference.
emotion_label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(emotion_label_mapping)

y = df[dominant_emotion_col]
X = df.drop(columns=[dominant_emotion_col])

# Stratified 80/20 hold-out split. random_state=None means the split differs
# between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None, stratify=y)

print("\nALGORITMOS A EXECUTAR E COMPARAR")

# One shuffled 10-fold splitter with a fixed seed, shared by the sections below.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

print("\n1. Árvores de Decisão (Gini e Entropy):")

# Two decision trees that differ only in their split criterion.
tree_model_gini, tree_model_entropy = [
    tree.DecisionTreeClassifier(criterion=split_criterion)
    for split_criterion in ("gini", "entropy")
]

# Per-fold accuracy for each tree, evaluated in the order gini -> entropy.
scores_tree_model_gini, scores_tree_model_entropy = [
    cross_val_score(candidate_tree, X, y, cv=kf, scoring='accuracy')
    for candidate_tree in (tree_model_gini, tree_model_entropy)
]

# Mean accuracy across folds, expressed as a percentage.
mean_accuracy_tree_model_gini = scores_tree_model_gini.mean() * 100
mean_accuracy_tree_model_entropy = scores_tree_model_entropy.mean() * 100

print(f"\nMédia de acurácia (10 folds) com Gini: {mean_accuracy_tree_model_gini:.2f}%")
print(f"\nMédia de acurácia (10 folds) com Entropy: {mean_accuracy_tree_model_entropy:.2f}%")

print("\n2. kNN (k igual a 5 e 10):")

# Both kNN classifiers use brute-force Euclidean search; only k differs.
_knn_shared_options = {"metric": 'euclidean', "algorithm": 'brute'}
knn_model_five_neighbors = KNeighborsClassifier(n_neighbors=5, **_knn_shared_options)
knn_model_ten_neighbors = KNeighborsClassifier(n_neighbors=10, **_knn_shared_options)

# Per-fold accuracy for each classifier on the shared 10-fold splitter.
scores_knn_model_five_neighbors, scores_knn_model_ten_neighbors = [
    cross_val_score(candidate_knn, X, y, cv=kf, scoring='accuracy')
    for candidate_knn in (knn_model_five_neighbors, knn_model_ten_neighbors)
]

# Mean accuracy across folds, expressed as a percentage.
mean_accuracy_knn_model_five_neighbors = scores_knn_model_five_neighbors.mean() * 100
mean_accuracy_knn_model_ten_neighbors = scores_knn_model_ten_neighbors.mean() * 100

print(f"\nMédia de acurácia (10 folds) com kNN (k = 5 vizinhos): {mean_accuracy_knn_model_five_neighbors:.2f}%")
print(f"\nMédia de acurácia (10 folds) com kNN (k = 10 vizinhos): {mean_accuracy_knn_model_ten_neighbors:.2f}%")

print("\n3. MLP com duas arquiteturas diferentes (e com funções de ativação 'Tanh' e 'ReLU' em cada arquitetura):")


def _timed_cv_accuracy(model, features, target, cv):
    """Cross-validate *model* and time the evaluation.

    Args:
        model: Unfitted scikit-learn classifier to evaluate.
        features: Feature matrix passed to ``cross_val_score``.
        target: Target vector passed to ``cross_val_score``.
        cv: Cross-validation splitter.

    Returns:
        A ``(fold_scores, mean_accuracy_percent, elapsed_seconds)`` tuple.
    """
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() can.
    started = time.perf_counter()
    fold_scores = cross_val_score(model, features, target, cv=cv, scoring='accuracy')
    elapsed_seconds = time.perf_counter() - started
    return fold_scores, np.mean(fold_scores) * 100, elapsed_seconds


# Hyperparameters shared by all four networks.
_mlp_shared_options = {"max_iter": 1500, "early_stopping": True, "alpha": 0.001}

# Architecture 1: hidden layers (64, 32, 16), initial learning rate 0.03.
mlp_model_one_tanh = MLPClassifier(hidden_layer_sizes=(64,32,16), activation='tanh', learning_rate_init=0.03, **_mlp_shared_options)
mlp_model_one_relu = MLPClassifier(hidden_layer_sizes=(64,32,16), activation='relu', learning_rate_init=0.03, **_mlp_shared_options)

# Architecture 2: hidden layers (128, 64, 32), initial learning rate 0.015.
mlp_model_two_tanh = MLPClassifier(hidden_layer_sizes=(128,64,32), activation='tanh', learning_rate_init=0.015, **_mlp_shared_options)
mlp_model_two_relu = MLPClassifier(hidden_layer_sizes=(128,64,32), activation='relu', learning_rate_init=0.015, **_mlp_shared_options)

scores_mlp_model_one_tanh, mean_accuracy_mlp_model_one_tanh, elapsed_time_one_tanh = _timed_cv_accuracy(mlp_model_one_tanh, X, y, kf)
print(f"\nMédia de acurácia (10 folds) com MLP 1 (tanh): {mean_accuracy_mlp_model_one_tanh:.2f}% (Tempo: {elapsed_time_one_tanh:.2f}s)")

scores_mlp_model_one_relu, mean_accuracy_mlp_model_one_relu, elapsed_time_one_relu = _timed_cv_accuracy(mlp_model_one_relu, X, y, kf)
print(f"\nMédia de acurácia (10 folds) com MLP 1 (relu): {mean_accuracy_mlp_model_one_relu:.2f}% (Tempo: {elapsed_time_one_relu:.2f}s)")

scores_mlp_model_two_tanh, mean_accuracy_mlp_model_two_tanh, elapsed_time_two_tanh = _timed_cv_accuracy(mlp_model_two_tanh, X, y, kf)
print(f"\nMédia de acurácia (10 folds) com MLP 2 (tanh): {mean_accuracy_mlp_model_two_tanh:.2f}% (Tempo: {elapsed_time_two_tanh:.2f}s)")

scores_mlp_model_two_relu, mean_accuracy_mlp_model_two_relu, elapsed_time_two_relu = _timed_cv_accuracy(mlp_model_two_relu, X, y, kf)
print(f"\nMédia de acurácia (10 folds) com MLP 2 (relu): {mean_accuracy_mlp_model_two_relu:.2f}% (Tempo: {elapsed_time_two_relu:.2f}s)")

print("\n4. K-Means (K igual ao número de classes existente no problema):")

# One cluster per target class.
num_clusters = len(np.unique(y))
print(f"\nNúmero de clusters escolhido para K-Means: {num_clusters}")

ari_scores_kmeans_model = []
accuracies = []

for train_index, test_index in kf.split(X):
    # Fold-local names: the loop must not rebind the module-level
    # X_train / X_test / y_train / y_test hold-out split, which later code
    # still relies on.
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train = y.iloc[train_index].to_numpy()
    y_fold_test = y.iloc[test_index].to_numpy()

    kmeans_model = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    kmeans_model.fit(X_fold_train)

    y_pred_kmeans = kmeans_model.predict(X_fold_test)

    # ARI compares the two partitions directly and needs no label mapping.
    ari = adjusted_rand_score(y_fold_test, y_pred_kmeans)
    ari_scores_kmeans_model.append(ari)

    # Map each cluster to the majority class among its TRAINING members.
    # Deriving the mapping from the test labels would use the very labels
    # being scored and overstate the accuracy. A cluster with no training
    # member falls back to the fold's overall majority class.
    # np.bincount(...).argmax() resolves ties towards the smallest label.
    cluster_to_label = np.full(num_clusters, np.bincount(y_fold_train).argmax())
    for cluster_id in range(num_clusters):
        member_labels = y_fold_train[kmeans_model.labels_ == cluster_id]
        if member_labels.size:
            cluster_to_label[cluster_id] = np.bincount(member_labels).argmax()

    # Translate predicted cluster ids into class labels for the test rows.
    labels = cluster_to_label[y_pred_kmeans]

    accuracy = np.mean(labels == y_fold_test)
    accuracies.append(accuracy)

mean_ari_kmeans = np.mean(ari_scores_kmeans_model)
mean_accuracy_kmeans_model = np.mean(accuracies) * 100

print(f"\nMédia de ARI (10 folds): {mean_ari_kmeans:.4f}")
print(f"\nMédia de Acurácia (10 folds): {mean_accuracy_kmeans_model:.2f}%")

import matplotlib.pyplot as plt

def plot_mlp_training_loss(histories, labels):
    """Plot the loss curve of each given model on one shared figure.

    Args:
        histories: Iterable of objects exposing a ``loss_curve_`` attribute
            (the callers in this file pass fitted ``MLPClassifier`` models).
        labels: Legend entries, paired positionally with ``histories``.
    """
    plt.figure(figsize=(10, 6))

    # One line per model; zip stops at the shorter of the two inputs.
    for fitted_model, legend_entry in zip(histories, labels):
        loss_per_iteration = fitted_model.loss_curve_
        plt.plot(loss_per_iteration, label=legend_entry)

    plt.xlabel("Épocas")
    plt.ylabel("Erro de Treinamento")
    plt.title("Taxa de Erro de Treinamento por Época (MLPs)")
    plt.legend()
    plt.grid()
    plt.show()

# Fresh instances of the four MLP configurations, fitted once each so their
# loss curves can be plotted together.
_plot_mlp_shared_options = {"max_iter": 1500, "early_stopping": True, "alpha": 0.001}
_small_architecture = {"hidden_layer_sizes": (64,32,16), "learning_rate_init": 0.03}
_large_architecture = {"hidden_layer_sizes": (128,64,32), "learning_rate_init": 0.015}

mlp_one_tanh = MLPClassifier(activation='tanh', **_small_architecture, **_plot_mlp_shared_options)
mlp_one_relu = MLPClassifier(activation='relu', **_small_architecture, **_plot_mlp_shared_options)
mlp_two_tanh = MLPClassifier(activation='tanh', **_large_architecture, **_plot_mlp_shared_options)
mlp_two_relu = MLPClassifier(activation='relu', **_large_architecture, **_plot_mlp_shared_options)

# NOTE(review): X_train / y_train are plain module-level names; confirm that
# no code executed before this point has rebound them to a different split
# than the one intended here.
for _network in (mlp_one_tanh, mlp_one_relu, mlp_two_tanh, mlp_two_relu):
    _network.fit(X_train, y_train)

plot_mlp_training_loss(
    [mlp_one_tanh, mlp_one_relu, mlp_two_tanh, mlp_two_relu],
    labels=["MLP 1 (tanh)", "MLP 1 (relu)", "MLP 2 (tanh)", "MLP 2 (relu)"],
)

print("\nTabela das taxas de acerto e erro dos algoritmos")

# (display name, mean accuracy in percent) for every evaluated algorithm,
# in the order the rows should appear in the table.
_summary_rows = [
    ("Árvore de Decisão (Gini)", mean_accuracy_tree_model_gini),
    ("Árvore de Decisão (Entropy)", mean_accuracy_tree_model_entropy),
    ("kNN (k=5)", mean_accuracy_knn_model_five_neighbors),
    ("kNN (k=10)", mean_accuracy_knn_model_ten_neighbors),
    ("MLP 1 (tanh)", mean_accuracy_mlp_model_one_tanh),
    ("MLP 1 (relu)", mean_accuracy_mlp_model_one_relu),
    ("MLP 2 (tanh)", mean_accuracy_mlp_model_two_tanh),
    ("MLP 2 (relu)", mean_accuracy_mlp_model_two_relu),
    ("K-Means", mean_accuracy_kmeans_model),
]

# The error rate is the complement of the accuracy percentage.
data = {
    "Algoritmo": [algorithm_name for algorithm_name, _ in _summary_rows],
    "Acurácia Média (%)": [mean_accuracy for _, mean_accuracy in _summary_rows],
    "Taxa de Erro (%)": [100 - mean_accuracy for _, mean_accuracy in _summary_rows],
}

results_df = pd.DataFrame(data)
print(results_df)