# Dataset: [Social Media Usage and Emotional Well-Being](https://www.kaggle.com/datasets/emirhanai/social-media-usage-and-emotional-well-being)
- Tópicos Especiais - Sistemas para Internet - IFPB
- Data: 10/03/2025
- Análise por Allan Alves Amâncio (feita no Google Colab)

Target: *Dominant_Emotion*

In [None]:
import kagglehub
import os
import numpy as np
import pandas as pd
from sklearn import metrics, tree
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
import time

# Directory where a previous run is expected to have cached version 1 of the
# dataset (kagglehub cache under /root); checked first so that re-running the
# cell does not trigger another download.
path = "/root/.cache/kagglehub/datasets/emirhanai/social-media-usage-and-emotional-well-being/versions/1"
file_path = os.path.join(path, "train.csv")

if not os.path.exists(file_path):
    print("Fazendo download do dataset...")
    path = kagglehub.dataset_download("emirhanai/social-media-usage-and-emotional-well-being")
    # The directory returned by kagglehub is not guaranteed to be the one
    # guessed above (different user/cache root, newer dataset version), so the
    # CSV path has to be rebuilt from it; otherwise read_csv would still look
    # at the stale location that was just found to be missing.
    file_path = os.path.join(path, "train.csv")
else:
    print("Dataset já baixado. Continuando...")

# Raw training table; every later step works on `df`.
df = pd.read_csv(file_path)

# Discard the columns that are not used as features.
df = df.drop(columns=['User_ID', 'Platform'])

# Keep only rows whose Gender holds one of the three expected categories.
# .copy() gives an independent frame so the column assignments below do not
# raise SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[df["Gender"].isin(["Male", "Female", "Non-binary"])].copy()

# Encode the target (Dominant_Emotion) as integer class ids.
label_encoder = LabelEncoder()
df["Dominant_Emotion"] = label_encoder.fit_transform(df["Dominant_Emotion"])

# One-hot encode Gender; every other column is passed through unchanged.
column_transformer_one_hot = make_column_transformer(
    (OneHotEncoder(), ["Gender"]),
    remainder='passthrough'
)

# Rescale every column except the target and Gender to the [0, 1] range.
# NOTE(review): the scaler is fit on all rows before the train/test split
# below, so the held-out rows influence the scaling ranges. Fit it on the
# training rows only if a strict hold-out estimate is required.
numeric_cols = df.columns.difference(["Dominant_Emotion", "Gender"])
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Apply the one-hot step and rebuild a DataFrame using the transformer's
# generated column names (passthrough columns get a "remainder__" prefix).
df_transformed = column_transformer_one_hot.fit_transform(df)
new_column_names = column_transformer_one_hot.get_feature_names_out()
df = pd.DataFrame(df_transformed, columns=new_column_names)

# Locate the target under its new name and restore its integer dtype.
dominant_emotion_col = [col for col in df.columns if "remainder__Dominant_Emotion" in col][0]
df[dominant_emotion_col] = df[dominant_emotion_col].astype(int)

# Emotion name -> encoded class id. Bound to a name so it can be inspected
# or reused; as a bare expression in the middle of a cell it had no effect.
emotion_label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Target vector and feature matrix.
y = df[dominant_emotion_col]
X = df.drop(columns=[dominant_emotion_col])

# Stratified 80/20 hold-out split. random_state=None means the split is
# different on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None, stratify=y)

print("\nALGORITMOS A EXECUTAR E COMPARAR")

print("\n1. Árvores de Decisão (Gini e Entropy):")

# Decision tree split on Gini impurity, with its own 10-fold shuffled splitter.
tree_model_gini = tree.DecisionTreeClassifier(criterion="gini")
kf_tree_model_gini = KFold(n_splits=10, shuffle=True, random_state=42)

# Decision tree split on entropy; the splitter uses the same seed, so both
# trees are evaluated on identical folds.
tree_model_entropy = tree.DecisionTreeClassifier(criterion="entropy")
kf_tree_model_entropy = KFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold accuracy of each tree over the full feature matrix X and target y.
scores_tree_model_gini = cross_val_score(
    tree_model_gini, X, y, scoring='accuracy', cv=kf_tree_model_gini
)
scores_tree_model_entropy = cross_val_score(
    tree_model_entropy, X, y, scoring='accuracy', cv=kf_tree_model_entropy
)

# Average over the folds, expressed as a percentage.
mean_accuracy_tree_model_gini = 100 * scores_tree_model_gini.mean()
mean_accuracy_tree_model_entropy = 100 * scores_tree_model_entropy.mean()

print(f"\nMédia de acurácia (10 folds) com Gini: {mean_accuracy_tree_model_gini:.2f}%")
print(f"\nMédia de acurácia (10 folds) com Entropy: {mean_accuracy_tree_model_entropy:.2f}%")

print("\n2. kNN (k igual a 5 e 10):")

# Brute-force Euclidean kNN with k = 5, plus its 10-fold shuffled splitter.
knn_model_five_neighbors = KNeighborsClassifier(
    n_neighbors=5, algorithm='brute', metric='euclidean'
)
kf_knn_model_five_neighbors = KFold(n_splits=10, shuffle=True, random_state=42)

# Same configuration with k = 10; the splitter uses the same seed, so both
# models are scored on identical folds.
knn_model_ten_neighbors = KNeighborsClassifier(
    n_neighbors=10, algorithm='brute', metric='euclidean'
)
kf_knn_model_ten_neighbors = KFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold accuracy of each model over the full feature matrix X and target y.
scores_knn_model_five_neighbors = cross_val_score(
    knn_model_five_neighbors, X, y, scoring='accuracy', cv=kf_knn_model_five_neighbors
)
scores_knn_model_ten_neighbors = cross_val_score(
    knn_model_ten_neighbors, X, y, scoring='accuracy', cv=kf_knn_model_ten_neighbors
)

# Average over the folds, expressed as a percentage.
mean_accuracy_knn_model_five_neighbors = 100 * scores_knn_model_five_neighbors.mean()
mean_accuracy_knn_model_ten_neighbors = 100 * scores_knn_model_ten_neighbors.mean()

print(f"\nMédia de acurácia (10 folds) com kNN (k = 5 vizinhos): {mean_accuracy_knn_model_five_neighbors:.2f}%")
print(f"\nMédia de acurácia (10 folds) com kNN (k = 10 vizinhos): {mean_accuracy_knn_model_ten_neighbors:.2f}%")

print("\n3. MLP com duas arquiteturas diferentes (e com funções de ativação 'Tanh' e 'ReLU' em cada arquitetura):")


def _timed_cross_val_accuracy(model, cv):
    """Cross-validate ``model`` on the module-level ``X`` and ``y`` and time it.

    Args:
        model: Unfitted scikit-learn classifier to evaluate.
        cv: Cross-validation splitter passed to ``cross_val_score``.

    Returns:
        A ``(scores, elapsed_seconds)`` tuple: the per-fold accuracy array
        returned by ``cross_val_score`` and the time the call took.
    """
    # perf_counter is a monotonic clock intended for measuring durations;
    # time.time() follows the system clock and can jump if it is adjusted.
    started = time.perf_counter()
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    return scores, time.perf_counter() - started


# Architecture 1: hidden layers of 64, 32 and 16 units.
# NOTE(review): this tanh model is the only one with max_iter=1200 and the
# learning rates differ between the four models; confirm that is intentional.
mlp_model_one_tanh = MLPClassifier(hidden_layer_sizes=(64,32,16), activation='tanh', max_iter=1200, early_stopping=True, alpha=0.001, learning_rate_init=0.03)
mlp_model_one_relu = MLPClassifier(hidden_layer_sizes=(64,32,16), activation='relu', max_iter=1500, early_stopping=True, alpha=0.001, learning_rate_init=0.028)

# Architecture 2: hidden layers of 128, 64 and 32 units.
mlp_model_two_tanh = MLPClassifier(hidden_layer_sizes=(128,64,32), activation='tanh', max_iter=1500, early_stopping=True, alpha=0.001, learning_rate_init=0.015)
mlp_model_two_relu = MLPClassifier(hidden_layer_sizes=(128,64,32), activation='relu', max_iter=1500, early_stopping=True, alpha=0.001, learning_rate_init=0.015)

# One splitter per model, all with the same seed, so every MLP is scored on
# identical folds.
kf_mlp_model_one_tanh = KFold(n_splits=10, shuffle=True, random_state=42)
kf_mlp_model_one_relu = KFold(n_splits=10, shuffle=True, random_state=42)
kf_mlp_model_two_tanh = KFold(n_splits=10, shuffle=True, random_state=42)
kf_mlp_model_two_relu = KFold(n_splits=10, shuffle=True, random_state=42)

# Each model is evaluated and reported immediately, so partial results are
# visible while the remaining (slow) runs are still in progress.
scores_mlp_model_one_tanh, elapsed_time_one_tanh = _timed_cross_val_accuracy(mlp_model_one_tanh, kf_mlp_model_one_tanh)
mean_accuracy_mlp_model_one_tanh = np.mean(scores_mlp_model_one_tanh) * 100
print("\nMédia de acurácia (10 folds) com MLP 1 (tanh): {:.2f}% (Tempo: {:.2f}s)".format(mean_accuracy_mlp_model_one_tanh, elapsed_time_one_tanh))

scores_mlp_model_one_relu, elapsed_time_one_relu = _timed_cross_val_accuracy(mlp_model_one_relu, kf_mlp_model_one_relu)
mean_accuracy_mlp_model_one_relu = np.mean(scores_mlp_model_one_relu) * 100
print("\nMédia de acurácia (10 folds) com MLP 1 (relu): {:.2f}% (Tempo: {:.2f}s)".format(mean_accuracy_mlp_model_one_relu, elapsed_time_one_relu))

scores_mlp_model_two_tanh, elapsed_time_two_tanh = _timed_cross_val_accuracy(mlp_model_two_tanh, kf_mlp_model_two_tanh)
mean_accuracy_mlp_model_two_tanh = np.mean(scores_mlp_model_two_tanh) * 100
print("\nMédia de acurácia (10 folds) com MLP 2 (tanh): {:.2f}% (Tempo: {:.2f}s)".format(mean_accuracy_mlp_model_two_tanh, elapsed_time_two_tanh))

scores_mlp_model_two_relu, elapsed_time_two_relu = _timed_cross_val_accuracy(mlp_model_two_relu, kf_mlp_model_two_relu)
mean_accuracy_mlp_model_two_relu = np.mean(scores_mlp_model_two_relu) * 100
print("\nMédia de acurácia (10 folds) com MLP 2 (relu): {:.2f}% (Tempo: {:.2f}s)".format(mean_accuracy_mlp_model_two_relu, elapsed_time_two_relu))

Dataset já baixado. Continuando...

ALGORITMOS A EXECUTAR E COMPARAR

1. Árvores de Decisão (Gini e Entropy):

Média de acurácia (10 folds) com Gini: 96.54%

Média de acurácia (10 folds) com Entropy: 96.32%

2. kNN (k igual a 5 e 10):

Média de acurácia (10 folds) com kNN (k = 5 vizinhos): 99.14%

Média de acurácia (10 folds) com kNN (k = 10 vizinhos): 95.24%
