# Dataset:  [Social Media Usage and Emotional Well-Being](https://www.kaggle.com/datasets/emirhanai/social-media-usage-and-emotional-well-being)
- Tópicos Especiais - Sistemas para Internet - IFPB
- Data: 09/03/2025
- Análise por Allan Alves Amâncio
- IDE utilizada: Google Colab

Target: *Dominant_Emotion*

import kagglehub
import os
import numpy as np
import pandas as pd
from sklearn import metrics, tree
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler

# Expected kagglehub cache location of the dataset (version 1). The /root
# prefix presumably matches the Google Colab runtime; verify on other hosts.
path = "/root/.cache/kagglehub/datasets/emirhanai/social-media-usage-and-emotional-well-being/versions/1"
file_path = os.path.join(path, "train.csv")

if not os.path.exists(file_path):
    print("Fazendo download do dataset...")
    path = kagglehub.dataset_download("emirhanai/social-media-usage-and-emotional-well-being")
    # dataset_download returns the directory where the files were actually
    # stored, which need not equal the hard-coded guess above (different cache
    # root or dataset version). Rebuild the CSV path from it so the read below
    # does not point at the stale, non-existent location.
    file_path = os.path.join(path, "train.csv")
else:
    print("Dataset já baixado. Continuando...")

# Load train.csv into the working DataFrame used by the rest of the script.
df = pd.read_csv(file_path)

# Remove the identifier and platform columns before modelling.
df = df.drop(columns=["User_ID", "Platform"])

# Keep only the rows whose Gender holds one of the three expected labels.
df = df.loc[df["Gender"].isin(["Male", "Female", "Non-binary"])]

# Encode the target labels as integers; the fitted encoder is kept so the
# label <-> code mapping can be recovered later.
label_encoder = LabelEncoder()
label_encoder.fit(df["Dominant_Emotion"])
df["Dominant_Emotion"] = label_encoder.transform(df["Dominant_Emotion"])

# Min-max scale every column except the target and the categorical Gender.
# NOTE(review): the scaler is fitted on the full dataset, before any
# train/test split or cross-validation fold is formed.
numeric_cols = df.columns.difference(["Dominant_Emotion", "Gender"])
scaler = MinMaxScaler()
scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

# One-hot encode Gender; all remaining columns pass through untouched.
column_transformer_one_hot = make_column_transformer(
    (OneHotEncoder(), ["Gender"]),
    remainder="passthrough",
)

# Rebuild the DataFrame from the transformed matrix, labelled with the
# transformer's generated feature names.
df_transformed = column_transformer_one_hot.fit_transform(df)
new_column_names = column_transformer_one_hot.get_feature_names_out()
df = pd.DataFrame(data=df_transformed, columns=new_column_names)

# Find the target column under the name the column transformer gave it
# (first column whose name contains the passthrough-prefixed target name).
dominant_emotion_col = df.columns[
    df.columns.str.contains("remainder__Dominant_Emotion", regex=False)
][0]
# Cast the target back to integer class codes after the DataFrame rebuild.
df[dominant_emotion_col] = df[dominant_emotion_col].astype(int)

# Notebook-style bare expression: emotion label -> encoded integer mapping.
# Its value is only shown when evaluated as the last expression of a cell.
{
    label: code
    for label, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}

# Separate the feature matrix from the target vector.
X = df.drop(columns=[dominant_emotion_col])
y = df[dominant_emotion_col]

# Stratified 80/20 hold-out split; random_state=None means the split differs
# from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=None,
    stratify=y,
)

print("\nÁrvores de Decisão (Gini e Entropy):")

# Shuffled 10-fold splitter with a fixed seed so the folds are repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Decision tree that uses Gini impurity as its split criterion.
tree_model_gini = tree.DecisionTreeClassifier(criterion="gini")

# Accuracy of the tree on each fold, evaluated over the full X / y.
scores = cross_val_score(
    estimator=tree_model_gini,
    X=X,
    y=y,
    scoring="accuracy",
    cv=kf,
)

# Average fold accuracy expressed as a percentage.
mean_accuracy = scores.mean() * 100

print("\nMédia de acurácia (10 folds) com Gini: {:.2f}%".format(mean_accuracy))