In [1]:
import os
import gc
import re
import json
import time
import torch
import warnings
import itertools
import numpy as np
import pandas as pd
import torch.nn as nn
from datetime import date
from typing import List, Set
import torch.nn.functional as F
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score, recall_score, precision_score


DATE_LIMIT = date(2023, 12, 31)
BASE_PATH = os.path.dirname(os.getcwd())
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

USER_DATA_READ=f"{BASE_PATH}/data/users-details-2023.csv"
USER_DATA_SAVE=f"{BASE_PATH}/data/users.parquet"

ANIME_DATA_READ = f"{BASE_PATH}/data/anime-dataset-2023.csv"
ANIME_DATA_SAVE = f"{BASE_PATH}/data/animes.parquet"

SCORE_DATA_READ = f"{BASE_PATH}/data/users-score-2023.csv"
SCORE_DATA_SAVE = f"{BASE_PATH}/data/scores.parquet"

FINAL_DATASET_CUT6_BASIC_USER_DATA = f"{BASE_PATH}/data/scores-cut6-basic.parquet"
FINAL_DATASET_CUT7_BASIC_USER_DATA = f"{BASE_PATH}/data/scores-cut7-basic.parquet"
FINAL_DATASET_CUT8_BASIC_USER_DATA = f"{BASE_PATH}/data/scores-cut8-basic.parquet"

FINAL_DATASET_CUT6_FULL_USER_DATA = f"{BASE_PATH}/data/scores-cut6-full.parquet"
FINAL_DATASET_CUT7_FULL_USER_DATA = f"{BASE_PATH}/data/scores-cut7-full.parquet"
FINAL_DATASET_CUT8_FULL_USER_DATA = f"{BASE_PATH}/data/scores-cut8-full.parquet"

EXPERIMENT_LOG = f"{BASE_PATH}/data/experiment-log.txt"
RESULTS_DIR = f"{BASE_PATH}/data/results"

warnings.filterwarnings("ignore", category=UserWarning)
torch.backends.cudnn.deterministic = True

In [2]:
class BaseReader:
    def __init__(self, read_path: str, save_path: str):
        self.file_path = read_path
        self.save_path = save_path

    def to_parquet(self, df: pd.DataFrame) -> None:
        df.to_parquet(self.save_path, index=False)

    def get_stats(self, df: pd.DataFrame, columns: List[str]) -> dict:
        result = dict()
        for c in columns:
            result[c] = {
                "hist": df[c].value_counts(dropna=False).to_dict(),
                "max": df[c].max(skipna=True) if df[c].dtype != "O" else 0,
                "mean": df[c].mean(skipna=True) if df[c].dtype != "O" else 0,
                "median": df[c].median(skipna=True) if df[c].dtype != "O" else 0,
                "min": df[c].min(skipna=True) if df[c].dtype != "O" else 0
            }

        return result
    
    def show_stats(self, result: dict) -> None:
        for column in result.keys():
            # Exibe estatísticas descritivas básicas
            print(f"Estatística descritiva de \"{column}\"")
            print(f"Mínimo: {result[column]["min"]}")
            print(f"Média: {result[column]["mean"]}")
            print(f"Mediana: {result[column]["median"]}")
            print(f"Máximo: {result[column]["max"]}")

            # Avalia a quantidade de nulos
            count = 0
            null = 0
            for k in result[column]["hist"].keys():
                count = count + result[column]["hist"][k]
                if type(k) == float and np.isnan(k):
                    null = result[column]["hist"][k]
            percent = round(null * 100 / count, 2) if count > 0 else 0
            print(f"Quantidade de nulos: {null} ({percent}%)")

            # Exibe uma linha de separação
            print("*" * 40, "\n")

In [3]:
class UserReader(BaseReader):
    def __init__(self, read_path: str, save_path: str):
        super().__init__(read_path, save_path)

    def first_process(self) -> pd.DataFrame:
        # Carrega os dados, removendo colunas não utilizadas
        remove_columns = [
            "Username", "Location", "Joined",
            "On Hold", "Plan to Watch", "Rewatched"
        ]
        df = pd.read_csv(self.file_path).drop(remove_columns, axis=1)

        # Faz a troca de gênero definindo Male = 0 e Female = 1
        def clear_gender(value: str) -> int:
            if type(value) != str:
                return None
            return 0 if value.upper() == "MALE" else 1
        df["Gender"] = df["Gender"].apply(clear_gender)

        # Faz a conversão da data de nascimento na idade
        def get_age(birth_date: str | float):
            if type(birth_date) != str:
                return None
            return int((DATE_LIMIT - date.fromisoformat(birth_date.split("T")[0])).days / 365)
        df["age"] = df["Birthday"].apply(get_age)
        df = df.drop(["Birthday"], axis=1)

        # Faz a troca de nomes de colunas
        df = df.rename(columns={
            "Mal ID": "user_id",
            "Gender": "gender",
            "Days Watched": "days_spent_with_anime",
            "Mean Score": "mean_score",
            "Watching": "current_anime_wathing",
            "Completed": "total_anime_watched",
            "Dropped": "dropped_anime",
            "Total Entries": "anime_in_list",
            "Episodes Watched": "episodes_watched"
        })

        # Salva o arquivo limpo
        return df

    def remove_nulls(self, df: pd.DataFrame) -> pd.DataFrame:
        original_rows = len(df)
        df = df.dropna()
        new_rows = len(df)
        percent = round((original_rows - new_rows) * 100 / original_rows, 2)
        print(f"Remoção de {original_rows - new_rows} linhas ({percent}%)")
        return df

In [4]:
def execute_user_analysis():
    reader = UserReader(USER_DATA_READ, USER_DATA_SAVE)
    df_user = reader.first_process()

    stats = reader.get_stats(
        df_user,
        [
            "gender", "days_spent_with_anime", "mean_score",
            "current_anime_wathing", "total_anime_watched",
            "dropped_anime", "anime_in_list", "episodes_watched", "age"
        ]
    )
    reader.show_stats(stats)

    df_user = reader.remove_nulls(df_user)
    reader.to_parquet(df_user)

In [5]:
# execute_user_analysis()
# gc.collect()

In [6]:
class AnimeReader(BaseReader):
    def __init__(self, read_path: str, save_path: str):
        super().__init__(read_path, save_path)

    def first_process(self) -> pd.DataFrame:
        # Carrega dados
        df = pd.read_csv(self.file_path)

        # Remove colunas não utilizadas
        use_columns = ["anime_id", "Genres", "Episodes", "Source", "Duration"]
        df = df[use_columns]

        # Faz a conversão do texto de duração para o valor numérico
        def extract_duration(description: str):
            if description.upper() == "UNKNOWN":
                return np.nan
            numbers = re.findall(r"[0-9]+", description)
            if len(numbers) == 2:
                return int(numbers[0]) * 60 + int(numbers[1])
            else:
                return int(numbers[0])
        df["Duration"] = df["Duration"].apply(extract_duration)

        # Converte o número de episódios em números e remove nulos
        df["Episodes"] = df["Episodes"].apply(lambda x: float(x) if x.upper() != "UNKNOWN" else np.nan).astype("float64")

        # Aplica uma padronização nos nomes dos materiais originais
        def standard_source(source: str):
            conv_source = {
                "4-koma manga": "manga",
                "Book": "book",
                "Card game": "game",
                "Game": "game",
                "Light novel": "novel",
                "Manga": "manga",
                "Mixed media": "other",
                "Music": "other",
                "Novel": "novel",
                "Original": "original",
                "Other": "other",
                "Picture book": "other",
                "Radio": "other",
                "Unknown": np.nan,
                "Visual novel": "visual_novel",
                "Web manga": "manga",
                "Web novel": "novel"
            }
            try:
                return conv_source[source]
            except:
                return np.nan
        df["Source"] = df["Source"].apply(standard_source)

        # Resolve nomenclatura de gêneros
        df["Genres"] = df["Genres"].apply(lambda x: np.nan if x == "UNKNOWN" else x)

        # Faz a troca dos nomes das colunas
        df = df.rename(columns={
            "Genres": "genres",
            "Episodes": "episodes",
            "Source": "source",
            "Duration": "duration"
        })

        return df
    
    def remove_nulls(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.dropna(subset=["source", "duration", "episodes"])

In [7]:
def execute_anime_analysis():
    anime_reader = AnimeReader(ANIME_DATA_READ, ANIME_DATA_SAVE)

    df_anime = anime_reader.first_process()
    stats = anime_reader.get_stats(
        df_anime,
        ["genres", "episodes", "source", "duration"]
    )
    anime_reader.show_stats(stats)
    df_anime = anime_reader.remove_nulls(df_anime)
    anime_reader.to_parquet(df_anime)

In [8]:
# execute_anime_analysis()
# gc.collect()

In [9]:
class ScoreReader(BaseReader):
    def __init__(self, read_path: str, save_path: str, user_path: str, anime_path: str):
        super().__init__(read_path, save_path)
        self.anime_path = anime_path
        self.user_path = user_path

    def make_dataset(self, rating_cut=7, user_merge_mode=1) -> pd.DataFrame:
        # Verifica integridade dos parâmetros
        if rating_cut > 10 or rating_cut < 1:
            raise Exception("O corte da classificação deve ser entre 1 e 10")
        
        if user_merge_mode not in [1, 2]:
            raise Exception("O modo de merge de usuário deve ser 1 ou 2")
        
        # Carrega os dados dos scores, limpando as colunas não utilizadas
        df = pd.read_csv(self.file_path)
        df = df.drop(["Username", "Anime Title"], axis=1)

        # Carrega os dados de usuários e animes
        users = pd.read_parquet(self.user_path)
        animes = pd.read_parquet(self.anime_path)

        # Recupera todos os gêneros possíveis
        genres = [[s.strip() for s in g.split(",")] for g in animes["genres"].values if g is not None]
        genres: Set[str] = set(itertools.chain.from_iterable(genres))

        # Define a função de verificação de gênero
        # Os dados de gêneros são carregados como uma string,
        # com as categorias separadas por vírgula
        def verify_genre(genres: str | None, genre: str) -> int:
            if genres is None:
                return 0
            
            genres = [s.lower().strip() for s in genres.split(",")]
            return 1 if genre.lower() in genres else 0

        # Aplica o encoder para gêneros de animes
        for genre in genres:
            column = f"genre_{"_".join(genre.lower().split(" "))}"
            animes[column] = animes["genres"].apply(lambda x: verify_genre(x, genre))
        animes = animes.drop(["genres"], axis=1)

        # Define um encoder para o material original do anime
        encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        encoder.fit(animes[["source"]])

        # Atualiza os dados de anime com o encoder de material original
        encoder_df = pd.DataFrame(
            encoder.transform(animes[["source"]]),
            columns=encoder.get_feature_names_out()
        )
        animes = pd.concat((animes, encoder_df), axis=1)
        animes = animes.drop(["source"], axis=1)

        # Executa o merge com os dados de usuários
        # user_merge_mode = 1 faz com que apenas os dados básicos sejam usados
        # user_merge_mode = 2 utiliza todos os dados de usuários
        if user_merge_mode == 1:
            users = users[["user_id", "gender", "age"]]

        if user_merge_mode == 2:
            users = users[["user_id", "gender", "age", "days_spent_with_anime", "total_anime_watched", "dropped_anime", "mean_score"]]
        
        df = df.merge(users, how="inner", on="user_id")

        # Executa o merge com os dados de animes
        df = df.merge(animes, how="inner", on="anime_id")

        # Faz a criação da coluna target
        df["target"] = df["rating"].apply(lambda x: 1 if x > rating_cut else 0)
        df = df.drop(["rating"], axis=1)

        # Finaliza o processo, removendo colunas de ID
        df = df.drop(["user_id", "anime_id"], axis=1)
        return df

In [10]:
def create_datasets():
    result_files = [
        FINAL_DATASET_CUT6_BASIC_USER_DATA,
        FINAL_DATASET_CUT7_BASIC_USER_DATA,
        FINAL_DATASET_CUT8_BASIC_USER_DATA,
        FINAL_DATASET_CUT6_FULL_USER_DATA,
        FINAL_DATASET_CUT7_FULL_USER_DATA,
        FINAL_DATASET_CUT8_FULL_USER_DATA
    ]

    for save_path in result_files:
        score_reader = ScoreReader(
            SCORE_DATA_READ,
            save_path,
            USER_DATA_SAVE,
            ANIME_DATA_SAVE
        )
        scores = score_reader.make_dataset()
        score_reader.to_parquet(scores)

        del scores
        gc.collect()

In [11]:
# create_datasets()
# gc.collect()

# Modelo

In [12]:
class Model4Layers(nn.Module):
    def __init__(self, n_features: int, n_classes=2, n_neurons=16):
        super().__init__()
        self.fc1 = nn.Linear(n_features, n_neurons)
        self.fc2 = nn.Linear(n_neurons, n_neurons)
        self.fc3 = nn.Linear(n_neurons, n_neurons)
        self.fc4 = nn.Linear(n_neurons, n_classes)
        self.activation = nn.ReLU()
        self.out = nn.Softmax()

    def forward(self, x):
        x = self.fc1(x)
        x = self.activation(x)
        x = self.fc2(x)
        x = self.activation(x)
        x = self.fc3(x)
        x = self.activation(x)
        x = self.fc4(x)
        x = self.out(x)
        return x
    

class Model8Layers(nn.Module):
    def __init__(self, n_features: int, n_classes=2, n_neurons=16):
        super().__init__()
        self.fc1 = nn.Linear(n_features, n_neurons)
        self.fc2 = nn.Linear(n_neurons, n_neurons)
        self.fc3 = nn.Linear(n_neurons, n_neurons)
        self.fc4 = nn.Linear(n_neurons, n_neurons)
        self.fc5 = nn.Linear(n_neurons, n_neurons)
        self.fc6 = nn.Linear(n_neurons, n_neurons)
        self.fc7 = nn.Linear(n_neurons, n_neurons)
        self.fc8 = nn.Linear(n_neurons, n_classes)
        self.activation = nn.ReLU()
        self.out = nn.Softmax()

    def forward(self, x):
        x = self.fc1(x)
        x = self.activation(x)
        x = self.fc2(x)
        x = self.activation(x)
        x = self.fc3(x)
        x = self.activation(x)
        x = self.fc4(x)
        x = self.activation(x)
        x = self.fc5(x)
        x = self.activation(x)
        x = self.fc6(x)
        x = self.activation(x)
        x = self.fc7(x)
        x = self.activation(x)
        x = self.fc8(x)
        x = self.out(x)
        return x

In [13]:
class Manager:
    def __init__(self):
        pass

    def get_dataset(self, read_path: str, sample=0.2):
        # Carrega dados processados
        df = pd.read_parquet(read_path).sample(frac=sample)
        X = df.drop(["target"], axis=1).values
        y = df["target"].values

        # Faz a divisão entre treino e teste
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # Aplica a padronização de valores
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Faz a transformação de numpy array para tensor
        X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train, dtype=torch.long)
        X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
        y_test_tensor = torch.tensor(y_test, dtype=torch.long)

        # Instancia dataset de tensores
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

        # Instancia loader de tensores
        train_loader = DataLoader(train_dataset, batch_size=1000, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)

        return train_dataset, test_dataset, train_loader, test_loader
    
    def execute(self, train_dataset: DataLoader, train_loader: DataLoader, epochs=100, n_neurons=16, arch=1):
        # Garante consistência da arquitetura
        if arch not in [1, 2]:
            raise Exception("As arquiteturas válidas são 1 e 2")
        
        # Regitra o tempo de início do treinamento
        start_time = time.time()

        # Carrega dados e inicializa modelo
        classes_ = len(train_dataset.tensors[1].unique())
        if arch == 1:
            model = Model4Layers(
                n_features=train_dataset.tensors[0].shape[1],
                n_classes=classes_,
                n_neurons=n_neurons
            )
        elif arch == 2:
            model = Model8Layers(
                n_features=train_dataset.tensors[0].shape[1],
                n_classes=classes_,
                n_neurons=n_neurons
            )

        # Define o modo de otimização
        optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
        criterion = nn.CrossEntropyLoss()

        for epoch in range(0, epochs):
            running_loss = 0.0
            running_corrects = 0

            model.train()
            count_batch = 0
            limit_batch = (train_dataset.tensors[0].shape[0] // train_loader.batch_size) + 1

            for inputs, labels in train_loader:
                percent = round(count_batch * 100 / limit_batch, 2)
                print(f"Epoch {epoch + 1} Batch {count_batch + 1} ({percent}%)", end="\r")
                inputs = inputs
                labels = labels

                # Inicia os gradientes e calcula a predição
                optimizer.zero_grad()
                outputs = model(inputs)
                pred_labels = torch.argmax(outputs, dim=1)

                # Teoricamente, seria preciso passar as labels do dataset para o
                # padrão one hot encoder, porém a camada softmax no modelo já
                # resolve isso.
                # oh_labels = F.one_hot(labels.long())
                # loss = criterion(outputs, torch.reshape(oh_labels, (oh_labels.size()[0], classes_)).float())
                loss = criterion(outputs, labels)

                # Calcula os gradientes e atualiza os pesos
                loss.backward()
                optimizer.step()

                # Fal a atualização das estatísticas de acompanhamento
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(pred_labels == labels.data).item()
                count_batch = count_batch + 1

            # Exibe estatísticas de acompanhamento
            num_samples = len(train_dataset)
            epoch_loss = running_loss / num_samples
            epoch_accuracy = running_corrects / num_samples
            print(f"Epoch {epoch + 1}: Loss {epoch_loss:.3f} Acurácia {epoch_accuracy:.3f}")

        # Calcula o tempo de execução do treinamento
        end_time = time.time()
        train_time = end_time - start_time

        return model, train_time, epoch_loss
    
    def compute_test(self, model: nn.Module, test_loader: DataLoader, train_time: float, train_loss: float) -> dict:
        # Define modelo como avaliação e inicia as listas de labels
        model.eval()
        pred_labels_all = []
        true_labels_all = []

        # Passa pelo loader para cálculo das predições
        for inputs, labels in test_loader:
            inputs = inputs.to(DEVICE)
            outputs = model(inputs)
            pred_labels = torch.argmax(outputs, dim=1)
            pred_labels_all.append(pred_labels)
            true_labels_all.append(labels)

        # Concatena os resultados
        pred_labels = torch.cat(pred_labels_all, dim=0).cpu().numpy()
        true_labels = torch.cat(true_labels_all, dim=0).numpy()

        # Registra dados no dicionário de dados
        return {
            "train_time": train_time,
            "train_loss": train_loss,
            "metrics": {
                "accuracy": (pred_labels == true_labels).mean(),
                "f1-score": f1_score(true_labels, pred_labels, pos_label=1, average="binary"),
                "recall-score": recall_score(true_labels, pred_labels, pos_label=1, average="binary"),
                "precission-score": precision_score(true_labels, pred_labels, pos_label=1, average="binary")
            }
        }

In [14]:
def execute_experiments():
    # Define os parâmetros dos experimentos
    archs_set = [1, 2]
    neurons_set = [16]
    sample_data = [0.1, 0.2]
    epochs = [10]
    repeat = 5
    data_paths = [
        FINAL_DATASET_CUT6_BASIC_USER_DATA,
        FINAL_DATASET_CUT7_BASIC_USER_DATA,
        FINAL_DATASET_CUT8_BASIC_USER_DATA,
        FINAL_DATASET_CUT6_FULL_USER_DATA,
        FINAL_DATASET_CUT7_FULL_USER_DATA,
        FINAL_DATASET_CUT8_FULL_USER_DATA
    ]

    # Verifica o log de experimentos
    if not os.path.exists(EXPERIMENT_LOG):
        with open(EXPERIMENT_LOG, "w") as file:
            file.write("dataset_type,arch,neurons,sample,epochs,iteration,weight_file,predict_file\n")

    # Função auxiliar: abre o log e verifica registros
    def verify(dataset_type: str, arch: int, neurons: int, sample: float, epochs: int, iteration: int):
        exist = False
        with open(EXPERIMENT_LOG, "r") as file:
            row = file.readline()
            while row:
                row_dataset_type, row_arch, row_neurons, row_sample, row_epochs, row_iteration, _, _ = row.split(",")
                row_params = [row_dataset_type, row_arch, row_neurons, row_sample, row_epochs, row_iteration]
                search_params = [str(dataset_type), str(arch), str(neurons), str(sample), str(epochs), str(iteration)]
                #print(row_params, search_params)
                
                if row_params == search_params:
                    exist = True
                    break

                row = file.readline()

        return exist

    for data_path in data_paths:
        # Tipo de dataset utilizado
        dataset_type = data_path.split("/")[-1].split(".")[0]

        for neurons in neurons_set:
            # Quantidade de neurônios das camadas internas

            for sample in sample_data:
                # Porção dos dados fracionados

                for epoch in epochs:
                    # Quantidade de épocas do treinamento

                    for arch in archs_set:
                        # Profundidade da rede

                        for i in range(0, repeat):
                            # Verifica se o experimento já foi executado
                            if verify(dataset_type, arch, neurons, sample, epoch, i):
                                continue

                            # Registra todos os dados do experimento
                            unique_name = int(time.time())
                            weight_path = f"{RESULTS_DIR}/{unique_name}.pth"
                            predict_path = f"{RESULTS_DIR}/{unique_name}.json"
                            experiment_data = [
                                    dataset_type,
                                    str(arch),
                                    str(neurons),
                                    str(sample),
                                    str(epoch),
                                    str(i),
                                    f"{unique_name}.pth",
                                    f"{unique_name}.json"
                                ]

                            # Log de execução
                            print(f"Execução do experimento {",".join(experiment_data[:-2])}".upper())

                            # Repetição do experimento
                            process = Manager()
                            train_dataset, test_dataset, train_loader, test_loader = process.get_dataset(data_path, sample=sample)
                            gc.collect()
                            model, train_time, train_loss = process.execute(train_dataset, train_loader, epochs=epoch, n_neurons=neurons, arch=arch)

                            # Executa o teste do modelo
                            results = process.compute_test(model, test_loader, train_time, train_loss)
                            
                            # Salva o json de métricas
                            with open(predict_path, "w+") as file:
                                file.write(json.dumps(results))

                            # Salva os pesos do modelo
                            torch.save(model.state_dict(), weight_path)

                            # Registra no log
                            with open(EXPERIMENT_LOG, "a") as file:
                                file.write(",".join(experiment_data) + "\n")

                            # Libera memória
                            del train_dataset, test_dataset, train_loader, test_loader
                            gc.collect()
                            print()

In [15]:
# execute_experiments()

# Análise

In [16]:
df = pd.read_csv(EXPERIMENT_LOG)
df

Unnamed: 0,dataset_type,arch,neurons,sample,epochs,iteration,weight_file,predict_file
0,scores-cut6-basic,1,16,0.1,10,0,1731500580.pth,1731500580.json
1,scores-cut6-basic,1,16,0.1,10,1,1731500656.pth,1731500656.json
2,scores-cut6-basic,1,16,0.1,10,2,1731500728.pth,1731500728.json
3,scores-cut6-basic,1,16,0.1,10,3,1731500799.pth,1731500799.json
4,scores-cut6-basic,1,16,0.1,10,4,1731500872.pth,1731500872.json
...,...,...,...,...,...,...,...,...
115,scores-cut8-full,2,16,0.2,10,0,1731513051.pth,1731513051.json
116,scores-cut8-full,2,16,0.2,10,1,1731513205.pth,1731513205.json
117,scores-cut8-full,2,16,0.2,10,2,1731513355.pth,1731513355.json
118,scores-cut8-full,2,16,0.2,10,3,1731513505.pth,1731513505.json


Primeira comparação: existe diferença entre utilizar dados básicos e metadados?

In [34]:
# Descreve fatores de análise
factors: Set[str] = set()
for dataset_type in df["dataset_type"].unique():
    for arch in df["arch"].unique():
        for sample in df["sample"].unique():
            dt = dataset_type.split("-")[1]
            factors.add(f"{dt}-{sample}-{arch}")

# Calcula as médias para cada fator
metrics = []
for factor in factors:
    dt, sample, arch = factor.split("-")
    basic = f"scores-{dt}-basic"
    full = f"scores-{dt}-full"

    metric = {
        "factor": factor,
        "basic_f1": [],
        "full_f1": []
    }

    subset = df.loc[
        (df["sample"] == float(sample))
        & (df["arch"] == int(arch))
    ]

    for _, row in subset.iterrows():
        with open(f"{RESULTS_DIR}/{row["predict_file"]}") as file:
            result = json.load(file)
            
        if row["dataset_type"] == basic:
            metric["basic_f1"].append(result["metrics"]["f1-score"])

        elif row["dataset_type"] == full:
            metric["full_f1"].append(result["metrics"]["f1-score"])

    metric["basic_f1"] = np.mean(metric["basic_f1"])
    metric["full_f1"] = np.mean(metric["full_f1"])
    metric["(full-basic) %"] = (metric["full_f1"] - metric["basic_f1"]) * 100 / metric["basic_f1"]

    metrics.append(metric)

metrics = pd.DataFrame(metrics)
metrics

Unnamed: 0,factor,basic_f1,full_f1,(full-basic) %
0,cut8-0.2-2,0.70796,0.70554,-0.341773
1,cut7-0.1-1,0.70668,0.704263,-0.342012
2,cut7-0.2-1,0.706703,0.704651,-0.290471
3,cut8-0.1-2,0.70742,0.703235,-0.591593
4,cut6-0.1-2,0.707292,0.700698,-0.932328
5,cut7-0.2-2,0.70521,0.709543,0.61446
6,cut8-0.2-1,0.707439,0.707936,0.070331
7,cut6-0.1-1,0.703058,0.709564,0.925313
8,cut6-0.2-1,0.70705,0.704225,-0.399524
9,cut7-0.1-2,0.700951,0.705855,0.699642


Segunda comparação: existe diferença de onde fazer o corte de gostou/desgostou?

In [38]:
# Descreve fatores de análise
factors: Set[str] = set()
for dataset_type in df["dataset_type"].unique():
    for arch in df["arch"].unique():
        for sample in df["sample"].unique():
            dt = dataset_type.split("-")[2]
            factors.add(f"{dt}-{sample}-{arch}")

# Calcula as médias para cada fator
metrics = []
for factor in factors:
    dt, sample, arch = factor.split("-")
    cut6 = f"scores-cut6-{dt}"
    cut7 = f"scores-cut7-{dt}"
    cut8 = f"scores-cut8-{dt}"

    metric = {
        "factor": factor,
        "cut6": [],
        "cut7": [],
        "cut8": []
    }

    subset = df.loc[
        (df["sample"] == float(sample))
        & (df["arch"] == int(arch))
    ]

    for _, row in subset.iterrows():
        with open(f"{RESULTS_DIR}/{row["predict_file"]}") as file:
            result = json.load(file)
            
        if row["dataset_type"] == cut6:
            metric["cut6"].append(result["metrics"]["f1-score"])

        if row["dataset_type"] == cut7:
            metric["cut7"].append(result["metrics"]["f1-score"])

        if row["dataset_type"] == cut8:
            metric["cut8"].append(result["metrics"]["f1-score"])

    metric["cut6"] = np.mean(metric["cut6"])
    metric["cut7"] = np.mean(metric["cut7"])
    metric["cut8"] = np.mean(metric["cut8"])

    metrics.append(metric)

metrics = pd.DataFrame(metrics)
metrics

Unnamed: 0,factor,cut6,cut7,cut8
0,full-0.1-2,0.700698,0.705855,0.703235
1,basic-0.1-2,0.707292,0.700951,0.70742
2,basic-0.2-2,0.702501,0.70521,0.70796
3,basic-0.2-1,0.70705,0.706703,0.707439
4,full-0.2-2,0.706176,0.709543,0.70554
5,full-0.2-1,0.704225,0.704651,0.707936
6,basic-0.1-1,0.703058,0.70668,0.703752
7,full-0.1-1,0.709564,0.704263,0.708675


Terceira comparação: existe diferença entre abordagens com diferentes profundidades?

In [43]:
# Descreve fatores de análise
factors: Set[str] = set()
for dataset_type in df["dataset_type"].unique():
    for sample in df["sample"].unique():
        factors.add(f"{dataset_type}/{sample}")

# Calcula as médias para cada fator
metrics = []
for factor in factors:
    dt, sample = factor.split("/")
    arch1, arch2 = 1, 2

    metric = {
        "factor": factor,
        "arch1": [],
        "arch2": []
    }

    subset = df.loc[
        (df["sample"] == float(sample))
        & (df["dataset_type"] == dt)
    ]

    for _, row in subset.iterrows():
        with open(f"{RESULTS_DIR}/{row["predict_file"]}") as file:
            result = json.load(file)
            
        if row["arch"] == arch1:
            metric["arch1"].append(result["metrics"]["f1-score"])

        if row["arch"] == arch2:
            metric["arch2"].append(result["metrics"]["f1-score"])

    metric["arch1"] = np.mean(metric["arch1"])
    metric["arch2"] = np.mean(metric["arch2"])
    metric["(arch2-arch1) %"] = (metric["arch2"] - metric["arch1"]) * 100 / metric["arch1"]

    metrics.append(metric)

metrics = pd.DataFrame(metrics)
metrics

Unnamed: 0,factor,arch1,arch2,(arch2-arch1) %
0,scores-cut6-basic/0.1,0.703058,0.707292,0.602211
1,scores-cut7-basic/0.1,0.70668,0.700951,-0.81068
2,scores-cut8-basic/0.2,0.707439,0.70796,0.07363
3,scores-cut6-full/0.1,0.709564,0.700698,-1.249483
4,scores-cut8-basic/0.1,0.703752,0.70742,0.52122
5,scores-cut8-full/0.1,0.708675,0.703235,-0.767709
6,scores-cut7-basic/0.2,0.706703,0.70521,-0.211295
7,scores-cut7-full/0.2,0.704651,0.709543,0.694354
8,scores-cut6-full/0.2,0.704225,0.706176,0.277121
9,scores-cut6-basic/0.2,0.70705,0.702501,-0.643364


Quarta comparação: o sample de dados interfere na capacidade do modelo?

In [45]:
# Descreve fatores de análise
factors: Set[str] = set()
for dataset_type in df["dataset_type"].unique():
    for arch in df["arch"].unique():
        factors.add(f"{dataset_type}/{arch}")

# Calcula as médias para cada fator
metrics = []
for factor in factors:
    dt, arch = factor.split("/")
    sampe01, sample02 = 0.1, 0.2

    metric = {
        "factor": factor,
        "sample01": [],
        "sample02": []
    }

    subset = df.loc[
        (df["dataset_type"] == dt)
        & (df["arch"] == int(arch))
    ]

    for _, row in subset.iterrows():
        with open(f"{RESULTS_DIR}/{row["predict_file"]}") as file:
            result = json.load(file)
            
        if row["sample"] == sampe01:
            metric["sample01"].append(result["metrics"]["f1-score"])

        if row["sample"] == sample02:
            metric["sample02"].append(result["metrics"]["f1-score"])

    metric["sample01"] = np.mean(metric["sample01"])
    metric["sample02"] = np.mean(metric["sample02"])

    metrics.append(metric)

metrics = pd.DataFrame(metrics)
metrics

Unnamed: 0,factor,sample01,sample02
0,scores-cut8-basic/1,0.703752,0.707439
1,scores-cut6-full/1,0.709564,0.704225
2,scores-cut6-full/2,0.700698,0.706176
3,scores-cut8-full/1,0.708675,0.707936
4,scores-cut8-full/2,0.703235,0.70554
5,scores-cut7-full/1,0.704263,0.704651
6,scores-cut7-basic/2,0.700951,0.70521
7,scores-cut7-full/2,0.705855,0.709543
8,scores-cut8-basic/2,0.70742,0.70796
9,scores-cut7-basic/1,0.70668,0.706703
