In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import fs

In [2]:
INTERIM_FILES_DIR = fs.open_fs("MCD-NLP-HPT/data/interim")
INTERIM_COMMENTS_CLEANED_ONLY_SW_DIR = INTERIM_FILES_DIR.getsyspath("Comments_cleaned_only_sw.csv")

PROCESSED_FILES_DIR = fs.open_fs("MCD-NLP-HPT/data/processed")
TRAIN_DIR = PROCESSED_FILES_DIR.getsyspath("train_only_sw.csv")
TEST_DIR = PROCESSED_FILES_DIR.getsyspath("test_only_sw.csv")
VAL_DIR = PROCESSED_FILES_DIR.getsyspath("validation_only_sw.csv")

X_TRAIN_DIR = PROCESSED_FILES_DIR.getsyspath("x_train_only_sw.csv")
X_TEST_DIR = PROCESSED_FILES_DIR.getsyspath("x_test_only_sw.csv")
X_VAL_DIR = PROCESSED_FILES_DIR.getsyspath("x_val_only_sw.csv")

Y_TRAIN_DIR = PROCESSED_FILES_DIR.getsyspath("y_train_only_sw.csv")
Y_TEST_DIR = PROCESSED_FILES_DIR.getsyspath("y_test_only_sw.csv")
Y_VAL_DIR = PROCESSED_FILES_DIR.getsyspath("y_val_only_sw.csv")

In [3]:
df = pd.read_csv(INTERIM_COMMENTS_CLEANED_ONLY_SW_DIR)

In [4]:
df['category_id'] = df['category_id'].astype('category')
# df['category_description'] = df['category_description'].astype('category')

In [5]:
df

Unnamed: 0,id,videoId,textOriginal,authorDisplayName,likeCount,publishedAt,category_id,category_description,Tokens,Tokens_without_stopwords
0,Ugz0QmXyVLSM0nUK7kB4AaABAg,cQ9RLDhq6JY,me gusta su episodio pero solo he escuchado lo...,@fabianbecerra2272,0,2024-07-15T12:41:43Z,1,Quejas o sugerencias de mejora,"['me', 'gusta', 'su', 'episodio', 'pero', 'sol...","['episodio', 'escuchado', 'youtube', 'spotify'..."
1,UgyJhRBhVcbyStZdxjR4AaABAg,cQ9RLDhq6JY,weeeeey,@SantiagoItzcoatl,0,2024-05-26T21:41:04Z,7,Comentarios generales,['weeeeey'],[]
2,UgzT1q2am7kMEM5ckDp4AaABAg,cQ9RLDhq6JY,les voy a enviar una foto de tarapaca,@Sonyeke_1,0,2024-05-23T22:06:21Z,7,Comentarios generales,"['les', 'voy', 'a', 'enviar', 'una', 'foto', '...","['foto', 'tarapaca']"
3,Ugyckucg38lqZFdZhaR4AaABAg,cQ9RLDhq6JY,entonces teca esta solterooooooooooooooo woooooo,@alejandranavarro464,0,2024-05-05T09:24:57Z,6,Comentarios humorísticos o memes,"['entonces', 'teca', 'esta', 'solteroooooooooo...",['teca']
4,UgzhZHVt-tEbDpbCvtB4AaABAg,cQ9RLDhq6JY,me encantan sus episodio los escucho en spotif...,@fiorellamartinez873,0,2024-04-21T21:09:08Z,5,Felicitaciones y agradecimientos,"['me', 'encantan', 'sus', 'episodio', 'los', '...","['encantan', 'episodio', 'spotify', 'viajo', '..."
...,...,...,...,...,...,...,...,...,...,...
1021,Ugyuq-tSoKFj7HEnAQ54AaABAg,t3MogAmkMUg,justo a tiempo para hacerme pndja en el trabaj...,@estefaniasandoval9999,0,2024-08-19T19:46:46Z,6,Comentarios humorísticos o memes,"['justo', 'a', 'tiempo', 'para', 'hacerme', 'p...","['tiempo', 'bien', 'invitado']"
1022,Ugy2h_vD89gf_8eeFph4AaABAg,t3MogAmkMUg,ya que son otakus declarados les recomiendo el...,@medinamartinezdavid8052,0,2024-08-19T19:33:41Z,2,Propuestas y recomendaciones,"['ya', 'que', 'son', 'otakus', 'declarados', '...","['anime', 'anime', 'bien', 'episodio']"
1023,Ugx7vBivUc-z-n8H76t4AaABAg,t3MogAmkMUg,pero dejen hablar al invitado chavos banda d,@hoffnerbass,1,2024-08-19T19:33:18Z,1,Quejas o sugerencias de mejora,"['pero', 'dejen', 'hablar', 'al', 'invitado', ...","['invitado', 'chavos', 'banda']"
1024,Ugzvu7xFsIUo8ibNXPJ4AaABAg,t3MogAmkMUg,el primer metal no seria el na cuando el human...,@LuisAndresOtalvaroSanchez,0,2024-08-19T19:13:00Z,4,Correcciones o datos adicionales,"['el', 'primer', 'metal', 'no', 'seria', 'el',...","['metal', 'sal']"


# Separación de datos de prueba y de entrenamiento

In [6]:
train_df, test_df = train_test_split(df, test_size=0.1,random_state=1)
test_df, val_df = train_test_split(test_df, test_size=0.1, random_state=1)

In [7]:
def calculate_category_percentages(df):
    """
    Calculate the percentage of each category in all categorical columns of a DataFrame.
    
    Parameters:
        df (pd.DataFrame): The input DataFrame.
        
    Returns:
        pd.DataFrame: A DataFrame containing columns, categories, and their percentages.
    """
    per_data = []

    # Iterate over columns with categorical dtype
    for col in df.select_dtypes(include=['category']):
        # Calculate value counts as percentages
        percentages = df[col].value_counts(normalize=True) * 100
        # Append data for each category
        for category, percentage in percentages.items():
            per_data.append({'Column': col, 'Category': category, 'Percentage': percentage})

    # Return the result as a DataFrame
    return pd.DataFrame(per_data)


In [8]:
def generate_report_for_datasets(dataset_dict):
    """
    Generate a report of category percentages for multiple datasets.
    
    Parameters:
        dataset_dict (dict): A dictionary where keys are dataset names and values are DataFrames.
        
    Returns:
        dict: A dictionary where keys are dataset names and values are DataFrames with percentages.
    """
    reports = {}
    for name, df in dataset_dict.items():
        reports[name] = calculate_category_percentages(df)
    return reports

In [9]:
datasets = {
    "Train Data": train_df,
    "Validation Data": val_df,
    "Test Data": test_df
}

# Generate reports
reports = generate_report_for_datasets(datasets)

# Display results
for dataset_name, report in reports.items():
    print(f"--- {dataset_name} ---")
    print(report)

--- Train Data ---
        Column  Category  Percentage
0  category_id         5   22.210184
1  category_id         6   21.560130
2  category_id         2   16.684724
3  category_id         4   12.567714
4  category_id         7   11.159263
5  category_id         1    9.100758
6  category_id         3    6.717226
--- Validation Data ---
        Column  Category  Percentage
0  category_id         2   36.363636
1  category_id         6   27.272727
2  category_id         7   27.272727
3  category_id         4    9.090909
4  category_id         1    0.000000
5  category_id         3    0.000000
6  category_id         5    0.000000
--- Test Data ---
        Column  Category  Percentage
0  category_id         6   25.000000
1  category_id         2   21.739130
2  category_id         5   14.130435
3  category_id         7   14.130435
4  category_id         4   13.043478
5  category_id         1    6.521739
6  category_id         3    5.434783


In [10]:
train_df.to_csv(TRAIN_DIR, index=False, encoding="utf-8")
test_df.to_csv(TEST_DIR, index=False, encoding="utf-8")
val_df.to_csv(VAL_DIR, index=False, encoding="utf-8")

# Spliting the dataset into train, test and validation set

In [11]:
x_train_df = train_df.drop(['category_id'], axis=1)
y_train_df = train_df['category_id']

x_test_df = test_df.drop(['category_id'], axis=1)
y_test_df = test_df['category_id']

x_val_df = val_df.drop(['category_id'], axis=1)
y_val_df = val_df['category_id']

In [12]:
x_train_df.to_csv(X_TRAIN_DIR, index=False, encoding="utf-8")
x_test_df.to_csv(X_TEST_DIR, index=False, encoding="utf-8")
x_val_df.to_csv(X_VAL_DIR, index=False, encoding="utf-8")

y_train_df.to_csv(Y_TRAIN_DIR, index=False, encoding="utf-8")
y_test_df.to_csv(Y_TEST_DIR, index=False, encoding="utf-8")
y_val_df.to_csv(Y_VAL_DIR, index=False, encoding="utf-8")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=2820e488-6f1b-466d-af14-a66826f012e3' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>