# Analyze data

# 1. Imports

## 1.1 Packages

In [1]:
import os
import pandas as pd

import re

from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm



## 1.2 Options

In [2]:
# Root of the raw data, relative to the notebook's working directory.
path_data = "../data/01_raw"

# The train and test article folders sit side by side under the raw-data root.
path_data_train, path_data_test = (
    os.path.join(path_data, split_name) for split_name in ("train", "test")
)

## 1.3 Datasets

# 2. Analyze data

In [3]:
# Load the training labels; the first CSV column becomes the row index.
train_csv_path = os.path.join(path_data, "train.csv")
df = pd.read_csv(train_csv_path, index_col=0)

In [4]:
df

id,real_text_id
0,1
1,2
2,1
3,2
4,2
...,...
90,2
91,1
92,2
93,2


In [9]:
def generate_dataset_train(df: pd.DataFrame, path_data: str):
    """Yield one record per training article listed in ``df``.

    Each article is read from ``<path_data>/article_<id:04d>/``, which must
    contain the two files ``file_1.txt`` and ``file_2.txt``.

    Args:
        df (pd.DataFrame): Training table. The article id is taken from an
            ``id`` column when one exists, otherwise from the index (which is
            where ``pd.read_csv(..., index_col=0)`` places it).
        path_data (str): Directory containing the ``article_XXXX`` folders.

    Yields:
        dict: ``{"id": int, "text1": str, "text2": str}``, one per row of
        ``df``, in row order. The ``real_text_id`` column is not emitted.

    Raises:
        OSError: If an expected article folder or text file cannot be opened.
    """
    # The folder is keyed by the article id, not by `real_text_id`. That
    # column is presumably the label (which of the two files is the real one),
    # so using it as the folder number would reload the same few folders for
    # every row.
    article_ids = df["id"] if "id" in df.columns else df.index

    for article_id in article_ids:
        # Plain int so the `:04d` format works whatever integer dtype is used.
        article_id = int(article_id)
        folder_path = os.path.join(path_data, f"article_{article_id:04d}")

        with open(os.path.join(folder_path, "file_1.txt"), encoding="utf-8") as f1:
            text1 = f1.read()
        with open(os.path.join(folder_path, "file_2.txt"), encoding="utf-8") as f2:
            text2 = f2.read()

        yield {
            "id": article_id,
            "text1": text1,
            "text2": text2,
        }


def generate_dataset_test(path_data: str):
    """Yield one record per ``article_<digits>`` folder found in ``path_data``.

    Only sub-directories whose whole name is ``article_`` followed by digits
    are used; they are visited in sorted (lexicographic) name order. Each must
    contain the two files ``file_1.txt`` and ``file_2.txt``.

    Args:
        path_data (str): Directory containing the ``article_XXXX`` folders.

    Yields:
        dict: ``{"id": int, "text1": str, "text2": str}`` where ``id`` is the
        numeric suffix of the folder name.

    Raises:
        OSError: If ``path_data`` cannot be listed or a text file is missing.
    """
    # Match the whole name: a prefix match would also accept folders such as
    # `article_12abc`, whose suffix cannot be converted to an integer id.
    folder_pattern = re.compile(r'article_(\d+)')

    folders = []
    for name in os.listdir(path_data):
        match = folder_pattern.fullmatch(name)
        if match and os.path.isdir(os.path.join(path_data, name)):
            folders.append((name, int(match.group(1))))
    # Names are unique, so sorting the pairs orders them by folder name.
    folders.sort()

    # Read the two text files of each article folder.
    for name, folder_id in folders:
        folder_path = os.path.join(path_data, name)

        with open(os.path.join(folder_path, "file_1.txt"), encoding="utf-8") as f1:
            text1 = f1.read()
        with open(os.path.join(folder_path, "file_2.txt"), encoding="utf-8") as f2:
            text2 = f2.read()

        yield {
            "id": folder_id,
            "text1": text1,
            "text2": text2,
        }

In [10]:
# Build one Hugging Face `Dataset` per split. Each generator call is wrapped in
# a zero-argument lambda so that `from_generator` receives a callable rather
# than an already-created generator object.
train_dataset = Dataset.from_generator(lambda: generate_dataset_train(df, path_data_train))
test_dataset = Dataset.from_generator(lambda: generate_dataset_test(path_data_test))

Generating train split: 95 examples [00:00, 7976.52 examples/s]
Generating train split: 1068 examples [00:00, 7411.46 examples/s]


In [11]:
train_dataset

Dataset({
    features: ['id', 'text1', 'text2'],
    num_rows: 95
})