# Testing ManifestoChunkAnalysis to Pandas DataFrame conversion
This notebook evaluates different methods for converting a list of `ManifestoChunkAnalysis` objects into a pandas DataFrame.

## Setup
### Import libraries

In [None]:
import random
import pandas as pd
from polids.structured_analysis.base import (
    ManifestoChunkAnalysis,
    HateSpeechDetection,
    PoliticalCompass,
)

In [None]:
from polids.utils.pandas import convert_pydantic_to_dataframe

### Define helper functions

In [None]:
def generate_random_manifesto_chunk() -> ManifestoChunkAnalysis:
    """Build a ManifestoChunkAnalysis filled with randomly drawn values.

    Returns:
        An analysis with 1-5 placeholder policy proposals, a random sentiment
        and topic, a hate-speech result listing 0-2 distinct targeted groups,
        and a random value on each political-compass axis.
    """
    sentiments = ["positive", "negative", "neutral"]
    topics = [
        "economy",
        "healthcare",
        "education",
        "migration",
        "transport",
        "science",
        "sustainability",
        "welfare",
        "social causes",
        "ideology",
    ]
    group_pool = ["race", "religion", "gender identity", "disability"]

    # Values are drawn in field order so the random stream is consumed in a
    # fixed sequence on every call.
    proposal_count = random.randint(1, 5)
    proposals = [f"Proposal {i}" for i in range(proposal_count)]
    sentiment = random.choice(sentiments)
    topic = random.choice(topics)

    is_hate_speech = random.choice([True, False])
    group_count = random.randint(0, 2)
    hate_speech = HateSpeechDetection(
        hate_speech=is_hate_speech,
        reason="Randomly generated reason.",
        # sample() picks without replacement, so groups never repeat.
        targeted_groups=random.sample(group_pool, k=group_count),
    )

    political_compass = PoliticalCompass(
        economic=random.choice(["left", "center", "right"]),
        social=random.choice(["libertarian", "center", "authoritarian"]),
    )

    return ManifestoChunkAnalysis(
        policy_proposals=proposals,
        sentiment=sentiment,
        topic=topic,
        hate_speech=hate_speech,
        political_compass=political_compass,
    )

## Generate data
Create a large dataset (400,000 randomly generated `ManifestoChunkAnalysis` objects) for testing.

In [None]:
# Generate 400,000 random analyses, then preview the first five.
data = [generate_random_manifesto_chunk() for _ in range(400_000)]
data[:5]

## Test conversion method

In [None]:
# Dump each model to a plain dict and let pandas build one row per object.
# Nested models stay as dict-valued cells at this stage.
df = pd.DataFrame([chunk.model_dump() for chunk in data])  # 2 seconds
df

In [None]:
# Flatten every dict-valued column into one column per key, named
# "<column>_<key>", working on a copy so `df` itself is left untouched.
new_df = df.copy()
for col in df.columns.tolist():
    # Only the first row is inspected to decide whether a column holds dicts.
    if not isinstance(df[col].iloc[0], dict):
        continue
    # Expand the dicts into their own DataFrame; prefixing with the source
    # column name keeps the new column names unique.
    expanded_dict_df = pd.json_normalize(new_df[col]).add_prefix(f"{col}_")
    # Swap the original dict column for its expanded columns.
    new_df = pd.concat([new_df.drop(columns=[col]), expanded_dict_df], axis=1)
new_df

## Implemented solution

In [None]:
# Convert the same objects with the project's helper imported from
# polids.utils.pandas, then display the result.
solution_df = convert_pydantic_to_dataframe(data)
solution_df