In [24]:
from justatom.storing.polars import POLARStore

import os
from pathlib import Path
import polars as pl
from typing import Optional, Union, List, Dict
from loguru import logger

In [2]:
# Location of the source CSV, resolved relative to the current user's home directory.
filepath = Path.home().joinpath("IDataset", "yappi_hackaton_2024_400k.csv")

In [3]:
# Read the raw CSV at `filepath` into a polars DataFrame.
docs_df = pl.read_csv(source=filepath)

In [4]:
# Keep only the rows that actually have a description.
docs_df = docs_df.drop_nulls(subset=["description"])

In [5]:
# Log how many rows remain in `docs_df` (shape[0] is the row count).
logger.info(f"There're total {docs_df.shape[0]} samples")

[32m2024-06-10 04:19:46.449[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mThere're total 345081 samples[0m


In [6]:
# Wrap the filtered frame in the project's POLARStore; the cells below use it
# for random sampling and per-column word counting.
store = POLARStore(df=docs_df)

In [16]:
# Draw random samples from the store and concatenate the result with pl.concat.
# NOTE(review): presumably this yields `num_obs` frames of `sample_size` rows each —
# confirm against POLARStore.random_sample.
# NOTE(review): no seed is passed here, so the sample may differ between runs.
pl_observation = pl.concat(store.random_sample(sample_size=1_000, num_obs=10))

In [19]:
# Text column analysed by the word-count cell below.
column = "description"
# Characters handed to the word counter as `stopchars`.
stopchars = "🤫!@#$&*~`,"

In [21]:
# Count words in `column` through the store helper, forwarding `stopchars` to it.
# NOTE(review): presumably the characters in `stopchars` are stripped before
# counting — confirm in POLARStore.count_words_per_col.
pl_counts_per_word = store.count_words_per_col(col=column, stopchars=stopchars)

In [22]:
# Keep only the words whose count is at least 1_000.
pl_top = pl_counts_per_word.filter(pl.col("counts").ge(1_000))

In [23]:
# Preview the first five rows of the frequent-word table.
pl_top.head(n=5)

words,counts
str,u32
"""boobs""",36802
"""красивыедевушк…",36064
"""бьютирутина""",28271
"""наспорте""",28061
"""спорт""",27550


In [25]:
def visualize_as_bar(
    arr: List[Dict],
    logo_path: Optional[Union[str, Path]] = None,
    logo_text: Optional[str] = None,
    x_axis_title: str = "X",
    title: Optional[str] = None,
    y_axis_title: str = "Y",
    logo_text_size: int = 7):
    """Build a dark-themed plotly bar chart from a list of name/value records.

    Args:
        arr: Records to plot. Each dict supplies the bar label under ``"name"``
            and the bar height under ``"value"``; a missing ``"name"`` falls
            back to the string ``"None"`` and a missing ``"value"`` to ``0``.
        logo_path: Optional path to an image file. When given, the file is read,
            base64-encoded and embedded as a PNG data URI anchored at the
            top-right corner just above the plot area.
        logo_text: Optional caption placed right under the logo. It is only
            added when ``logo_path`` is also provided.
        x_axis_title: Title of the x axis.
        title: Figure title. When ``None``, a default of the form
            ``"Distribution Y=Y(X)"`` is derived from the axis titles.
        y_axis_title: Title of the y axis.
        logo_text_size: Font size of the ``logo_text`` caption.

    Returns:
        The configured ``plotly.graph_objects.Figure``.

    Raises:
        OSError: If ``logo_path`` is given but the file cannot be opened.
    """
    import base64

    import plotly.graph_objects as go

    # Honour an explicit title; only fall back to the generated one when the
    # caller did not provide any.
    if title is None:
        title = f"Distribution {y_axis_title}={y_axis_title}({x_axis_title})"

    xs = [xi.get("name", "None") for xi in arr]
    vs = [xi.get("value", 0) for xi in arr]

    # Colours must be plain strings: a trailing comma here would turn the value
    # into a one-element tuple instead of a colour.
    bar_color = 'rgba(253, 246, 48, 0.4)'     # semi-transparent yellow fill
    border_color = 'rgba(253, 246, 48, 1)'    # opaque yellow outline
    text_color = 'rgba(255, 255, 255, 1)'     # white text
    grid_color = 'rgba(253, 246, 48, 0.2)'    # faint yellow grid lines

    fig = go.Figure([go.Bar(
        x=xs, y=vs,
        marker=dict(color=bar_color, line=dict(color=border_color, width=2))
        )
    ])

    fig.update_layout(
        title=title,
        title_font=dict(size=24, color=text_color),
        paper_bgcolor='rgba(0, 0, 0, 1)',
        plot_bgcolor='rgba(0, 0, 0, 1)',
        xaxis_title=x_axis_title,
        yaxis_title=y_axis_title,
        xaxis=dict(tickangle=-45, color=text_color, title_font=dict(size=18),
                   tickfont=dict(size=10)),
        yaxis=dict(color=text_color, title_font=dict(size=18)),
        font=dict(family="Courier New, monospace", size=15, color=text_color)
    )

    # Add a cyberpunk-style grid.
    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor=grid_color)
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor=grid_color)

    if logo_path:
        # Embed the logo inline as a data URI so the figure does not depend on
        # the image file after it has been built.
        with open(str(logo_path), "rb") as image_file:
            encoded_image = base64.b64encode(image_file.read()).decode()

        fig.add_layout_image(
            dict(
                source="data:image/png;base64," + encoded_image,
                xref="paper",
                yref="paper",
                x=1,
                y=1.05,
                sizex=0.2,
                sizey=0.2,
                xanchor="right",
                yanchor="bottom"
            )
        )
        if logo_text:
            # Same anchor point as the logo, but hanging below it (yanchor="top").
            fig.add_annotation(
                x=1,
                y=1.05,
                text=logo_text,
                showarrow=False,
                xref="paper", yref="paper",
                xanchor="right",
                yanchor="top",
                font=dict(size=logo_text_size, color="yellow")
            )

    return fig

In [28]:
# Pull the two columns of the frequent-word table out as plain Python lists.
xs = pl_top.get_column("words").to_list()
ys = pl_top.get_column("counts").to_list()

In [29]:
# Turn the word/count pairs into the {"name": ..., "value": ...} records that
# visualize_as_bar reads, and attach the logo stored under ./.data.
# The y-axis title is spelled with ASCII letters only (the previous literal
# started with a look-alike Cyrillic character).
fig = visualize_as_bar(
    arr=[{"name": word, "value": count} for word, count in zip(xs, ys)],
    x_axis_title="Word",
    y_axis_title="COUNT",
    logo_path=Path.cwd() / ".data" / "polaroids.ai.logo.png",
    logo_text="Powered by polaroids.ai",
)

In [31]:
# Export settings: width/height are left as None and passed through unchanged;
# the image is scaled by dpi / 72.
# NOTE(review): presumably 72 is treated as the 1x baseline DPI — confirm intent.
width = None
height = None
dpi = 360
fig.write_image(
    "counts(word)_curve.png",
    format="png",
    width=width,
    height=height,
    scale=dpi / 72,
    engine="kaleido",
)