In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from lxml import etree
from plotly import graph_objects as go

## wczytanie danych ze StackOverflow

In [2]:
DATA_PATH = Path("ai.stackexchange.com")  # https://archive.org/details/stackexchange

def load_xml(path):
    tree = etree.parse(path)
    records = [dict(x.attrib) for x in tree.getroot().iterchildren()]
    data = pd.DataFrame.from_records(records).drop(columns=["TagBased"])

    return data


def preprocess(data: pd.DataFrame) -> pd.DataFrame:
    data["Date"] = pd.to_datetime(data["Date"])
    data = data[data["Date"] >= "2021-01-01"]
    data["Class"] = data.loc[:, "Class"].astype("category")
    data["UserId"] = data.loc[:, "UserId"].astype("category")

    return data.reset_index(drop=True)

In [3]:
badges_df = preprocess(load_xml(DATA_PATH / "Badges.xml"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Class"] = data.loc[:, "Class"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["UserId"] = data.loc[:, "UserId"].astype("category")


### opis kolumn

- `Id`: id wydarzenia,
- `UserId`: id użytkownika,
- `Name`: nazwa odznaki (*badge*),
- `Date`: data nadania odznaki,
- `Class`: ranga odznaki (im mniejsza tym bardziej unikalna).

In [4]:
badges_df.head()

Unnamed: 0,Id,UserId,Name,Date,Class
0,40869,19524,Quorum,2021-01-01 02:01:25.147,3
1,40870,44,Talkative,2021-01-01 02:01:25.147,3
2,40871,9235,Talkative,2021-01-01 02:01:25.147,3
3,40872,18028,Autobiographer,2021-01-01 02:36:25.170,3
4,40873,35616,Commentator,2021-01-01 04:05:15.497,3


### top użytkowników pod względem liczby odznak w latach 2021-2022

In [5]:
fig = badges_df.value_counts("UserId")[:10].plot(
    kind="bar", backend="plotly", title="Top 10 users with most badges"
)
fig.update_layout(xaxis_title="UserId", yaxis_title="Number of badges", showlegend=False)


### liczba otrzymywanych odznak w czasie dla 5 top użytkowników

In [6]:
def plot_user_badges(data: pd.DataFrame, user_ids: list[int]):
    selected_users = data.loc[data["UserId"].isin(user_ids), :].reset_index(drop=True)
    fig = go.Figure()

    for user_id, user_badges in selected_users.groupby(by="UserId", observed=True):
        user_badges = user_badges.resample("M", on="Date").count()
        fig.add_trace(
            go.Scatter(
                x=user_badges.index,
                y=user_badges["Id"],
                name=user_id,
                mode="lines+markers",
            )
        )

    fig.update_layout(
        title="Number of badges per month",
        xaxis_title="Time", yaxis_title="Number of badges", legend_title="UserId"
    )

    return fig

selected_user_ids = badges_df.value_counts("UserId")[:5].index.tolist()
plot_user_badges(badges_df, selected_user_ids)


### export wybranych użytkowników do formy przyjaznej dla evently

In [7]:
def export_user(data: pd.DataFrame, user_id: int):
    user_data = data.loc[data["UserId"] == user_id, :].reset_index(drop=True)
    user_data = user_data.sort_values(by="Date").reset_index(drop=True)
    
    time_diffs = [0]
    for idx, date in enumerate(user_data.loc[1:, "Date"]):
        diff = date - user_data.loc[idx, "Date"]
        time_diffs.append(diff.total_seconds())

    scale_factor = 24*60*60
    user_data["TimeDiff"] = time_diffs
    user_data["time"] = user_data["TimeDiff"].cumsum() / scale_factor
    user_data["magnitude"] = 4 - user_data["Class"].astype(int)


    user_data.to_csv(DATA_PATH / f"user_{user_id}.csv", index=False)

In [8]:
export_user(badges_df, "2444")
export_user(badges_df, "1847")