In [None]:
# Clone the clef2025-checkthat-lab repository from GitLab into the current working directory.
!git clone https://gitlab.com/checkthat_lab/clef2025-checkthat-lab.git

In [None]:
# Move into the task2 data directory of the cloned repository; the relative
# "train/", "dev/" and "test/" paths used in later cells are resolved from here.
%cd clef2025-checkthat-lab/task2/data

In [None]:
import os
import re
import json

from matplotlib.ticker import PercentFormatter
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib
import numpy as np
import pandas as pd
import zipfile

# Widen pandas' column display limit to 300 characters so long text cells
# are less likely to be truncated when frames are displayed.
pd.set_option('max_colwidth', 300)

In [None]:
# Locations of the three "-por" CSV splits, relative to the current directory.
train_path = "train/train-por.csv"
dev_path = "dev/dev-por.csv"
test_path = "test/test-por.csv"

# Read every split into its own DataFrame (train, dev, test order).
train_df, dev_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, dev_path, test_path)
)

In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
# Preview the first rows of the dev split.
dev_df.head()

In [None]:
# Preview the first rows of the test split.
test_df.head()

In [None]:
def remove_repetitions(text):
    """Collapse a text that consists of one phrase repeated back-to-back.

    A trailing ``None`` token (with any surrounding whitespace) is stripped
    first. The remaining words are then checked against every candidate phrase
    length, shortest first. The text is collapsed to a single copy of the
    phrase only when the *whole* text is made of repetitions of it, optionally
    followed by a truncated (prefix) copy of the phrase.

    Args:
        text: Value from the ``post`` column. Non-string values (e.g. NaN)
            are returned untouched.

    Returns:
        The single repeated phrase joined by single spaces when a repetition
        is detected; otherwise the input with only the trailing ``None``
        removed. Non-string inputs are returned as-is.
    """
    if not isinstance(text, str):
        return text

    text = re.sub(r'\s*None\s*$', '', text)
    words = text.split()

    for size in range(1, len(words) // 2 + 1):
        pattern = words[:size]
        repeats = len(words) // size
        covered = size * repeats

        if words[:covered] != pattern * repeats:
            continue

        # Words after the last full repetition must themselves be the start of
        # another (cut-off) repetition. Otherwise they are genuine extra
        # content and collapsing would silently delete them.
        leftover = words[covered:]
        if leftover == pattern[:len(leftover)]:
            return " ".join(pattern)

    return text

In [None]:
# Apply the repetition clean-up to the 'post' column of every split.
for frame in (train_df, dev_df, test_df):
    frame['post'] = frame['post'].apply(remove_repetitions)

# Destinations for the cleaned copies (the source CSVs are left untouched).
train_cleaned_path = "train/train_cleaned.csv"
dev_cleaned_path = "dev/dev_cleaned.csv"
test_cleaned_path = "test/test_cleaned.csv"

# Persist each cleaned split without the DataFrame index column.
for frame, out_path in zip(
    (train_df, dev_df, test_df),
    (train_cleaned_path, dev_cleaned_path, test_cleaned_path),
):
    frame.to_csv(out_path, index=False)

print("Corrected files saved successfully!")
print(f"Train: {train_cleaned_path}")
print(f"Dev: {dev_cleaned_path}")
print(f"Test: {test_cleaned_path}")

In [None]:
def _word_count(value):
    """Return the number of whitespace-separated tokens in ``str(value)``."""
    return len(str(value).split())


# Element-wise word counts for every cell of the train and dev frames.
train_lengths = train_df.map(_word_count)
dev_lengths = dev_df.map(_word_count)

In [None]:
# Summary statistics of the per-cell word counts in the training split, rounded to one decimal.
train_lengths.describe().round(1)

In [None]:
# Summary statistics of the per-cell word counts in the dev split, rounded to one decimal.
dev_lengths.describe().round(1)

In [None]:
def count_none_occurrences(file_path):
    """Return the percentage of rows whose 'post' value ends with "none".

    The CSV at ``file_path`` is read from disk. Each 'post' value is converted
    to a string, stripped of surrounding whitespace and lower-cased before the
    suffix check, so the match is case-insensitive.

    Args:
        file_path: Path to a CSV file containing a 'post' column.

    Returns:
        The share of matching rows as a percentage, rounded to two decimals.
    """
    frame = pd.read_csv(file_path)
    normalized_posts = frame['post'].astype(str).str.strip().str.lower()
    matching_rows = normalized_posts.str.endswith('none').sum()

    percentage = matching_rows / frame.shape[0] * 100
    return round(percentage, 2)

# Measure the original (uncleaned) CSV files: the in-memory frames have
# already had the trailing 'None' stripped from their 'post' column.
train_none_count = count_none_occurrences(train_path)
dev_none_count = count_none_occurrences(dev_path)
test_none_count = count_none_occurrences(test_path)

# count_none_occurrences returns a percentage of rows (0-100), not a raw
# count, so the report is labelled accordingly.
print("Percentage of rows whose 'post' ends with 'None':")
print(f"Train: {train_none_count}%")
print(f"Dev: {dev_none_count}%")
print(f"Test: {test_none_count}%")

In [None]:
def make_histogram(column):
    """Plot overlaid train/dev word-count histograms for one column and save as PDF.

    Reads the notebook-level ``train_lengths`` and ``dev_lengths`` frames.
    Bar heights are the fraction of examples of each split that fall into a
    bin, so the percentage-formatted y-axis shows real percentages. Both
    splits share the same bin edges so their bars are directly comparable.
    The median over both splits combined is marked with a dashed line.

    The figure is written to ``"<column>.pdf"`` in the current directory and
    left open as the current figure so the caller can ``plt.show()`` it.

    Args:
        column: Name of the column in ``train_lengths`` / ``dev_lengths``.
    """
    # Set the font size before any artist exists; text created earlier would
    # keep the previous size and make the first figure inconsistent.
    matplotlib.rcParams.update({'font.size': 12})

    train_counts = train_lengths[column]
    dev_counts = dev_lengths[column]
    combined = pd.concat([train_counts, dev_counts])
    median = combined.median()

    # One set of 30 bins spanning both splits, instead of independent binning.
    bin_edges = np.histogram_bin_edges(combined, bins=30)

    fig, ax = plt.subplots(figsize=(4.5, 4.5))
    for counts, label in ((train_counts, "Train"), (dev_counts, "Dev")):
        # A weight of 1/N per example makes each bar the share of the split in
        # that bin (density=True would give probability density, not percent).
        weights = np.ones(len(counts)) / max(len(counts), 1)
        ax.hist(counts, bins=bin_edges, weights=weights, alpha=0.7, label=label)

    ax.axvline(median, color='k', linestyle='dashed', linewidth=1, label=f"Median = {median}")
    ax.set_xlabel("Word count")
    ax.set_ylabel("Percentage")
    ax.yaxis.set_major_formatter(PercentFormatter(1))
    ax.legend()
    fig.tight_layout()
    fig.savefig(f"{column}.pdf")

In [None]:
# Draw the word-count histogram for the 'post' column (also saved as "post.pdf"), then render it.
make_histogram("post")
plt.show()

In [None]:
# Draw the word-count histogram for the 'normalized claim' column
# (also saved as "normalized claim.pdf"), then render it.
make_histogram("normalized claim")
plt.show()

In [None]:
# Number of rows in each split, keyed by the label shown on the x-axis.
split_sizes = {
    "Train": len(train_df),
    "Validation": len(dev_df),
    "Test": len(test_df),
}

fig, ax = plt.subplots(figsize=(6, 5))
bars = ax.bar(
    list(split_sizes.keys()),
    list(split_sizes.values()),
    color=["blue", "orange", "green"],
)
ax.bar_label(bars)  # print the exact count above every bar
ax.set_ylabel("Number of Examples")
ax.set_title("Comparison of Number of Examples per Set")
plt.show()


In [None]:
def _count_words(value):
    """Return the number of whitespace-separated tokens in ``str(value)``."""
    return len(str(value).split())


# Text columns to compare, keyed by the tick label shown under each box.
boxplot_columns = {
    "Train - Post": train_df["post"],
    "Train - Claim": train_df["normalized claim"],
    "Validation - Post": dev_df["post"],
    "Validation - Claim": dev_df["normalized claim"],
}

plt.figure(figsize=(10, 6))
plt.boxplot([texts.apply(_count_words) for texts in boxplot_columns.values()])
# Label the boxes through the x-ticks (boxes sit at positions 1..N by default)
# rather than boxplot's `labels=` keyword, which is deprecated in recent
# matplotlib releases; xticks works on old and new versions alike.
plt.xticks(range(1, len(boxplot_columns) + 1), list(boxplot_columns.keys()))

plt.ylabel("Number of Words")
plt.title("Distribution of Post and Claim Lengths")
plt.show()
