In [6]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn3

In [None]:
# Read the three Portuguese splits from CSV files given as relative paths.
train_df, dev_df, test_df = map(
    pd.read_csv,
    ("train-por.csv", "dev-por.csv", "test-por.csv"),
)

# Print the column / dtype / non-null summary of each split: train, dev, test.
for split_df in (train_df, dev_df, test_df):
    split_df.info()

In [None]:
# Show the first rows of every split, each under a heading naming the split.
for heading, split_df in (
    ("Train Dataset:", train_df),
    ("Dev Dataset:", dev_df),
    ("Test Dataset:", test_df),
):
    print(heading)
    display(split_df.head())

In [None]:
# Distinct values of the "post" column in each split; building a set also
# collapses any duplicates inside a single split.
train_posts, dev_posts, test_posts = (
    set(frame["post"]) for frame in (train_df, dev_df, test_df)
)

# Pairwise overlaps. These are inclusive: a post present in all three splits
# appears in each of the pairwise sets as well.
train_dev_intersection = train_posts & dev_posts
train_test_intersection = train_posts & test_posts
dev_test_intersection = dev_posts & test_posts

# Posts shared by all three splits at once.
train_dev_test_intersection = train_posts & dev_posts & test_posts

# Report the size of every overlap, in the same order as computed above.
overlap_report = {
    "Train and Dev": train_dev_intersection,
    "Train and Test": train_test_intersection,
    "Dev and Test": dev_test_intersection,
    "Train, Dev and Test": train_dev_test_intersection,
}
for split_names, shared_posts in overlap_report.items():
    print(f"Number of repeated posts between {split_names}: {len(shared_posts)}")

In [None]:
# Inclusive overlap counts: a post present in all three splits is counted in
# every pairwise number below as well as in the triple one.
train_dev = len(train_dev_intersection)
train_test = len(train_test_intersection)
dev_test = len(dev_test_intersection)
train_dev_test = len(train_dev_test_intersection)

# Number of distinct posts in each split.
train_size = len(train_posts)
dev_size = len(dev_posts)
test_size = len(test_posts)

# venn3 interprets a 7-tuple as the sizes of the seven *disjoint* regions
# (Abc, aBc, ABc, abC, AbC, aBC, ABC), not as whole-set or whole-intersection
# sizes. Passing the inclusive counts directly would over-count every region
# except the triple one, so convert them to exclusive counts first.
train_dev_only = train_dev - train_dev_test
train_test_only = train_test - train_dev_test
dev_test_only = dev_test - train_dev_test
train_only = train_size - train_dev_only - train_test_only - train_dev_test
dev_only = dev_size - train_dev_only - dev_test_only - train_dev_test
test_only = test_size - train_test_only - dev_test_only - train_dev_test

venn_colors = {'Train': 'red', 'Dev': 'blue', 'Test': 'green'}

plt.figure(figsize=(6, 6))
venn = venn3(
    # Region order with A=Train, B=Dev, C=Test.
    subsets=(
        train_only,       # Abc: Train only
        dev_only,         # aBc: Dev only
        train_dev_only,   # ABc: Train and Dev, not Test
        test_only,        # abC: Test only
        train_test_only,  # AbC: Train and Test, not Dev
        dev_test_only,    # aBC: Dev and Test, not Train
        train_dev_test,   # ABC: all three
    ),
    set_labels=('Train', 'Dev', 'Test'),
    set_colors=(venn_colors['Train'], venn_colors['Dev'], venn_colors['Test'])
)

plt.title("Venn diagram of intersections between the datasets")
plt.show()

In [None]:
# Print up to five sample posts from each pairwise overlap. The overlaps are
# sets, so which five are shown is arbitrary.
overlap_examples = (
    ("Examples of repeated posts between Train and Dev:", train_dev_intersection),
    ("\nExamples of repeated posts between Train and Test:", train_test_intersection),
    ("\nExamples of repeated posts between Dev and Test:", dev_test_intersection),
)
for heading, shared_posts in overlap_examples:
    print(heading)
    print(list(shared_posts)[:5])

In [12]:
# Every distinct post that appears in at least one of the three splits.
all_unique_posts = train_posts | dev_posts | test_posts
unique_post_count = len(all_unique_posts)

print(f"Total de posts únicos após remoção das duplicatas: {unique_post_count}")

Total de posts únicos após remoção das duplicatas: 1944


In [None]:
# Resolve cross-split overlap with the priority Test > Dev > Train: a post is
# kept in Test if it occurs there, else in Dev, else in Train.
# Only rows whose post also appears in a higher-priority split are dropped;
# repeated posts inside a single split are left untouched.

# NOTE(review): all_unique_posts is presumably the union of all splits' posts,
# in which case this mask keeps every Test row - confirm.
test_in_known_posts = test_df["post"].isin(all_unique_posts)
filtered_test_df = test_df[test_in_known_posts]

# Dev loses every row whose post is already in Test.
dev_in_test = dev_df["post"].isin(filtered_test_df["post"])
filtered_dev_df = dev_df[~dev_in_test]

# Train loses every row whose post is in Test or in the remaining Dev rows.
train_in_test = train_df["post"].isin(filtered_test_df["post"])
train_in_dev = train_df["post"].isin(filtered_dev_df["post"])
filtered_train_df = train_df[~(train_in_test | train_in_dev)]

for split_name, kept_df in (
    ("Train", filtered_train_df),
    ("Dev", filtered_dev_df),
    ("Test", filtered_test_df),
):
    print(f"{split_name} size after removal: {len(kept_df)}")


In [None]:
# Write each filtered split to its own CSV, without the DataFrame index column.
for csv_name, kept_df in (
    ("train_por_overlap.csv", filtered_train_df),
    ("dev_por_overlap.csv", filtered_dev_df),
    ("test_por_overlap.csv", filtered_test_df),
):
    kept_df.to_csv(csv_name, index=False)

print("New CSV files saved successfully!")

In [None]:
# Row counts of the filtered splits, keyed by the label shown on the chart.
split_frames = {
    "Train": filtered_train_df,
    "Dev": filtered_dev_df,
    "Test": filtered_test_df,
}
datasets = list(split_frames)
sizes = [len(frame) for frame in split_frames.values()]

# Pie chart of each split's share of the remaining rows.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    sizes,
    labels=datasets,
    autopct='%1.1f%%',  # percentage label with one decimal place
    startangle=140,
    wedgeprops={'edgecolor': 'black'},
)
ax.set_title("Distribution of Data After Removing Duplicates")
plt.show()