# Filter the matches (subreddit to country) (skip if using separated files)
The matches are put in separate files based on whether they are approved, rejected, group (e.g. Europe or Commonwealth), or still unsure (these will require further checking).

In [None]:
import pandas as pd

# Load the manually reviewed matches file. Treat it as read-only: nothing in
# this notebook should ever write back to it.
matches_reviewed = pd.read_csv("data/subreddit_country_matches_corrected.csv")

# Tally the rows per review status and show the result
status_counts = matches_reviewed["status"].value_counts()
print(status_counts)


status
approved    1284
rejected     276
unsure       132
group         20
Name: count, dtype: int64


# Filter matches using the separated files (skip if using the master file)
When updates are made to the separated files, we’ll read those instead of the main file.
Afterward, we’ll split the data again into their respective files based on the latest updates.

In [None]:
import pandas as pd

# Load the four per-status files in a fixed order (approved, rejected, unsure,
# group) and bind each resulting DataFrame to its own name.
approved_df, rejected_df, unsure_df, group_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/subreddit_matches_approved.csv",
        "data/subreddit_matches_rejected.csv",
        "data/subreddit_matches_unsure.csv",
        "data/subreddit_matches_group.csv",
    )
)


In [None]:
# Stack the four per-status DataFrames into a single table; ignore_index=True
# renumbers the rows from 0 instead of keeping each file's own index.
status_frames = [approved_df, rejected_df, unsure_df, group_df]
matches_all = pd.concat(status_frames, ignore_index=True)

# Show how many rows carry each status after combining
combined_counts = matches_all["status"].value_counts()
print(combined_counts)


status
approved    1311
rejected     280
unsure       132
group         20
Name: count, dtype: int64


# Write the mappings to separate files
Each file will contain the matches of the correct status based on the manual screening.

In [None]:
# Re-split the combined table by the current value of the "status" column, so
# each row ends up in the DataFrame matching its latest manual review.
approved_df = matches_all.query("status == 'approved'")
rejected_df = matches_all.query("status == 'rejected'")
unsure_df   = matches_all.query("status == 'unsure'")
group_df    = matches_all.query("status == 'group'")

# Writing is opt-in so that re-running this cell never overwrites the files
# unintentionally. Set to True only when the split files should be refreshed.
SAVE_SPLIT_FILES = False

if SAVE_SPLIT_FILES:
    # A row whose status is none of the four values above (typo, empty cell)
    # matches no filter and would silently vanish from the rewritten files,
    # so refuse to write until such rows are fixed.
    unmatched = matches_all[
        ~matches_all["status"].isin(["approved", "rejected", "unsure", "group"])
    ]
    if not unmatched.empty:
        raise ValueError(
            f"{len(unmatched)} row(s) have an unrecognised status and would be "
            f"dropped: {sorted(unmatched['status'].astype(str).unique())}"
        )

    # Write into data/, the directory the per-status files are read from;
    # index=False keeps the row index out of the CSVs.
    approved_df.to_csv("data/subreddit_matches_approved.csv", index=False)
    rejected_df.to_csv("data/subreddit_matches_rejected.csv", index=False)
    unsure_df.to_csv("data/subreddit_matches_unsure.csv", index=False)
    group_df.to_csv("data/subreddit_matches_group.csv", index=False)