# ign data set

In [26]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rapidfuzz import fuzz

# Load CSV
df = pd.read_csv("ign.csv")
df.drop(columns="Unnamed: 0")
# --- Step 1: Remove Exact Duplicates ---
df_step1 = df.drop_duplicates()
print("Step 1 - Exact Duplicates Removed:", len(df) - len(df_step1))

# --- Step 2: Remove same title + platform duplicates ---
df_step2 = df_step1.drop_duplicates(subset=["title", "platform"])
print("Step 2 - Title + Platform Duplicates Removed:", len(df_step1) - len(df_step2))

# --- Helper: Detect sequels (Arabic or Roman numerals) ---
def is_sequel(title):
    if re.search(r'\b\d+\b', title):  # Arabic numerals
        return True
    roman_pattern = r'\b(?=[MDCLXVI]+\b)(M{0,4}(CM|CD|D?C{0,3})?(XC|XL|L?X{0,3})?(IX|IV|V?I{0,3})?)\b'
    return bool(re.search(roman_pattern, title, re.IGNORECASE))

# --- Step 3: Remove fuzzy duplicates using cosine similarity (same platform, ignore sequels) ---
to_remove_cosine = set()
threshold = 0.9

for platform, group in df_step2.groupby('platform'):
    non_sequel_rows = group[~group["title"].apply(is_sequel)]

    if len(non_sequel_rows) < 2:
        continue

    titles = non_sequel_rows["title"].tolist()
    tfidf = TfidfVectorizer().fit_transform(titles)
    similarity = cosine_similarity(tfidf)

    for i in range(len(titles)):
        for j in range(i + 1, len(titles)):
            if similarity[i, j] > threshold:
                to_remove_cosine.add((platform, titles[j]))

# Filter out fuzzy cosine duplicates
mask = df_step2.apply(lambda row: (row['platform'], row['title']) not in to_remove_cosine, axis=1)
df_step3 = df_step2[mask]
removed_fuzzy_cosine = df_step2[~mask]
print("Step 3 - Cosine Similarity (Fuzzy) Duplicates Removed:", len(removed_fuzzy_cosine))

# --- Step 4A: Separate sequels ---
df_sequels = df_step3[df_step3['title'].apply(is_sequel)]
df_originals = df_step3[~df_step3['title'].apply(is_sequel)]
df_sequels.to_csv("sequels_only.csv", index=False)
print("Step 4A - Sequels Separated:", len(df_sequels))

# --- Step 4B: RapidFuzz on original (non-sequel) titles ---
titles = df_originals['title'].unique().tolist()
duplicates_rapidfuzz = set()

for i, title1 in enumerate(titles):
    for title2 in titles[i + 1:]:
        if fuzz.ratio(title1, title2) > 90:
            duplicates_rapidfuzz.add(title2)

df_cleaned_originals = df_originals[~df_originals['title'].isin(duplicates_rapidfuzz)]
removed_rapidfuzz = df_originals[df_originals['title'].isin(duplicates_rapidfuzz)]
print("Step 4B - RapidFuzz Misspelled Duplicates Removed:", len(removed_rapidfuzz))

# --- Final Combine ---
df_final = pd.concat([df_cleaned_originals, df_sequels], ignore_index=True)

# --- Save Outputs ---
removed_fuzzy_cosine.to_csv("removed_fuzzy_cosine.csv", index=False)
removed_rapidfuzz.to_csv("removed_rapidfuzz_titles.csv", index=False)
df_cleaned_originals.to_csv("cleaned_originals.csv", index=False)
df_final.to_csv("final_cleaned_dataset.csv", index=False)
print("Final cleaned dataset saved.")


Step 1 - Exact Duplicates Removed: 0
Step 2 - Title + Platform Duplicates Removed: 48
Step 3 - Cosine Similarity (Fuzzy) Duplicates Removed: 22
Step 4A - Sequels Separated: 5349
Step 4B - RapidFuzz Misspelled Duplicates Removed: 293
Final cleaned dataset saved.


In [25]:
df_final

Unnamed: 0.1,Unnamed: 0,score_phrase,title,url,platform,score,genre,editors_choice,release_year,release_month,release_day
0,0,Amazing,LittleBigPlanet PS Vita,/games/littlebigplanet-vita/vita-98907,PlayStation Vita,9.0,Platformer,Y,2012,9,12
1,1,Amazing,LittleBigPlanet PS Vita -- Marvel Super Hero E...,/games/littlebigplanet-ps-vita-marvel-super-he...,PlayStation Vita,9.0,Platformer,Y,2012,9,12
2,2,Great,Splice: Tree of Life,/games/splice/ipad-141070,iPad,8.5,Puzzle,N,2012,9,12
3,5,Good,Total War Battles: Shogun,/games/total-war-battles-shogun/mac-142565,Macintosh,7.0,Strategy,N,2012,9,11
4,6,Awful,Double Dragon: Neon,/games/double-dragon-neon/xbox-360-131320,Xbox 360,3.0,Fighting,N,2012,9,11
...,...,...,...,...,...,...,...,...,...,...,...
18257,18602,Good,Hitman: Episode 4,/games/hitman-episode-4/xbox-one-20051639,Xbox One,7.4,Shooter,N,2016,8,19
18258,18604,Great,Madden NFL 17,/games/madden-nfl-2017/ps4-20052738,PlayStation 4,8.6,Sports,N,2016,8,17
18259,18606,Okay,Starcraft II: Nova Covert Ops -- Mission Pack 2,/games/starcraft-ii-nova-covert-ops-mission-pa...,PC,6.4,Strategy,N,2016,8,4
18260,18613,Great,XCOM 2: Shen's Last Gift,/games/xcom-2-shens-last-gift/pc-20055520,PC,8.0,Strategy,N,2016,7,1
