# Overview
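1. Read the CVSS CSV file and keep only the `CVE` and `Description` columns.
2. Clean the descriptions, drop exact duplicates, and sort the rows by cleaned description.
3. Remove near-duplicate descriptions with windowed fuzzy matching and save the results.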


In [1]:
import pandas as pd
from rapidfuzz import fuzz, process
from tqdm import tqdm
import re
import csv
import unicodedata
from datetime import datetime
import json
from collections import Counter
import gzip

In [2]:
# Input and output file paths (all relative to the notebook's working directory,
# all gzip-compressed).

# Source CSV; it is loaded with pd.read_csv and must contain 'CVE' and 'Description' columns.
input_file = './CVSSData.csv.gz'
# Destination CSV for the rows that survive fuzzy deduplication.
output_file = './cleaned_optimized_fuzzy_deduplicated_file.csv.gz'
# Destination CSV for the rows that were dropped as near-duplicates.
removed_file = './removed_duplicates.csv.gz'
# Destination JSON report written by save_duplicate_info (group sizes and members).
output_json_file = './duplicate_info.json.gz'


In [3]:
# Set to an integer (e.g. 50_000) to load only the first rows for a quick test run;
# None loads the whole file.
SAMPLE_ROWS = None

# Read the CSV file, parsing only the two columns the notebook actually uses
# instead of loading every column and discarding the rest afterwards.
df = pd.read_csv(
    input_file,
    usecols=['CVE', 'Description'],
    nrows=SAMPLE_ROWS,
    quoting=csv.QUOTE_ALL,
    escapechar='\\',
    compression='gzip',
)
# usecols does not control column order, so pin the order explicitly.
df = df[['CVE', 'Description']]
df

Unnamed: 0,CVE,Description
0,CVE-1999-0095,"The debug command in Sendmail is enabled, allo..."
1,CVE-1999-0082,CWD ~root command in ftpd allows root access.
2,CVE-1999-1471,Buffer overflow in passwd in BSD based operati...
3,CVE-1999-1122,Vulnerability in restore in SunOS 4.0.3 and ea...
4,CVE-1999-1467,Vulnerability in rcp on SunOS 4.0.x allows rem...
...,...,...
248446,CVE-2024-8039,Improper permission configurationDomain config...
248447,CVE-2024-8724,The Waitlist Woocommerce ( Back in stock notif...
248448,CVE-2024-8479,The The Simple Spoiler plugin for WordPress is...
248449,CVE-2024-8246,The Post Form – Registration Form – Profile Fo...


In [4]:
def clean_description(text):
    """Return a normalized, comparison-friendly version of a description.

    The result is used as the key for both exact and fuzzy duplicate detection.

    Steps:
      1. Non-string input (e.g. NaN for a missing description) becomes ''.
      2. Unicode is decomposed (NFKD) and every non-ASCII character is dropped,
         so accented letters collapse to their ASCII base letter.
      3. Text is lowercased.
      4. Every run of whitespace (spaces, tabs, newlines, carriage returns)
         becomes a single space, and leading/trailing whitespace is removed.

    Args:
        text: The raw description; any type is accepted.

    Returns:
        str: The cleaned description, or '' when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    # Normalize unicode characters and discard whatever has no ASCII form
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')

    # Convert to lowercase
    text = text.lower()

    # Collapse all whitespace in one pass. A lone '\r' must become a space rather
    # than be deleted, otherwise the words on either side of it are glued together.
    # Stripping the ends keeps otherwise identical descriptions from being treated
    # as different because of stray padding.
    return re.sub(r'\s+', ' ', text).strip()


def mark_and_count_dupes(txt_col, threshold=80, window=1000):
    """Flag near-duplicate texts using a forward-looking sliding window.

    Every row that is still kept acts as an anchor and is compared, via
    ``fuzz.ratio``, with the rows that follow it (at most ``window`` of them).
    Any still-kept follower whose comparison result is truthy is flagged for
    removal and attached to the anchor's group. Rows already flagged are never
    used as anchors nor compared again.

    Args:
        txt_col: Object exposing ``to_list()`` (e.g. a pandas Series) that
            yields the texts to compare, in the order they should be scanned.
        threshold: Value forwarded to ``fuzz.ratio`` as ``score_cutoff``.
        window: Maximum number of following rows compared with each anchor.

    Returns:
        tuple: ``(keep_flags, removed_total, groups)`` where ``keep_flags`` is a
        list of booleans (True = keep, False = near-duplicate to drop),
        ``removed_total`` is the number of rows flagged for removal, and
        ``groups`` is a list of index lists, each starting with the kept anchor
        followed by the rows flagged as its duplicates.
    """
    texts = txt_col.to_list()
    total = len(texts)
    keep_flags = [True] * total
    removed_total = 0
    groups = []

    for anchor in tqdm(range(total), desc="Checking for duplicates"):
        # A row already flagged as a duplicate never anchors a group of its own.
        if not keep_flags[anchor]:
            continue

        anchor_text = texts[anchor]
        # Look only forward, and no further than `window` rows past the anchor.
        upper = min(total, anchor + window + 1)
        members = [anchor]

        for candidate in range(anchor + 1, upper):
            if not keep_flags[candidate]:
                continue
            # With score_cutoff set, the result is falsy (0) for pairs scoring
            # below the threshold, so truthiness doubles as the match test.
            if fuzz.ratio(anchor_text, texts[candidate], score_cutoff=threshold):
                keep_flags[candidate] = False
                members.append(candidate)

        found = len(members) - 1
        if found:
            removed_total += found
            groups.append(members)

    return keep_flags, removed_total, groups


def save_duplicate_info(df, duplicate_groups, output_file):
    """Write a gzip-compressed JSON report describing the duplicate groups.

    The report contains the total row count, the number of removed duplicates
    (group size minus one, summed over groups), the number of groups, a
    histogram of group sizes (largest size first) and, for every group
    (largest group first), the positional index, CVE and first 200 characters
    of the description of each member.

    Args:
        df: DataFrame with 'CVE' and 'Description' columns; group members are
            interpreted as positional row numbers into this frame.
        duplicate_groups: List of index lists, one per duplicate group. The
            list is not modified.
        output_file: Path of the gzip-compressed JSON file to write.

    Returns:
        None
    """
    # sorted() builds a new list, so the caller's list keeps its original order.
    ordered_groups = sorted(duplicate_groups, key=len, reverse=True)

    # Pull the two columns out once; building a row Series through df.iloc for
    # every member is far slower and yields the same values.
    cves = df['CVE'].tolist()
    descriptions = df['Description'].tolist()

    def _preview(value):
        # Missing descriptions arrive as NaN (a float) and cannot be sliced.
        return value[:200] if isinstance(value, str) else ''

    duplicate_info = [
        {
            "group_size": len(group),
            "items": [
                {
                    # int() keeps numpy integer indices JSON-serializable
                    "index": int(idx),
                    "cve": cves[idx],
                    "description": _preview(descriptions[idx])  # First 200 characters
                } for idx in group
            ]
        }
        for group in ordered_groups
    ]

    # Histogram of group sizes, largest size first
    size_counter = Counter(len(group) for group in ordered_groups)
    sorted_sizes = sorted(size_counter.items(), key=lambda x: x[0], reverse=True)

    result = {
        "total_rows": len(df),
        "total_duplicates": sum(len(group) - 1 for group in ordered_groups),
        "duplicate_groups": len(ordered_groups),
        "group_size_distribution": [{"size": size, "count": count} for size, count in sorted_sizes],
        "groups": duplicate_info
    }

    with gzip.open(output_file, 'wt', encoding='utf-8') as f:
        json.dump(result, f, indent=2)

In [5]:
# Add the normalized comparison key and drop rows whose key repeats exactly,
# keeping the first occurrence of each key.
df = (
    df.assign(Clean_Description=df['Description'].apply(clean_description))
      .drop_duplicates(subset='Clean_Description', keep='first')
)
print(f"Shape after removing exact duplicates: {df.shape}")

# Order rows by the cleaned text so similar descriptions end up close together,
# then renumber them 0..n-1 so that row labels equal the list positions used
# by mark_and_count_dupes.
df = df.sort_values('Clean_Description').reset_index(drop=True)



Shape after removing exact duplicates: (237863, 3)


In [6]:
# Run the windowed fuzzy comparison over the sorted, cleaned descriptions.
print("Starting deduplication process...")
chk, dup_count, dup_groups = mark_and_count_dupes(
    df['Clean_Description'],
    threshold=80,
    window=1000,
)

# `chk` carries one keep-flag per row of df; keep only the rows flagged True.
dfx = df.loc[chk]

# Summary of the run
print("Deduplication complete.")
print(f"Original row count: {len(df)}")
print(f"Rows remaining after deduplication: {len(dfx)}")
print(f"Number of duplicates found: {dup_count}")
print(f"Number of duplicate groups: {len(dup_groups)}")

Starting deduplication process...


Checking for duplicates: 100%|██████████| 237863/237863 [04:00<00:00, 988.57it/s] 

Deduplication complete.
Original row count: 237863
Rows remaining after deduplication: 157158
Number of duplicates found: 80705
Number of duplicate groups: 21429





In [7]:
# Rows whose keep-flag is False are the near-duplicates that were dropped.
duplicates = df.loc[~pd.Series(chk)]
print(f"Number of duplicates removed: {len(duplicates)}")

# Persist the dropped rows so they can be inspected later.
duplicates.to_csv(
    removed_file,
    quoting=csv.QUOTE_ALL,
    escapechar='\\',
    compression='gzip',
)

# Persist the deduplicated rows with the same CSV settings.
dfx.to_csv(
    output_file,
    quoting=csv.QUOTE_ALL,
    escapechar='\\',
    compression='gzip',
)

# Write the JSON report describing every duplicate group.
save_duplicate_info(df, dup_groups, output_json_file)
print(f"Duplicate information saved to {output_json_file}")


Number of duplicates removed: 80705
Duplicate information saved to ./duplicate_info.json.gz
