In [6]:
import pandas as pd
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer

# Load both output files as lists of raw lines (readlines() keeps the
# trailing newline on each entry).
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("Assignment2/output/output_rdd.txt", "r", encoding="utf-8") as f:
    rdd_lines = f.readlines()

with open("Assignment1/output/output.txt", "r", encoding="utf-8") as f:
    mr_lines = f.readlines()

# Updated parser for MR lines with correct quote handling
def parse_mr_line_corrected(line):
    """Parse one MR output line into ``(category, terms)``.

    The expected layout is a double-quoted category, one separator
    character, and then a (possibly double-quoted) run of
    whitespace-separated ``term:value`` tokens, for example
    ``"Book"<sep>"plot:1.5 read:0.9"``.

    Args:
        line: A raw line of text; surrounding whitespace is ignored.

    Returns:
        A ``(category, terms)`` tuple, where ``terms`` is the set of term
        names found on the line. ``(None, set())`` is returned for blank
        lines and for lines that do not start with a quoted category.
    """
    line = line.strip()
    # A usable line opens with a quote and has a matching closing quote.
    if not line.startswith('"'):
        return None, set()
    first_quote_end = line.find('"', 1)
    if first_quote_end == -1:
        return None, set()

    category = line[1:first_quote_end]
    # +2 skips the closing quote and the single separator character; any
    # quotes wrapping the term list are then stripped.
    term_string = line[first_quote_end + 2:].strip().strip('"')
    # Split each token on its LAST colon so that a term which itself contains
    # a colon is kept whole; tokens without any colon are ignored.
    terms = {token.rpartition(":")[0] for token in term_string.split() if ":" in token}
    return category, terms

# Apply corrected parsing.
# Lines the parser rejects come back with a None category; they are dropped
# here so that None never becomes a dictionary key (a None key mixed with
# string keys would make the sorted() call below raise TypeError).
mr_parsed = (parse_mr_line_corrected(line) for line in mr_lines if line.strip())
mr_data_corrected = {
    category: terms for category, terms in mr_parsed if category is not None
}

# Re-compare with RDD data, restricted to categories present in both mappings.
# NOTE(review): rdd_data is not defined in this cell — it is assumed to be a
# category -> set-of-terms mapping left in the kernel by another cell; confirm
# this still holds after Restart & Run All.
comparison_fixed = []
all_categories = sorted(set(rdd_data.keys()) & set(mr_data_corrected.keys()))
for category in all_categories:
    rdd_terms = rdd_data[category]
    mr_terms = mr_data_corrected[category]
    intersection = rdd_terms & mr_terms
    union = rdd_terms | mr_terms
    # Jaccard similarity |A ∩ B| / |A ∪ B|; defined as 0 when both sets are empty.
    jaccard = len(intersection) / len(union) if union else 0
    comparison_fixed.append({
        "Category": category,
        "RDD_Term_Count": len(rdd_terms),
        "MR_Term_Count": len(mr_terms),
        "Common_Terms": len(intersection),
        "Jaccard_Similarity": round(jaccard, 3)
    })

# One row per shared category, built in a single DataFrame construction.
df_comparison_fixed = pd.DataFrame(comparison_fixed)

print(df_comparison_fixed)


                      Category  RDD_Term_Count  MR_Term_Count  Common_Terms  \
0             Apps_for_Android              75             75            52   
1                   Automotive              75             75            50   
2                         Baby              75             75            56   
3                       Beauty              75             75            59   
4                         Book              75             75            53   
5                CDs_and_Vinyl              75             75            57   
6   Cell_Phones_and_Accessorie              75             75            57   
7   Clothing_Shoes_and_Jewelry              75             75            64   
8                Digital_Music              75             75            23   
9                   Electronic              75             75            61   
10    Grocery_and_Gourmet_Food              75             75            50   
11    Health_and_Personal_Care              75      

In [2]:
!pip install ace_tools

Collecting ace_tools
  Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace_tools
Successfully installed ace_tools-0.0
