In [None]:
!pip install pandas jsonlines

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive; the dataset paths
# used by the loading cells below all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import json

In [None]:
# prompt: Convert a .json.gz into a pandas dataframe

import pandas as pd
import jsonlines
import gzip # Import the gzip module

def jsonl_to_dataframe(file_path):
  """Load a gzip-compressed JSON Lines file into a pandas DataFrame.

  Args:
    file_path: Path to a gzip-compressed, UTF-8 encoded text file that holds
      one JSON document per line (e.g. ``*.json.gz``).

  Returns:
    A pandas DataFrame built from the parsed records, one row per non-blank
    line, in file order.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
    OSError: If the file cannot be opened or is not valid gzip data.
  """
  # 'rt' makes gzip decompress and decode to text, so iteration yields str lines.
  with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    # Blank / whitespace-only lines (typically a trailing newline) carry no
    # record; skip them instead of letting json.loads fail on an empty string.
    records = [json.loads(line) for line in f if line.strip()]
  return pd.DataFrame(records)

# Load the pair file that the rest of the notebook uses as its training data (`df`).
# The path points into the mounted Google Drive; replace it with your own location.
file_path = "/content/drive/MyDrive/WDC/20pair/wdcproducts20cc80rnd000un_train_medium.json.gz" # Example path, update as needed
# Disabled alternative input (validation split of the 80pair variant):
# file_path_test = "/content/drive/MyDrive/WDC/80pair/wdcproducts80cc20rnd000un_valid_medium.json.gz"
# Requires the cell defining jsonl_to_dataframe to have been run first.
df = jsonl_to_dataframe(file_path)

# Disabled: merging train and test frames into one. `df_train` is not defined
# anywhere in this notebook, so this line cannot simply be re-enabled as written.
# df = pd.concat([df_train, df_test], ignore_index=True)

# Sanity check: row/column counts and the first few records.
print(df.shape)
print(df.head())


In [None]:
# Load the frame that later cells evaluate the model on (`df_test`).
# NOTE(review): this is the *valid* split of the 80pair variant, while `df` above is the
# train split of the 20pair variant — confirm that mixing the two variants is intended.
df_test = jsonl_to_dataframe("/content/drive/MyDrive/WDC/80pair/wdcproducts80cc20rnd000un_valid_medium.json.gz")


In [None]:
# Number of training pairs. The training frame is `df`; `df_train` is never
# defined in this notebook, so referencing it fails on a fresh kernel.
len(df)

In [None]:
import pandas as pd
import numpy as np

import pandas as pd
import numpy as np

# Step 1: Exchange rate mapping (to EUR)
# Each value is the multiplier that normalize_price applies to an amount in that
# currency to express it in EUR (price * rate).
# The rates are hard-coded constants; their source and as-of date are not recorded
# here — TODO(review): confirm and refresh before relying on absolute EUR values.
# Keys are looked up with the values of the priceCurrency_left / priceCurrency_right
# columns. Besides the usual upper-case codes the table contains lower-case 'eur' and
# 'czk' and the non-ISO spellings 'LEI' and 'TL' — presumably because those spellings
# occur in the data; verify against the dataset.
# A NaN rate marks a code that is known but must not be converted.
currency_to_eur = {
    'EUR': 1.0, 'eur': 1.0,
    'GBP': 1.17,
    'USD': 0.92,
    'CAD': 0.68,
    'AUD': 0.61,
    'NOK': 0.087,
    'HUF': 0.0026,
    'PLN': 0.23,
    'RON': 0.20,
    'ZAR': 0.049,
    'SEK': 0.086,
    'DKK': 0.13,
    'PKR': 0.0033,
    'UAH': 0.023,
    'PHP': 0.016,
    'MYR': 0.20,
    'AED': 0.25,
    'BRL': 0.18,
    'CZK': 0.04, 'czk': 0.04,
    'SGD': 0.68,
    'VND': 0.000037,
    'NZD': 0.57,
    'GYD': 0.0044,
    'HRK': 0.13,
    'CHF': 1.03,
    'BGN': 0.51,
    'ILS': 0.25,
    'ISK': 0.0065,
    'INR': 0.011,
    'KWD': 2.99,
    'ARS': 0.0011,
    'COP': 0.00023,
    'HKD': 0.12,
    'LEI': 0.20,  # Lei assumed same as RON
    'CLP': 0.0010,
    'RUB': 0.0098,
    'IDR': 0.000059,
    'GHS': 0.065,
    'OMR': 2.4,
    'MKD': 0.016,
    'TRY': 0.028,
    'MDL': 0.052,
    'TND': 0.30,
    'TZS': 0.00035,
    'MXN': 0.052,
    'MUR': 0.020,
    'ALL': 0.0092,
    'THB': 0.025,
    'EGP': 0.019,
    'KZT': 0.0021,
    'BAM': 0.51,
    'NAD': 0.049,
    'BMD': 0.92,  # Assume USD
    'BDT': 0.0084,
    'JPY': 0.0062,
    'LBP': 0.0000061,
    'TL': 0.028,   # Turkish Lira
    'QAR': 0.25,
    'SAR': 0.24,
    'BHD': 2.44,
    'JOD': 1.30,
    'XOF': 0.0015,
    'NGN': 0.00059,
    'ABC': np.nan,  # ABC not a real currency
}

# Step 2: Helper to clean price value
def clean_price(price):
    """Coerce a raw price value to ``float``, mapping unusable values to NaN.

    Args:
        price: The raw value from a price column (number, numeric string,
            ``None``/NaN, or anything else found in the data).

    Returns:
        ``float(price)`` when the value converts cleanly, otherwise ``np.nan``
        (missing values, non-numeric strings, containers, ...).
    """
    try:
        if pd.isna(price):
            return np.nan
        return float(price)
    # Only conversion failures mean "not a price". A bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
    # ValueError additionally covers the ambiguous truth value that
    # `pd.isna` produces for list-like inputs.
    except (TypeError, ValueError, OverflowError):
        return np.nan

# Step 3: Helper to normalize price to EUR
def normalize_price(price, currency):
    """Convert ``price`` from ``currency`` to EUR using ``currency_to_eur``.

    Args:
        price: Numeric price (already cleaned), or a missing value.
        currency: Currency code for ``price``. Surrounding whitespace and
            letter case are ignored; non-string codes are compared by their
            string form.

    Returns:
        ``price * rate`` when a usable rate exists, otherwise ``np.nan``
        (missing price or currency, unknown code, or a NaN rate in the table).
    """
    if pd.isna(price) or pd.isna(currency):
        return np.nan
    # str() keeps non-string codes (e.g. numbers) from raising on .strip().
    code = str(currency).strip()
    # Exact spelling first so every existing table key keeps its meaning, then
    # fall back to upper case so 'usd' / 'Gbp' resolve like 'USD' / 'GBP'.
    rate = currency_to_eur.get(code)
    if rate is None:
        rate = currency_to_eur.get(code.upper())
    if rate is None or pd.isna(rate):
        return np.nan  # Unknown currency, or one explicitly marked unusable
    return price * rate

# Step 4: Apply cleaning
# Overwrite both raw price columns with their float form; clean_price yields NaN
# for values it cannot convert.
df['price_left'] = df['price_left'].apply(clean_price)
df['price_right'] = df['price_right'].apply(clean_price)

# Convert each side to EUR row by row, using that side's own currency column.
# normalize_price yields NaN when the price or currency is missing or no usable rate exists.
df['price_left_norm'] = df.apply(lambda row: normalize_price(row['price_left'], row['priceCurrency_left']), axis=1)
df['price_right_norm'] = df.apply(lambda row: normalize_price(row['price_right'], row['priceCurrency_right']), axis=1)

# Step 5: Handle missing prices smartly
def price_difference(row):
    """Relative gap between the two EUR-normalized prices of one pair.

    Args:
        row: Mapping / Series providing ``price_left_norm`` and
            ``price_right_norm``.

    Returns:
        ``|left - right| / (max(left, right) + 1e-5)`` when both prices are
        present and strictly positive; ``1.0`` otherwise.
    """
    left_eur, right_eur = row['price_left_norm'], row['price_right_norm']

    both_usable = (
        pd.notna(left_eur) and pd.notna(right_eur)
        and left_eur > 0 and right_eur > 0
    )
    if not both_usable:
        # A missing or non-positive price on either side is scored as the
        # maximum possible difference.
        return 1.0

    larger = max(left_eur, right_eur)
    return abs(left_eur - right_eur) / (larger + 1e-5)

# Per-pair relative price gap: 0.0 for equal prices, 1.0 when either normalized
# price is missing or not positive (see price_difference).
df['price_diff_norm'] = df.apply(price_difference, axis=1)



In [None]:
# Inspect the training frame, now including the cleaned and EUR-normalized price columns.
df

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# 1. Data: uses `df`, which was loaded and price-normalized in the cells above

# 2. Preprocess: Combine title and description for left and right products
def combine_text(row, side):
    """Join one product's title and description into a single string.

    Args:
        row: Mapping / Series with ``title_<side>`` and ``description_<side>``.
        side: Which product of the pair to read, e.g. ``'left'`` or ``'right'``.

    Returns:
        ``"<title> <description>"``. A missing field contributes an empty
        string, but the separating space is always present.
    """
    pieces = []
    for field in ('title', 'description'):
        value = row[f'{field}_{side}']
        pieces.append(str(value) if pd.notna(value) else '')
    return " ".join(pieces)

# One text field per product: title and description joined by a space.
df['text_left'] = df.apply(lambda row: combine_text(row, 'left'), axis=1)
df['text_right'] = df.apply(lambda row: combine_text(row, 'right'), axis=1)

# 3. Build TF-IDF features
# A single vectorizer is fitted on the left AND right texts so both sides share
# one vocabulary and one set of IDF weights.
tfidf = TfidfVectorizer(stop_words='english')

all_text = pd.concat([df['text_left'], df['text_right']])
tfidf.fit(all_text)

tfidf_left = tfidf.transform(df['text_left'])
tfidf_right = tfidf.transform(df['text_right'])

# 4. Compute Cosine Similarity
# TfidfVectorizer L2-normalizes every row by default (norm='l2'), so the cosine
# similarity of pair i is simply the dot product of row i on each side. One
# sparse element-wise product + row sum replaces a cosine_similarity() call per
# row; all-zero rows (no known terms) still score 0. Kept as a plain list of
# floats, one per row of df, exactly as before.
cos_sim = np.asarray(tfidf_left.multiply(tfidf_right).sum(axis=1)).ravel().tolist()



In [None]:
# 6. Brand Match Feature
def _brand_key(value):
    """Normalize a brand for comparison; missing values become ''."""
    # is_scalar guards pd.isna against list-like values, for which it would
    # return an array instead of a single bool.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ''
    return str(value).lower().strip()


def brand_match(row):
    """Return 1 if both products name the same brand, else 0.

    Brands are compared case-insensitively with surrounding whitespace removed.
    A missing or empty brand on either side is never a match: comparing
    ``str(None)`` with ``str(None)`` (or 'nan' with 'nan') would otherwise
    report two unknown brands as identical.

    Args:
        row: Mapping / Series providing ``brand_left`` and ``brand_right``.

    Returns:
        ``1`` or ``0`` as an ``int``.
    """
    left_key = _brand_key(row['brand_left'])
    right_key = _brand_key(row['brand_right'])
    return int(bool(left_key) and left_key == right_key)

# Add the 0/1 brand-agreement column computed by brand_match.
df['brand_match'] = df.apply(brand_match, axis=1)

# 7. Assemble final feature matrix
# `cos_sim` is the per-row list produced by the TF-IDF cell above; the two Series
# supply the index, so the list must have exactly one entry per row of df.
features = pd.DataFrame({
    'cosine_similarity': cos_sim,
    'price_diff_norm': df['price_diff_norm'],
    'brand_match': df['brand_match'],
})

# Drop rows with missing values (optional)
features = features.dropna()
# Keep labels aligned with whatever rows survived the dropna above.
labels = df.loc[features.index, 'label']

# 8. Train/Test Split
# 64% / 36% split of `df` itself; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.36, random_state=42)

# 9. Train a simple Logistic Regression model
# Default hyper-parameters; `model` is rebound later by the cell that trains on all of df.
model = LogisticRegression()
model.fit(X_train, y_train)

# 10. Evaluate
# Per-class precision / recall / F1 on the held-out 36% of df.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
import pandas as pd
import numpy as np

import pandas as pd
import numpy as np

# Step 1: Exchange rate mapping (to EUR)
# NOTE(review): this appears to repeat, entry for entry, the mapping already defined in
# the earlier price-normalization cell; running this cell rebinds the name. Keep the two
# copies in sync or drop one.
# Each value is the multiplier that normalize_price applies to an amount in that
# currency to express it in EUR (price * rate).
# The rates are hard-coded constants; their source and as-of date are not recorded
# here — TODO(review): confirm and refresh before relying on absolute EUR values.
# A NaN rate marks a code that is known but must not be converted.
currency_to_eur = {
    'EUR': 1.0, 'eur': 1.0,
    'GBP': 1.17,
    'USD': 0.92,
    'CAD': 0.68,
    'AUD': 0.61,
    'NOK': 0.087,
    'HUF': 0.0026,
    'PLN': 0.23,
    'RON': 0.20,
    'ZAR': 0.049,
    'SEK': 0.086,
    'DKK': 0.13,
    'PKR': 0.0033,
    'UAH': 0.023,
    'PHP': 0.016,
    'MYR': 0.20,
    'AED': 0.25,
    'BRL': 0.18,
    'CZK': 0.04, 'czk': 0.04,
    'SGD': 0.68,
    'VND': 0.000037,
    'NZD': 0.57,
    'GYD': 0.0044,
    'HRK': 0.13,
    'CHF': 1.03,
    'BGN': 0.51,
    'ILS': 0.25,
    'ISK': 0.0065,
    'INR': 0.011,
    'KWD': 2.99,
    'ARS': 0.0011,
    'COP': 0.00023,
    'HKD': 0.12,
    'LEI': 0.20,  # Lei assumed same as RON
    'CLP': 0.0010,
    'RUB': 0.0098,
    'IDR': 0.000059,
    'GHS': 0.065,
    'OMR': 2.4,
    'MKD': 0.016,
    'TRY': 0.028,
    'MDL': 0.052,
    'TND': 0.30,
    'TZS': 0.00035,
    'MXN': 0.052,
    'MUR': 0.020,
    'ALL': 0.0092,
    'THB': 0.025,
    'EGP': 0.019,
    'KZT': 0.0021,
    'BAM': 0.51,
    'NAD': 0.049,
    'BMD': 0.92,  # Assume USD
    'BDT': 0.0084,
    'JPY': 0.0062,
    'LBP': 0.0000061,
    'TL': 0.028,   # Turkish Lira
    'QAR': 0.25,
    'SAR': 0.24,
    'BHD': 2.44,
    'JOD': 1.30,
    'XOF': 0.0015,
    'NGN': 0.00059,
    'ABC': np.nan,  # ABC not a real currency
}

# Step 2: Helper to clean price value
def clean_price(price):
    """Coerce a raw price value to ``float``, mapping unusable values to NaN.

    Note: the notebook also defines this function in an earlier cell; once this
    cell has run, this later definition is the one in effect.

    Args:
        price: The raw value from a price column (number, numeric string,
            ``None``/NaN, or anything else found in the data).

    Returns:
        ``float(price)`` when the value converts cleanly, otherwise ``np.nan``
        (missing values, non-numeric strings, containers, ...).
    """
    try:
        if pd.isna(price):
            return np.nan
        return float(price)
    # Only conversion failures mean "not a price". A bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
    # ValueError additionally covers the ambiguous truth value that
    # `pd.isna` produces for list-like inputs.
    except (TypeError, ValueError, OverflowError):
        return np.nan

# Step 3: Helper to normalize price to EUR
def normalize_price(price, currency):
    """Convert ``price`` from ``currency`` to EUR using ``currency_to_eur``.

    Note: the notebook also defines this function in an earlier cell; once this
    cell has run, this later definition is the one in effect.

    Args:
        price: Numeric price (already cleaned), or a missing value.
        currency: Currency code for ``price``. Surrounding whitespace and
            letter case are ignored; non-string codes are compared by their
            string form.

    Returns:
        ``price * rate`` when a usable rate exists, otherwise ``np.nan``
        (missing price or currency, unknown code, or a NaN rate in the table).
    """
    if pd.isna(price) or pd.isna(currency):
        return np.nan
    # str() keeps non-string codes (e.g. numbers) from raising on .strip().
    code = str(currency).strip()
    # Exact spelling first so every existing table key keeps its meaning, then
    # fall back to upper case so 'usd' / 'Gbp' resolve like 'USD' / 'GBP'.
    rate = currency_to_eur.get(code)
    if rate is None:
        rate = currency_to_eur.get(code.upper())
    if rate is None or pd.isna(rate):
        return np.nan  # Unknown currency, or one explicitly marked unusable
    return price * rate

# Step 4: Apply cleaning
# Same price preparation as for `df`, applied to the evaluation frame `df_test`:
# overwrite both raw price columns with their float form (NaN when not convertible) ...
df_test['price_left'] = df_test['price_left'].apply(clean_price)
df_test['price_right'] = df_test['price_right'].apply(clean_price)

# ... then convert each side to EUR row by row using that side's own currency column.
df_test['price_left_norm'] = df_test.apply(lambda row: normalize_price(row['price_left'], row['priceCurrency_left']), axis=1)
df_test['price_right_norm'] = df_test.apply(lambda row: normalize_price(row['price_right'], row['priceCurrency_right']), axis=1)

# Step 5: Handle missing prices smartly
def price_difference(row):
    """Relative gap between the two EUR-normalized prices of one pair.

    Note: the notebook also defines this function in an earlier cell; once this
    cell has run, this later definition is the one in effect.

    Args:
        row: Mapping / Series providing ``price_left_norm`` and
            ``price_right_norm``.

    Returns:
        ``|left - right| / (max(left, right) + 1e-5)`` when both prices are
        present and strictly positive; ``1.0`` otherwise.
    """
    left_eur, right_eur = row['price_left_norm'], row['price_right_norm']

    both_usable = (
        pd.notna(left_eur) and pd.notna(right_eur)
        and left_eur > 0 and right_eur > 0
    )
    if not both_usable:
        # A missing or non-positive price on either side is scored as the
        # maximum possible difference.
        return 1.0

    larger = max(left_eur, right_eur)
    return abs(left_eur - right_eur) / (larger + 1e-5)

# Per-pair relative price gap for the evaluation frame: 0.0 for equal prices, 1.0 when
# either normalized price is missing or not positive (see price_difference).
df_test['price_diff_norm'] = df_test.apply(price_difference, axis=1)

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# 1. Data: uses `df_test`, which was loaded in an earlier cell and price-normalized above

# 2. Preprocess: Combine title and description for left and right products
def combine_text(row, side):
    """Join one product's title and description into a single string.

    Note: the notebook also defines this function in an earlier cell; once this
    cell has run, this later definition is the one in effect.

    Args:
        row: Mapping / Series with ``title_<side>`` and ``description_<side>``.
        side: Which product of the pair to read, e.g. ``'left'`` or ``'right'``.

    Returns:
        ``"<title> <description>"``. A missing field contributes an empty
        string, but the separating space is always present.
    """
    pieces = []
    for field in ('title', 'description'):
        value = row[f'{field}_{side}']
        pieces.append(str(value) if pd.notna(value) else '')
    return " ".join(pieces)

# One text field per product: title and description joined by a space.
df_test['text_left'] = df_test.apply(lambda row: combine_text(row, 'left'), axis=1)
df_test['text_right'] = df_test.apply(lambda row: combine_text(row, 'right'), axis=1)

# 3. Build TF-IDF features
# Reuse the vectorizer that the training TF-IDF cell already fitted on `df`
# (the `tfidf` object) and only *transform* the evaluation text. Refitting here
# would give df_test its own vocabulary and IDF weights, so the
# 'cosine_similarity' feature the model is evaluated on would not be computed
# the way it was for training, and it would clobber the train-fitted `tfidf`.
# Terms unseen during training are simply ignored by transform().
tfidf_left = tfidf.transform(df_test['text_left'])
tfidf_right = tfidf.transform(df_test['text_right'])

# 4. Compute Cosine Similarity
# The vectorizer L2-normalizes every row (TfidfVectorizer default norm='l2'),
# so the cosine similarity of pair i is the dot product of row i on each side.
# One sparse element-wise product + row sum replaces a cosine_similarity() call
# per row; the result stays a plain list of floats, one per row of df_test.
cos_sim_test = np.asarray(tfidf_left.multiply(tfidf_right).sum(axis=1)).ravel().tolist()

# Step 1: Process train data
# brand_match is defined in the earlier brand-feature cell; `cos_sim` is the per-row
# list computed for `df` in the earlier TF-IDF cell. Both cells must have been run.
df['brand_match'] = df.apply(brand_match, axis=1)

features_train = pd.DataFrame({
    'cosine_similarity': cos_sim,
    'price_diff_norm': df['price_diff_norm'],
    'brand_match': df['brand_match'],
})
# Drop incomplete rows, then pick the labels for exactly the rows that remain.
features_train = features_train.dropna()
labels_train = df.loc[features_train.index, 'label']

# Step 2: Process test data
df_test['brand_match'] = df_test.apply(brand_match, axis=1)

features_test = pd.DataFrame({
    'cosine_similarity': cos_sim_test,  # per-row list computed for df_test earlier in this cell
    'price_diff_norm': df_test['price_diff_norm'],
    'brand_match': df_test['brand_match'],
})
# Same row filtering / label alignment as for the training features.
features_test = features_test.dropna()
labels_test = df_test.loc[features_test.index, 'label']

# Step 3: Train on df
# Default-parameter logistic regression fitted on every usable row of `df`;
# this rebinds `model` from the earlier single-frame experiment.
model = LogisticRegression()
model.fit(features_train, labels_train)

# Step 4: Evaluate on df_test
# Per-class precision / recall / F1 on the separate evaluation frame.
y_pred = model.predict(features_test)
print(classification_report(labels_test, y_pred))




In [None]:
# Step 4: Evaluate
y_pred = model.predict(features_test)

# False negatives: pairs labelled as a match (1) that the model predicted as a
# non-match (0). `y_pred` is positional, one entry per row of features_test, so
# the resulting boolean Series carries labels_test's index.
false_negatives_idx = (labels_test == 1) & (y_pred == 0)

# Select the offending rows from df_test by the index labels of the mask itself.
# Applying the mask to a reset_index() copy only works when dropna() removed no
# rows and df_test already has a 0..n-1 index; otherwise pandas cannot align the
# mask with the frame (or aligns it with the wrong rows).
false_negative_rows = false_negatives_idx[false_negatives_idx].index
misclassified = df_test.loc[false_negative_rows].head(5)

# Show 5 examples
print(misclassified[['title_left', 'title_right', 'brand_left', 'brand_right', 'price_left', 'price_right', 'label']])

# Persist the same examples (all columns) next to the notebook for later inspection.
misclassified.to_csv('misclassified_examples.csv', index=False)
