# Merging Phone Arena Data with IDC Data (IVs)
This notebook loads preprocessed IDC and Phone Arena data, applies name cleaning/matching rules,
performs replacements for known alias models (e.g., GOOGLENEXUS → LGNEXUS), and merges datasets for BLP modeling.

Output: `Phone Arena data IDC(merged).csv`



In [None]:
import pandas as pd
import numpy as np

# Input CSV locations (under /content/drive/MyDrive).
idc_csv_path = '/content/drive/MyDrive/OUTPUT DATA(FROM IDC)/Formatted data for BLP_1.csv'
phonearena_csv_path = '/content/drive/MyDrive/OUTPUT DATA(FROM IDC)/COMPLETE DATA FILEs/phone data from phonearena.csv'

# Read the formatted IDC table and the PhoneArena feature table.
df_idc = pd.read_csv(idc_csv_path)
df_phone = pd.read_csv(phonearena_csv_path)

In [None]:
# Drop PhoneArena columns that are not needed downstream (example subset).
# Each label is listed once; the original list repeated 'Tablet dock:'.
dropcols = [
    'Unnamed: 123', 'Tablet dock:', 'Broadcast Mobile TV:', 'TDMA:', '3D capture:',
    'Carrier locked:', 'ROM:', 'Headphones:', 'Radio:', 'Materials:', 'Folded:', 'Watch',
]
# errors='ignore' already skips labels that are absent from the frame, so the
# list does not need to be pre-filtered against df_phone.columns.
df_phone_cleaned = df_phone.drop(columns=dropcols, errors='ignore')

In [None]:
# Normalise model names on both sides so they can be used as a join key, then
# remove smartwatch rows from the PhoneArena table.
# NOTE: only names containing 'WATCH' are filtered here; no tablet filter is applied.
# NOTE: this cell is not idempotent - it writes Brand + Model Name back into
# df_idc['Model Name'], so re-running it prepends the brand a second time.
# Re-run from the load cell instead.

# IDC side: unify the LG brand label, then build the key as BRAND + MODEL.
df_idc['Brand'] = df_idc['Brand'].replace({'LG Electronics': 'LG'})
df_idc['Model Name'] = (
    (df_idc['Brand'] + df_idc['Model Name'])
    .str.upper()
    # regex=False: literal replacement regardless of the pandas version's default.
    .str.replace(' ', '', regex=False)
    .str.replace("'", '', regex=False)
)

# PhoneArena side: same normalisation (upper-case, no spaces, no apostrophes).
df_phone_cleaned['Model Name'] = (
    df_phone_cleaned['Model Name']
    .str.upper()
    .str.replace(' ', '', regex=False)
    .str.replace("'", '', regex=False)
)

# Rows with a missing name are kept (na=False makes the WATCH test False for them).
is_watch = df_phone_cleaned['Model Name'].str.contains('WATCH', na=False)
# .copy() detaches the filtered frame from its parent so that the later
# df_phone_cleaned['Model Name'] = ... assignments do not raise SettingWithCopyWarning.
df_phone_cleaned = df_phone_cleaned[~is_watch].copy()

In [None]:
# Known aliases: PhoneArena model name -> name that matches the naming convention.
alias_dict = {
    'GOOGLENEXUS4': 'LGNEXUS4',
    'GOOGLENEXUS5': 'LGNEXUS5',
    'GOOGLENEXUSX': 'LGNEXUSX',
    'SAMSUNGGALAXYNEXUS': 'SAMSUNGNEXUS',
    'GOOGLENEXUS6': 'MOTOROLANEXUS6',
    'GOOGLENEXUS6P': 'HUAWEINEXUS6P',
    'NOKIA6(2018)': 'NOKIA6.1',
    'BLACKBERRYCURVE3G': 'BLACKBERRYCURVE9300',
    'HTCSNAPCDMA': 'HTCSNAPS511',
    'SAMSUNGBEHOLDII': 'SAMSUNGBEHOLD2',
}

# Whole-value replacement: only names exactly equal to a key are rewritten.
phonearena_names = df_phone_cleaned['Model Name']
df_phone_cleaned['Model Name'] = phonearena_names.replace(alias_dict)

In [None]:
# Load the fuzzy-match table (already manually verified).
fuzzy_match = pd.read_csv('/content/drive/MyDrive/OUTPUT DATA(FROM IDC)/Fuzzy MATCH/fuzzy match (manually checked).csv')
# Discard rows with any missing value, then the leftover index column if present.
fuzzy_match = fuzzy_match.dropna().drop(columns=['Unnamed: 0'], errors='ignore')

# Normalise both name columns exactly like the 'Model Name' columns were
# normalised earlier (upper-case, spaces removed, apostrophes removed).
# Previously apostrophes were left in here, so any fuzzy entry containing one
# could never equal a cleaned model name on either side.
for name_col in ('IDC', 'Phone Arena'):
    fuzzy_match[name_col] = (
        fuzzy_match[name_col]
        .str.upper()
        # regex=False: literal replacement regardless of the pandas version's default.
        .str.replace(' ', '', regex=False)
        .str.replace("'", '', regex=False)
    )

# Apply fuzzy match mapping to phonearena model names
# (PhoneArena name -> IDC name; if a PhoneArena name occurs twice the last row wins).
fuzzy_dict = fuzzy_match.set_index('Phone Arena')['IDC'].to_dict()
df_phone_cleaned['Model Name'] = df_phone_cleaned['Model Name'].replace(fuzzy_dict)

In [None]:
# Final merge between IDC data and phonearena feature data.
# The alias / fuzzy renaming steps can map several PhoneArena rows onto the same
# 'Model Name'. In a left merge every duplicated right-hand key multiplies the
# matching IDC rows, so keep one feature row per model (the first) and report
# how many rows were discarded.
duplicate_rows = df_phone_cleaned.duplicated(subset='Model Name', keep='first')
if duplicate_rows.any():
    print(f"⚠️ Dropping {int(duplicate_rows.sum())} duplicate PhoneArena rows before merge")
phone_features = df_phone_cleaned[~duplicate_rows]

# validate='many_to_one' makes pandas raise if the right-hand key is not unique,
# which guarantees the merge cannot add rows to the IDC data.
merged_df = pd.merge(
    df_idc, phone_features, how='left', on='Model Name', validate='many_to_one'
)

# Strip leading/trailing whitespace from every string cell; other values pass through.
# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
# fall back to applymap on older versions where DataFrame.map does not exist.
elementwise = merged_df.map if hasattr(pd.DataFrame, 'map') else merged_df.applymap
merged_df = elementwise(lambda x: x.strip() if isinstance(x, str) else x)

# Save final merged output
output_path = '/content/drive/MyDrive/OUTPUT DATA(FROM IDC)/Phone Arena data IDC(merged).csv'
merged_df.to_csv(output_path, index=False)
print(f"✅ Merged data saved to: {output_path}")