# EDA and Baseline Models

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os

Datasets are put as directories into a directory called small_data.

In [70]:
ag_path = os.path.join('small_data', 'AmazonGoogle')
dblp1_path = os.path.join('small_data', 'DBLP-Scholar')
dblp2_path = os.path.join('small_data', 'DBLP-ACM')

# Amazon Google EDA

In [12]:
os.listdir(ag_path)

['Amazon.csv', 'Amzon_GoogleProducts_perfectMapping.csv', 'GoogleProducts.csv']

In [24]:
amazon_df = pd.read_csv(os.path.join(ag_path, 'Amazon.csv'), encoding='windows-1252')
ag_matching_df = pd.read_csv(os.path.join(ag_path, 'Amzon_GoogleProducts_perfectMapping.csv'), encoding='windows-1252')
google_df = pd.read_csv(os.path.join(ag_path, 'GoogleProducts.csv'), encoding='windows-1252')

In [22]:
amazon_df.head()

Unnamed: 0,id,title,description,manufacturer,price
0,b000jz4hqo,clickart 950 000 - premier image pack (dvd-rom),,broderbund,0.0
1,b0006zf55o,ca international - arcserve lap/desktop oem 30pk,oem arcserve backup v11.1 win 30u for laptops ...,computer associates,0.0
2,b00004tkvy,noah's ark activity center (jewel case ages 3-8),,victory multimedia,0.0
3,b000g80lqo,peachtree by sage premium accounting for nonpr...,peachtree premium accounting for nonprofits 20...,sage software,599.99
4,b0006se5bq,singing coach unlimited,singing coach unlimited - electronic learning ...,carry-a-tune technologies,99.99


In [27]:
amazon_df.shape

(1363, 5)

In [38]:
# missingness
for col in amazon_df.columns:
    print(col, amazon_df[col].isna().sum() / amazon_df.shape[0])

id 0.0
title 0.0
description 0.08437270726338958
manufacturer 0.0
price 0.0


In [23]:
google_df.head()

Unnamed: 0,id,name,description,manufacturer,price
0,http://www.google.com/base/feeds/snippets/1112...,learning quickbooks 2007,learning quickbooks 2007,intuit,38.99
1,http://www.google.com/base/feeds/snippets/1153...,superstart! fun with reading & writing!,fun with reading & writing! is designed to hel...,,8.49
2,http://www.google.com/base/feeds/snippets/1134...,qb pos 6.0 basic software,qb pos 6.0 basic retail mngmt software. for re...,intuit,637.99
3,http://www.google.com/base/feeds/snippets/1204...,math missions: the amazing arcade adventure (g...,save spectacle city by disrupting randall unde...,,12.95
4,http://www.google.com/base/feeds/snippets/1224...,production prem cs3 mac upgrad,adobe cs3 production premium mac upgrade from ...,adobe software,805.99


In [30]:
google_df.shape

(3226, 5)

In [40]:
# missingness
for col in google_df.columns:
    print(col, google_df[col].isna().sum() / google_df.shape[0])

id 0.0
name 0.0
description 0.05920644761314321
manufacturer 0.9280843149411035
price 0.0


In [26]:
ag_matching_df.head()

Unnamed: 0,idAmazon,idGoogleBase
0,b000jz4hqo,http://www.google.com/base/feeds/snippets/1844...
1,b00004tkvy,http://www.google.com/base/feeds/snippets/1844...
2,b000g80lqo,http://www.google.com/base/feeds/snippets/1844...
3,b0006se5bq,http://www.google.com/base/feeds/snippets/1842...
4,b00021xhzw,http://www.google.com/base/feeds/snippets/1843...


In [32]:
ag_matching_df.shape

(1300, 2)

In [57]:
# missingness
for col in ag_matching_df.columns:
    print(col, ag_matching_df[col].isna().sum() / ag_matching_df.shape[0])

idAmazon 0.0
idGoogleBase 0.0


In [56]:
for col in ag_matching_df.columns:
    print(ag_matching_df[col].nunique())

1113
1291


In [332]:
temp_amazon_df = pd.DataFrame(columns = amazon_df.columns)
temp_google_df = pd.DataFrame(columns = google_df.columns)
for row in ag_matching_df.itertuples():
    amazon_id = row[1]
    google_id = row[2]
    temp_amazon_df = temp_amazon_df.append(amazon_df[amazon_df.id == amazon_id])
    temp_google_df = temp_google_df.append(google_df[google_df.id == google_id])

In [336]:
temp_amazon_df = temp_amazon_df.reset_index(drop = True)
new_amazon_cols = ['amazon_' + x for x in temp_amazon_df.columns]
temp_amazon_df = temp_amazon_df.rename(columns = {temp_amazon_df.columns[i]: new_amazon_cols[i] for i in range(len(temp_amazon_df.columns))})

In [338]:
temp_google_df = temp_google_df.reset_index(drop = True)
new_google_cols = ['google_' + x for x in temp_google_df.columns]
temp_google_df = temp_google_df.rename(columns = {temp_google_df.columns[i]: new_google_cols[i] for i in range(len(temp_google_df.columns))})

In [339]:
matched_df = pd.concat([temp_amazon_df, temp_google_df], axis = 1)

# Amazon Google Baseline

In [349]:
matched_df.shape

(1300, 10)

In [342]:
matched_df['amazon_title'] = matched_df['amazon_title'].str.lower()
matched_df['google_name'] = matched_df['google_name'].str.lower()

In [343]:
((matched_df['amazon_title'] == matched_df['google_name']) & (matched_df['amazon_price'] == matched_df['google_price'])).sum()

0

In [344]:
(matched_df['amazon_title'] == matched_df['google_name']).sum()

44

In [345]:
amazon_key_df = amazon_df.copy(deep = True)
google_key_df = google_df.copy(deep = True)
amazon_key_df = amazon_key_df.rename(columns = {amazon_key_df.columns[i]: new_amazon_cols[i] for i in range(len(amazon_key_df.columns))})
google_key_df = google_key_df.rename(columns = {google_key_df.columns[i]: new_google_cols[i] for i in range(len(google_key_df.columns))})
amazon_key_df['key'] = 0
google_key_df['key'] = 0

In [347]:
cross_df = amazon_key_df.merge(google_key_df, on = 'key', how = 'outer')

In [348]:
cross_df = cross_df.drop(columns = ['key'])
cross_df['amazon_title'] = cross_df['amazon_title'].str.lower()
cross_df['google_name'] = cross_df['google_name'].str.lower()

In [350]:
sampled_df = cross_df.sample(1300)

In [351]:
sampled_df.merge(matched_df, how = 'inner')

Unnamed: 0,amazon_id,amazon_title,amazon_description,amazon_manufacturer,amazon_price,google_id,google_name,google_description,google_manufacturer,google_price


In [352]:
sampled_df[((sampled_df['amazon_title'] == sampled_df['google_name']) & (sampled_df['amazon_price'] == sampled_df['google_price']))].shape

(0, 10)

In [353]:
sampled_df[sampled_df['amazon_title'] == sampled_df['google_name']].shape

(0, 10)

# DBLP-ACM EDA

In [265]:
os.listdir(dblp2_path)

['ACM.csv', 'DBLP-ACM_perfectMapping.csv', 'DBLP2.csv']

In [266]:
dblp2_df = pd.read_csv(os.path.join(dblp2_path, 'DBLP2.csv'), encoding='latin-1')
dblp_matching_df = pd.read_csv(os.path.join(dblp2_path, 'DBLP-ACM_perfectMapping.csv'), encoding="utf8")
acm_df = pd.read_csv(os.path.join(dblp2_path, 'ACM.csv'),encoding="utf8")

In [267]:
dblp2_df.head()

Unnamed: 0,id,title,authors,venue,year
0,journals/sigmod/Mackay99,Semantic Integration of Environmental Models f...,D. Scott Mackay,SIGMOD Record,1999
1,conf/vldb/PoosalaI96,Estimation of Query-Result Distribution and it...,"Viswanath Poosala, Yannis E. Ioannidis",VLDB,1996
2,conf/vldb/PalpanasSCP02,Incremental Maintenance for Non-Distributive A...,"Themistoklis Palpanas, Richard Sidle, Hamid Pi...",VLDB,2002
3,conf/vldb/GardarinGT96,Cost-based Selection of Path Expression Proces...,"Zhao-Hui Tang, Georges Gardarin, Jean-Robert G...",VLDB,1996
4,conf/vldb/HoelS95,Benchmarking Spatial Join Operations with Spat...,"Erik G. Hoel, Hanan Samet",VLDB,1995


In [268]:
dblp2_df.shape

(2616, 5)

In [269]:
# missingness
for col in dblp2_df.columns:
    print(col, dblp2_df[col].isna().sum() / dblp2_df.shape[0])

id 0.0
title 0.0
authors 0.0
venue 0.0
year 0.0


In [270]:
acm_df.head()

Unnamed: 0,id,title,authors,venue,year
0,304586,The WASA2 object-oriented workflow management ...,"Gottfried Vossen, Mathias Weske",International Conference on Management of Data,1999
1,304587,A user-centered interface for querying distrib...,"Isabel F. Cruz, Kimberly M. James",International Conference on Management of Data,1999
2,304589,"World Wide Database-integrating the Web, CORBA...","Athman Bouguettaya, Boualem Benatallah, Lily H...",International Conference on Management of Data,1999
3,304590,XML-based information mediation with MIX,"Chaitan Baru, Amarnath Gupta, Bertram Lud&#228...",International Conference on Management of Data,1999
4,304582,The CCUBE constraint object-oriented database ...,"Alexander Brodsky, Victor E. Segal, Jia Chen, ...",International Conference on Management of Data,1999


In [271]:
acm_df.shape

(2294, 5)

In [272]:
# missingness
for col in acm_df.columns:
    print(col, acm_df[col].isna().sum() / acm_df.shape[0])

id 0.0
title 0.0
authors 0.006102877070619006
venue 0.0
year 0.0


In [273]:
dblp_matching_df.head()

Unnamed: 0,idDBLP,idACM
0,conf/sigmod/SlivinskasJS01,375678
1,conf/sigmod/ChaudhuriDN01,375694
2,conf/sigmod/RinfretOO01,375669
3,conf/sigmod/BreunigKKS01,375672
4,conf/sigmod/JagadishJOT01,375687


In [274]:
dblp_matching_df.shape

(2224, 2)

In [275]:
for col in dblp_matching_df.columns:
    print(dblp_matching_df[col].nunique())

2224
2224


In [276]:
temp_dblp_df = pd.DataFrame(columns = dblp2_df.columns)
temp_acm_df = pd.DataFrame(columns = acm_df.columns)
for row in dblp_matching_df.itertuples():
    dblp_id = row[1]
    acm_id = row[2]
    temp_dblp_df = temp_dblp_df.append(dblp2_df[dblp2_df.id == dblp_id])
    temp_acm_df = temp_acm_df.append(acm_df[acm_df.id == acm_id])

In [277]:
temp_dblp_df = temp_dblp_df.reset_index(drop = True)
new_dblp_cols = ['dblp_' + x for x in temp_dblp_df.columns]
temp_dblp_df = temp_dblp_df.rename(columns = {temp_dblp_df.columns[i]: new_dblp_cols[i] for i in range(len(temp_dblp_df.columns))})

In [278]:
temp_acm_df = temp_acm_df.reset_index(drop = True)
new_acm_cols = ['acm_' + x for x in temp_acm_df.columns]
temp_acm_df = temp_acm_df.rename(columns = {temp_acm_df.columns[i]: new_acm_cols[i] for i in range(len(temp_acm_df.columns))})

In [279]:
matched_df = pd.concat([temp_dblp_df, temp_acm_df], axis = 1)

# DBLP-ACM Baseline

In [280]:
matched_df['dblp_title'] = matched_df['dblp_title'].str.lower()
matched_df['acm_title'] = matched_df['dblp_title'].str.lower()

In [281]:
((matched_df['dblp_title'] == matched_df['acm_title']) & (matched_df['dblp_year'] == matched_df['acm_year'])).sum()

2224

In [282]:
dblp_key_df = dblp2_df.copy(deep = True)
acm_key_df = acm_df.copy(deep = True)
dblp_key_df = dblp_key_df.rename(columns = {dblp_key_df.columns[i]: new_dblp_cols[i] for i in range(len(dblp_key_df.columns))})
acm_key_df = acm_key_df.rename(columns = {acm_key_df.columns[i]: new_acm_cols[i] for i in range(len(acm_key_df.columns))})
dblp_key_df['key'] = 0
acm_key_df['key'] = 0

In [283]:
cross_df = dblp_key_df.merge(acm_key_df, on = 'key', how = 'outer')

In [284]:
cross_df = cross_df.drop(columns = ['key'])
cross_df['dblp_title'] = cross_df['dblp_title'].str.lower()
cross_df['acm_title'] = cross_df['acm_title'].str.lower()

In [289]:
sampled_df = cross_df.sample(2224)

In [290]:
sampled_df.merge(matched_df, how = 'inner')

Unnamed: 0,dblp_id,dblp_title,dblp_authors,dblp_venue,dblp_year,acm_id,acm_title,acm_authors,acm_venue,acm_year


In [291]:
sampled_df[((sampled_df['dblp_title'] == sampled_df['acm_title']) & (sampled_df['dblp_year'] == sampled_df['acm_year']))].shape

(0, 10)

# DBLP-Scholar EDA

In [293]:
os.listdir(dblp1_path)

['DBLP-Scholar_perfectMapping.csv', 'DBLP1.csv', 'Scholar.csv']

In [294]:
dblp1_df = pd.read_csv(os.path.join(dblp1_path, 'DBLP1.csv'), encoding='latin-1')
dblp_matching_df = pd.read_csv(os.path.join(dblp1_path, 'DBLP-Scholar_perfectMapping.csv'), encoding="utf8")
scholar_df = pd.read_csv(os.path.join(dblp1_path, 'Scholar.csv'),encoding="utf8")

In [295]:
dblp1_df.head()

Unnamed: 0,id,title,authors,venue,year
0,conf/vldb/RusinkiewiczKTWM95,Towards a Cooperative Transaction Model - The ...,"M Rusinkiewicz, W Klas, T Tesch, J Wäsch, P Muth",VLDB,1995
1,journals/sigmod/EisenbergM02,SQL/XML is Making Good Progress,"A Eisenberg, J Melton",SIGMOD Record,2002
2,conf/vldb/AmmannJR95,Using Formal Methods to Reason about Semantics...,"P Ammann, S Jajodia, I Ray",VLDB,1995
3,journals/sigmod/Liu02,Editor's Notes,L Liu,SIGMOD Record,2002
4,journals/sigmod/Hammer02,Report on the ACM Fourth International Worksho...,,,2002


In [296]:
dblp1_df.shape

(2616, 5)

In [297]:
# missingness
for col in dblp1_df.columns:
    print(col, dblp1_df[col].isna().sum() / dblp1_df.shape[0])

id 0.0
title 0.0
authors 0.08333333333333333
venue 0.08065749235474007
year 0.0


In [298]:
scholar_df.head()

Unnamed: 0,id,title,authors,venue,year
0,aKcZKwvwbQwJ,11578 Sorrento Valley Road,QD Inc,"San Diego,",
1,ixKfiTHoaDoJ,Initiation of crazes in polystyrene,"AS Argon, JG Hannoosh","Phil. Mag,",
2,3BxllB4wwcIJ,Immunogold labelling is a quantitative method ...,"GH Hansen, LL Wetterberg, H SjÃ¶strÃ¶m, O NorÃ©n","The Histochemical Journal,",1992.0
3,d2WWxwKMex4J,The Burden of Infectious Disease Among Inmates...,"TM Hammett, P Harmon, W Rhodes",see,
4,cZCX-AQpjccJ,The Role of Faculty Advising in Science and En...,JR Cogdell,"NEW DIRECTIONS FOR TEACHING AND LEARNING,",1995.0


In [299]:
scholar_df.shape

(64263, 5)

In [300]:
# missingness
for col in scholar_df.columns:
    print(col, scholar_df[col].isna().sum() / scholar_df.shape[0])

id 0.0
title 0.0
authors 1.5561053794562967e-05
venue 0.23336912375706081
year 0.5413690615128457


In [301]:
dblp_matching_df.head()

Unnamed: 0,idDBLP,idScholar
0,conf/sigmod/AbadiC02,f2Lea-RN8dsJ
1,conf/sigmod/AbadiCCCCEGHMRSSTXYZ03,eBnT7lhV2LwJ
2,conf/sigmod/AbadiCCCCEGHMRSSTXYZ03,gBVNSFeS4P8J
3,conf/sigmod/AbadiCCCCEGHMRSSTXYZ03,VuY9Y49GqXgJ
4,conf/sigmod/AbiteboulBCMM03,AxpQwgyRyLgJ


In [302]:
dblp_matching_df.shape

(5347, 2)

In [303]:
for col in dblp_matching_df.columns:
    print(dblp_matching_df[col].nunique())

2408
5218


In [304]:
temp_dblp_df = pd.DataFrame(columns = dblp1_df.columns)
temp_scholar_df = pd.DataFrame(columns = scholar_df.columns)
for row in dblp_matching_df.itertuples():
    dblp_id = row[1]
    scholar_id = row[2]
    temp_dblp_df = temp_dblp_df.append(dblp2_df[dblp2_df.id == dblp_id])
    temp_scholar_df = temp_scholar_df.append(scholar_df[scholar_df.id == scholar_id])

In [305]:
temp_dblp_df = temp_dblp_df.reset_index(drop = True)
new_dblp_cols = ['dblp_' + x for x in temp_dblp_df.columns]
temp_dblp_df = temp_dblp_df.rename(columns = {temp_dblp_df.columns[i]: new_dblp_cols[i] for i in range(len(temp_dblp_df.columns))})

In [306]:
temp_scholar_df = temp_scholar_df.reset_index(drop = True)
new_scholar_cols = ['scholar_' + x for x in temp_scholar_df.columns]
temp_scholar_df = temp_scholar_df.rename(columns = {temp_scholar_df.columns[i]: new_scholar_cols[i] for i in range(len(temp_scholar_df.columns))})

In [307]:
matched_df = pd.concat([temp_dblp_df, temp_scholar_df], axis = 1)

# DBLP-Scholar Baseline

In [308]:
matched_df.shape

(5347, 10)

In [309]:
# high missingness in scholar_year column
matched_df.scholar_year.isna().sum()/matched_df.shape[0]

0.5172994202356461

In [310]:
matched_df['dblp_title'] = matched_df['dblp_title'].str.lower()
matched_df['scholar_title'] = matched_df['scholar_title'].str.lower()

In [311]:
((matched_df['dblp_title'] == matched_df['scholar_title']) & (matched_df['dblp_year'] == matched_df['scholar_year'])).sum()

1818

In [312]:
(matched_df['dblp_title'] == matched_df['scholar_title']).sum()

2552

In [331]:
2552/5347

0.47727697774452965

In [316]:
dblp_key_df = dblp2_df.copy(deep = True)
scholar_key_df = scholar_df.copy(deep = True)
dblp_key_df = dblp_key_df.rename(columns = {dblp_key_df.columns[i]: new_dblp_cols[i] for i in range(len(dblp_key_df.columns))})
scholar_key_df = scholar_key_df.rename(columns = {scholar_key_df.columns[i]: new_scholar_cols[i] for i in range(len(scholar_key_df.columns))})

In [317]:
dblp_key_df.shape

(2616, 5)

In [318]:
scholar_key_df.shape

(64263, 5)

In [322]:
sampled_df = pd.concat([dblp_key_df.sample(n = 5347, replace = True).reset_index(),
                        scholar_key_df.sample(n = 5347, replace = True).reset_index()])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [327]:
sampled_df['dblp_title'] = sampled_df['dblp_title'].str.lower()
sampled_df['scholar_title'] = sampled_df['scholar_title'].str.lower()

In [328]:
sampled_df.merge(matched_df, how = 'inner')

Unnamed: 0,dblp_authors,dblp_id,dblp_title,dblp_venue,dblp_year,index,scholar_authors,scholar_id,scholar_title,scholar_venue,scholar_year


In [329]:
sampled_df[((sampled_df['dblp_title'] == sampled_df['scholar_title']) & (sampled_df['dblp_year'] == sampled_df['scholar_year']))].shape

(0, 11)

In [330]:
sampled_df[sampled_df['dblp_title'] == sampled_df['scholar_title']].shape

(0, 11)