# Load CSVs

In [1]:
import pandas as pd
from pandas import DataFrame as DF
import json

In [5]:
# Read the four partition files (dblp-1 .. dblp-4) into a list of frames.
dfs = [pd.read_csv(f"dblp/dblp-{i}.csv") for i in range(1, 5)]

# Stack the partitions and expose the unnamed leading CSV column as "id".
df = pd.concat(dfs).rename(columns={"Unnamed: 0": "id"})
train = pd.read_csv("dblp/train.csv")

# Join each training pair with the record behind key1 and then the record
# behind key2; overlapping record columns receive the _x / _y suffixes.
merged = (
    train
    .merge(df, left_on="key1", right_on="pkey")
    .merge(df, left_on="key2", right_on="pkey")
)

# Baseline prediction: a pair is flagged when both sides carry the same id.
merged["prediction"] = merged["id_x"] == merged["id_y"]


# NOTE(review): the boolean-mask indexing below assumes `label` is a boolean
# column in train.csv -- confirm.

# accuracy: share of pairs whose prediction equals the label
print(f'accuracy: {(merged.prediction == merged.label).mean()}')
# recall: share of positively labelled pairs that were predicted positive
print(f"recall: {merged[merged.label].prediction.mean()}")
# precision: share of predicted-positive pairs whose label is positive
# (this line was previously printed under the label "recall")
print(f"precision: {merged[merged.prediction].label.mean()}")
len(df.index)


accuracy: 0.558705469141997
recall: 0.8
recall: 0.5362786745964316


Unnamed: 0,id,pauthor,peditor,ptitle,pyear,paddress,ppublisher,pseries,pid,pkey,ptype_id,pjournal_id,pbooktitle_id,pjournalfull_id,pbooktitlefull_id,partition
0,4,Jorge Semião|Juan J. Rodríguez-Andina|Fabian V...,,Improving the Tolerance of Pipeline Based Circ...,-2007,,,,180843,conf/dft/SemiaoRVSTT07,1,0,4,0,4,1
1,7,Patrice Caire,,A Normative Multi-Agent Systems Approach to th...,-2007,,,,162991,conf/dagstuhl/Caire07,2,0,7,0,7,1
2,10,Sundeep B|Andrew Thangaraj,,Self-Orthogonality of q-Ary Images of qm-Ary C...,2007,,,,2261406,journals/tit/BT07,0,2,9,2,9,1
3,18,Gerardo Pardo-Castellote,,OMG Data-Distribution Service: Architectural O...,-2003,,,,349720,conf/icdcsw/Pardo-Castellote03,1,0,11,0,11,1
4,19,Ki-Hoon Lee|Kyu-Young Whang|Wook-Shin Han|Min-...,,Structural Consistency: Enabling XML Keyword S...,2009,,,,1922328,journals/corr/abs-0911-4329,3,5,5,5,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4251,9975,Online bin packing with arbitrary release times.,,Yongqiang Shi|Deshi Ye,2008,,,,2245299,journals/tcs/ShiY08,0,12,5,12,5,4
4252,9978,Chew Lim Tan|Henry Wai Kit Chia,,Neural Logic Network Learning using Genetic Pr...,2001,,,,460233,conf/ijcai/TanC01,2,0,71,0,71,4
4253,9981,Sue Newell|Jacky Swan|Joseph Weiss,,Project Management: Minitrack Introduction.,2004,,,,289610,conf/hicss/NewellSW04,1,0,295,0,295,4
4254,9988,Martijn Hendriks|Barend van den Nieuwelaar|Fri...,,Model checker aided design of a controller for...,-2006,,,,2224414,journals/sttt/HendriksNV06,3,374,5,374,5,4


# Load JSON lookup files

In [4]:
def load_json_to_frame(filename):
    """Load a JSON file into a DataFrame indexed by its "id" field.

    Args:
        filename: Path of a JSON document that ``DataFrame`` can consume and
            that provides an "id" field.

    Returns:
        A DataFrame built from the parsed JSON, with "id" as its index.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the parsed data has no "id" column.
    """
    # Decode explicitly as UTF-8 so non-ASCII values load the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(filename, encoding="utf-8") as f:
        return DF(json.load(f)).set_index("id")

# One id -> name lookup frame per encoded column of the publication table.
title, title_full, journal, journal_full, source_type = map(
    load_json_to_frame,
    (
        "dblp/pbooktitle.json",
        "dblp/pbooktitlefull.json",
        "dblp/pjournal.json",
        "dblp/pjournalfull.json",
        "dblp/ptype.json",
    ),
)

def replace_id_with_value(df, id_to_value, id_col_name, value_col_name):
    """Swap an id column of ``df`` for the value it maps to in ``id_to_value``.

    The lookup is left-joined on its "id" key, its "name" column is renamed
    to ``value_col_name``, and ``id_col_name`` is removed from the result.
    When ``id_col_name`` is not present, ``df`` is handed back unchanged so
    the call can be repeated safely.
    """
    if id_col_name in df:
        return (
            df.merge(id_to_value, how='left', left_on=id_col_name, right_on="id")
            .rename(columns={"name": value_col_name})
            .drop(columns=[id_col_name])
        )
    # Column already replaced on an earlier run: nothing to do.
    return df

def drop_cols_with_all_nans(df):
    """Return ``df`` without the columns whose every value is missing.

    Each removed column is announced on stdout. If no column qualifies, the
    frame that was passed in is returned as is.
    """
    empty_cols = [col for col in df if df[col].isna().all()]
    for col in empty_cols:
        print(f"dropping column {col} since it contains only nans.")
    if not empty_cols:
        return df
    return df.drop(columns=empty_cols)

# Resolve every encoded *_id column to its readable value, then discard the
# columns that carry no data at all.
df = (
    df
    .pipe(replace_id_with_value, title, "pbooktitle_id", "book_title")
    .pipe(replace_id_with_value, title_full, "pbooktitlefull_id", "book_title_full")
    .pipe(replace_id_with_value, journal, "pjournal_id", "journal")
    .pipe(replace_id_with_value, journal_full, "pjournalfull_id", "journal_full")
    .pipe(replace_id_with_value, source_type, "ptype_id", "source_type")
    .pipe(drop_cols_with_all_nans)
)
df

dropping column peditor since it contains only nans.
dropping column paddress since it contains only nans.


Unnamed: 0,id,pauthor,ptitle,pyear,ppublisher,pseries,pid,pkey,partition,book_title,book_title_full,journal,journal_full,source_type
0,4,Jorge Semião|Juan J. Rodríguez-Andina|Fabian V...,Improving the Tolerance of Pipeline Based Circ...,-2007,,,180843,conf/dft/SemiaoRVSTT07,1,DFT,nán,,,inproceedings
1,7,Patrice Caire,A Normative Multi-Agent Systems Approach to th...,-2007,,,162991,conf/dagstuhl/Caire07,1,Normative Multi-agent Systems,Éúrớpéán Grid Cớnféréncé,,,inprớcéédings
2,10,Sundeep B|Andrew Thangaraj,Self-Orthogonality of q-Ary Images of qm-Ary C...,2007,,,2261406,journals/tit/BT07,1,WiMớb,ACM Symposium on Parallel Algorithms and Archi...,IEEE Transactions on Information Theory,International Journal of Ambient Computing and...,árticlé
3,18,Gerardo Pardo-Castellote,OMG Data-Distribution Service: Architectural O...,-2003,,,349720,conf/icdcsw/Pardo-Castellote03,1,ICDCS Workshops,International Agent Technology Conference,,,inproceedings
4,19,Ki-Hoon Lee|Kyu-Young Whang|Wook-Shin Han|Min-...,Structural Consistency: Enabling XML Keyword S...,2009,,,1922328,journals/corr/abs-0911-4329,1,,Messung,CoRR,International Journal of Wireless Information ...,article
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17160,9975,Online bin packing with arbitrary release times.,Yongqiang Shi|Deshi Ye,2008,,,2245299,journals/tcs/ShiY08,4,,Messung,Theor. Comput. Sci.,ACM Transactions on Computation Theory (TOCT),árticlé
17161,9978,Chew Lim Tan|Henry Wai Kit Chia,Neural Logic Network Learning using Genetic Pr...,2001,,,460233,conf/ijcai/TanC01,4,IJCAI,inroads (ACM SIGCSE Bulletin)|ACM SIGCSE Bulletin,,,inprớcéédings
17162,9981,Sue Newell|Jacky Swan|Joseph Weiss,Project Management: Minitrack Introduction.,2004,,,289610,conf/hicss/NewellSW04,4,HICSS,Grid and Cooperative Computing,,,inproceedings
17163,9988,Martijn Hendriks|Barend van den Nieuwelaar|Fri...,Model checker aided design of a controller for...,-2006,,,2224414,journals/sttt/HendriksNV06,4,,Messung,STTT,,article


In [162]:
# view = train.merge(df, left_on="key1", right_on="pkey").merge(df, left_on="key2", right_on="pkey")
# view = view.drop(columns=["Unnamed: 0","key1",	"key2"])
# view.sort_index(axis=1).to_csv("view.csv")


In [163]:

def swap_on_condition(df, cond, col1, col2):
    """Return a copy of ``df`` with ``col1`` and ``col2`` exchanged on selected rows.

    Args:
        df: Input frame; it is copied, never modified.
        cond: Callable that receives the copied frame and returns a boolean
            row mask usable with ``.loc``.
        col1: Name of the first column taking part in the swap.
        col2: Name of the second column taking part in the swap.

    Returns:
        A new DataFrame in which, for every row selected by ``cond``, the
        values of ``col1`` and ``col2`` have traded places.
    """
    df = df.copy()
    # Evaluate the predicate once so the rows read and the rows written are
    # guaranteed to be the same selection (and the work is not repeated).
    mask = cond(df)
    # `.values` strips the column labels, making the assignment positional:
    # the col2 values land in col1 and the col1 values land in col2.
    df.loc[mask, [col1, col2]] = df.loc[mask, [col2, col1]].values
    return df




def title_contains_pipe_cond(df):
    """Boolean mask of rows whose ``ptitle`` contains a literal "|".

    Missing titles count as not containing a pipe, so the mask is always
    strictly boolean and can be negated or combined safely.
    """
    # Literal match: avoids the invalid "\|" escape of the regex form.
    return df["ptitle"].str.contains("|", regex=False, na=False)


def author_contains_pipe_cond(df):
    """Boolean mask of rows whose ``pauthor`` contains a literal "|".

    Missing authors count as not containing a pipe.
    """
    return df["pauthor"].str.contains("|", regex=False, na=False)


def author_longer_than_title_cond(df):
    """Boolean mask of rows where ``pauthor`` has more characters than ``ptitle``.

    Rows with a missing value on either side compare as False.
    """
    return df["pauthor"].str.len() > df["ptitle"].str.len()


def condition(df):
    """Boolean mask of rows whose ``pauthor`` and ``ptitle`` look swapped.

    A row is selected when its title contains a pipe, or when neither field
    contains a pipe and the author text is longer than the title text.
    NOTE(review): this presumes "|" separates author names, as the displayed
    rows suggest -- confirm against the dataset description.
    """
    title_has_pipe = title_contains_pipe_cond(df)
    # `A | (~A & B)` reduces to `A | B`, so the title test is not repeated
    # inside the second clause.
    author_looks_like_title = (
        ~author_contains_pipe_cond(df) & author_longer_than_title_cond(df)
    )
    return title_has_pipe | author_looks_like_title

# Put author and title back in their own columns on the rows flagged by
# `condition`, then display the result.
df = df.pipe(swap_on_condition, condition, "pauthor", "ptitle")
df

Unnamed: 0,id,pauthor,ptitle,pyear,ppublisher,pseries,pid,pkey,partition,book_title,book_title_full,journal,journal_full,source_type
0,4,Jorge Semião|Juan J. Rodríguez-Andina|Fabian V...,Improving the Tolerance of Pipeline Based Circ...,-2007,,,180843,conf/dft/SemiaoRVSTT07,1,DFT,nán,,,inproceedings
1,854,Wei Zhang 0002,Computing Cache Vulnerability to Transient Err...,-2005,,,180734,conf/dft/Zhang05,1,DFT,nán,,,inproceedings
2,8474,Fengming Zhang|Young-Jun Lee|T. Kane|Luca Schi...,A Digital and Wide Power Bandwidth H-Field Gen...,2003,,,180743,conf/dft/ZhangLKSMKMLMP03,1,DFT,nán,,,inproceedings
3,4914,Bing Qiu|Yvon Savaria|Meng Lu|Chunyan Wang|Cla...,Yield Modeling of a WSI Telecom Router Archite...,-2002,,,180593,conf/dft/QiuSLWT02,4,DFT,nán,,,inproceedings
4,5135,Hossein Asadi|Mehdi Baradaran Tahoori,Soft Error Modeling and Protection for Sequent...,-2005,,,180280,conf/dft/AsadiT05,4,DFT,nán,,,inproceedings
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10511,9051,Ian H. Witten|Alistair Moffat|Timothy C. Bell,Managing Gigabytes: Compressing and Indexing D...,1999,Morgan Kaufmann,,2303,books/mk/WittenMB99,3,,Messung,,,book
10512,9951,Meike Klettke,Akquisition von Integritätsbedingungen in Date...,-1998,"Infix Verlag, St. Augustin, Germany",DISDBIS,1772,books/infix/Klettke98,4,,Messung,,,book
10513,9581,Willem P. de Roever|Frank S. de Boer|Ulrich Ha...,Concurrency Verification: Introduction to Comp...,2001,Cambridge University Press,Cambridge Tracts in Theoretical Computer Science,529,books/cu/RoeverBH2001,1,nán,,,,book
10514,376,Constantinos Kotopoulos|Nikolay B. Likhanov|Ra...,Asymptotic Analysis of the GPS System Fed by H...,2001,,,472490,conf/infocom/KotopoulosLM01,2,INFOCOM,Languages,,,bớớk
