In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re

In [None]:
base_path = '/Users/udaisingh/Downloads/kdd-cup-2013-author-disambiguation/dataRev2/'

In [None]:
paths = [os.path.join(base_path, x) for x in os.listdir(base_path)]

In [None]:
data = [pd.read_csv(x, nrows = 5) for x in paths]

In [None]:
for i in paths:
    print(i)

In [None]:
[display(x) for x in data]

In [None]:
class graphEmbedder:
    """Turn tabular entity records into a typed edge list (heterogeneous graph).

    Every ``embedd*`` / ``defineTruth`` call appends one ``(name, mapping)``
    pair to ``self.Graphs``, where ``mapping`` goes from a record id to either
    a single target value or a list of target values. ``saveGraph`` flattens
    all mappings into ``source, target, type`` rows and writes them as CSV,
    followed by one row per registered entity node.
    """

    def __init__(self):
        # String ids of every record registered through defineKeys.
        self.entityNodes = set()
        # Ordered (relation name, {source: target or [targets]}) pairs.
        self.Graphs = []

    @staticmethod
    def _stack(dfs, idCol, column):
        """Stack the ``[idCol, column]`` slice of every frame in ``dfs``."""
        # ignore_index avoids duplicate index labels when frames are combined.
        return pd.concat([frame[[idCol, column]] for frame in dfs], ignore_index=True)

    def defineKeys(self,df,idCol):
        """Register the values of ``df[idCol]`` (as strings) as entity nodes."""
        self.entityNodes.update({str(x) for x in df[idCol]})

    def embeddOrdinal(self,dfs,idCol,column,name,bins=10, equalBinSize = True):
        """Link each id to the bin its numeric ``column`` value falls into.

        Parameters
        ----------
        dfs : list of DataFrame
            Frames that all contain ``idCol`` and ``column``.
        idCol, column : str
            Id column and numeric column to discretise.
        name : str
            Relation name stored with the mapping.
        bins : int
            Number of bins requested.
        equalBinSize : bool
            True  -> ``pd.qcut`` (quantile bins, similar row count per bin);
            False -> ``pd.cut`` (equal-width bins).

        Returns
        -------
        bool
            Always True.
        """
        df = self._stack(dfs, idCol, column)
        if equalBinSize:
            # duplicates='drop': heavily tied values can produce repeated
            # quantile edges, which would otherwise raise ValueError.
            binned = pd.qcut(df[column], q = bins, duplicates = 'drop')
        else:
            binned = pd.cut(df[column], bins = bins)

        # Rows with a missing id or a missing (un-binnable) value get no edge.
        keep = df[idCol].notna() & binned.notna()
        graph = dict(zip(df.loc[keep, idCol], binned[keep]))
        self.Graphs.append((name,graph))
        return True

    def embeddText(self,dfs,idCol, column,name, min_df = 0.01, method = 'BagOfWords'):
        """Link each id to the vocabulary tokens that occur in its text.

        Parameters
        ----------
        dfs : list of DataFrame
            Frames that all contain ``idCol`` and ``column``.
        idCol, column : str
            Id column and text column.
        name : str
            Relation name stored with the mapping.
        min_df : float or int
            Forwarded to the vectorizer as its ``min_df`` argument.
        method : str
            ``'BagOfWords'`` (CountVectorizer) or ``'TfIdf'`` (TfidfVectorizer).

        Returns
        -------
        bool
            Always True.

        Raises
        ------
        ValueError
            If ``method`` is not one of the supported names.
        """
        if method == 'BagOfWords':
            vectorizerClass = CountVectorizer
        elif method == 'TfIdf':
            vectorizerClass = TfidfVectorizer
        else:
            raise ValueError(
                "Unsupported method %r; expected 'BagOfWords' or 'TfIdf'" % (method,)
            )

        # A cut-off of 0 and a cut-off of one document select the same
        # vocabulary (every kept token occurs in at least one document).
        # NOTE(review): newer scikit-learn releases appear to reject an
        # integer 0 for min_df -- confirm against the installed version.
        if min_df == 0:
            min_df = 1
        vectorizer = vectorizerClass(min_df=min_df, ngram_range = (1,1))

        texts = self._stack(dfs, idCol, column).dropna()
        # Work on the sparse matrix directly; a dense copy is rows x vocabulary.
        matrix = vectorizer.fit_transform(texts[column].astype(str)).tocsr()
        matrix.eliminate_zeros()
        matrix.sort_indices()  # tokens per row come out in vocabulary order

        # get_feature_names was replaced by get_feature_names_out in newer
        # scikit-learn; fall back for older installations.
        if hasattr(vectorizer, 'get_feature_names_out'):
            tokens = vectorizer.get_feature_names_out()
        else:
            tokens = vectorizer.get_feature_names()

        graph = {}
        for row, key in enumerate(texts[idCol]):
            columns = matrix.indices[matrix.indptr[row]:matrix.indptr[row + 1]]
            graph[key] = [tokens[j] for j in columns]

        self.Graphs.append((name,graph))
        return True

    def embeddCategorical(self,dfs,idCol,column,name):
        """Link each id to the raw value of ``column``.

        Rows with a missing id or value are skipped, matching how
        ``embeddOrdinal`` and ``embeddText`` treat missing data.

        Returns
        -------
        bool
            Always True.
        """
        pairs = self._stack(dfs, idCol, column).dropna()
        graph = dict(zip(pairs[idCol], pairs[column]))
        self.Graphs.append((name,graph))
        return True

    def defineTruth(self, df):
        """Store the two-column match table ``df`` under ``'ground_truth'``.

        Each first-column value maps to the list of all second-column values
        it is paired with, so an id with several matches keeps all of them.

        Returns
        -------
        bool
            Always True.

        Raises
        ------
        ValueError
            If ``df`` does not have exactly two columns.
        """
        if df.shape[1] != 2:
            raise ValueError(
                "defineTruth expects exactly two columns, got %d" % df.shape[1]
            )
        graph = {}
        for source, target in df.itertuples(index=False, name=None):
            graph.setdefault(source, []).append(target)
        self.Graphs.append(('ground_truth',graph))
        return True

    def saveGraph(self, method = 'csv', fname = os.path.join(os.getcwd(),'heteroGraph.csv')):
        """Write every stored edge, then every entity node, to a CSV file.

        Parameters
        ----------
        method : str
            Currently unused; the output is always CSV.
        fname : str
            Output path; a relative path is resolved against the current
            working directory.

        The file has the columns ``source, target, type``. Entity nodes are
        written as rows with empty ``target`` and ``type``.
        """
        rows = []
        for name, links in self.Graphs:
            rows.extend((source, target, name) for source, target in self._graphFixer(links))
        # Sorted so the node rows come out in the same order on every run.
        rows.extend((node, None, None) for node in sorted(str(x) for x in self.entityNodes))

        # dtype=object keeps values as given (e.g. integers are not turned
        # into floats because the column also holds None).
        df = pd.DataFrame(rows, columns = ['source', 'target', 'type'], dtype = object)
        df.to_csv(os.path.join(os.getcwd(),fname), index = False)

    def _graphFixer(self, dictionary):
        """Flatten ``{key: value or [values]}`` into a list of ``(key, value)`` pairs."""
        edges = []
        for key, value in dictionary.items():
            if isinstance(value, list):
                edges.extend((key, v) for v in value)
            else:
                edges.append((key, value))
        return edges

## Abt-Buy Dataset

In [None]:
def fixPrice(x):
    """Extract a decimal price from a raw cell value.

    Parameters
    ----------
    x : object
        A cell from a price column: a number, a string such as
        ``"$1,299.00"``, or a missing value.

    Returns
    -------
    number, str or float('nan')
        Numeric input is returned unchanged. For a string, the first
        ``digits.digits`` substring is returned as text (callers convert it
        with ``pd.to_numeric``). Anything else, or a string without a
        decimal number, yields NaN.
    """
    # bool is a subclass of int but is never a price.
    if isinstance(x, bool):
        return np.nan
    # Numbers (including NaN used for missing cells) are already usable.
    if isinstance(x, (int, float, np.number)):
        return x
    if not isinstance(x, str):
        return np.nan

    # Drop thousands separators so "1,299.00" is read as 1299.00, not 299.00.
    cleaned = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", x)
    match = re.search(r"\d+\.\d+", cleaned)
    return match.group(0) if match else np.nan

In [None]:
# Relative locations of the two Abt-Buy product tables and the match table.
pt1 = "../data/Abt-Buy/Abt.csv"
pt2 = "../data/Abt-Buy/Buy.csv"
pt3 = "../data/Abt-Buy/abt_buy_perfectMapping.csv"

# NOTE(review): the product tables are read with the slower python parser,
# presumably to cope with quoting/encoding quirks in these files -- confirm.
Abt = pd.read_csv(pt1, engine = 'python')
Buy = pd.read_csv(pt2, engine = 'python')
truth = pd.read_csv(pt3)

# Normalise the price columns: fixPrice pulls the number out of each cell,
# pd.to_numeric then turns the column into a numeric dtype.
Abt['price'] = pd.to_numeric(Abt.price.apply(fixPrice))
Buy['price'] = pd.to_numeric(Buy.price.apply(fixPrice))

In [None]:
display(Abt.head())
display(Buy.head())

In [None]:
# Fresh embedder for this dataset.
gE = graphEmbedder()

In [None]:
# Exploratory only: integer codes of five equal-width price bins.
# The result is displayed, not stored.
pd.cut(Abt.price, bins = 5, labels = False)

In [None]:
# Register the ids of both tables as entity nodes.
gE.defineKeys(Abt, 'id')
gE.defineKeys(Buy, 'id')

In [None]:
# Known matches between the two tables.
gE.defineTruth(truth)

In [None]:
# min_df=0: no document-frequency cut-off is requested for product names.
gE.embeddText([Abt,Buy], 'id','name','name', min_df=0)

In [None]:
# min_df=0.05 is passed through to the vectorizer's min_df argument.
gE.embeddText([Abt,Buy], 'id','description','description', min_df = 0.05)

In [None]:
# equalBinSize is left at its default (True), so pd.qcut is used with q=5.
gE.embeddOrdinal([Abt,Buy],'id','price','price', bins = 5)

In [None]:
# NOTE(review): only Buy is passed here; presumably Abt has no
# manufacturer column -- confirm.
gE.embeddCategorical([Buy], 'id', 'manufacturer', 'manufacturer')

In [None]:
# Writes to the default file name (heteroGraph.csv in the working directory).
# NOTE(review): the other dataset sections in this notebook call saveGraph()
# with the same default, so each run overwrites the previous output.
gE.saveGraph()

## DBLP-ACM Dataset

In [None]:
# Relative locations of the two DBLP-ACM publication tables and the match table.
pt1 = "../data/DBLP-ACM/DBLP2.csv"
pt2 = "../data/DBLP-ACM/ACM.csv"
pt3 = "../data/DBLP-ACM/DBLP-ACM_perfectMapping.csv"

In [None]:
# NOTE(review): only DBLP2.csv is read with the python parser, presumably
# because of parsing/encoding quirks in that file -- confirm.
dblp2 = pd.read_csv(pt1, engine = 'python')
acm = pd.read_csv(pt2)
matchings = pd.read_csv(pt3)

In [None]:
display(dblp2.head())
display(acm.head())

In [None]:
# Fresh embedder; this rebinds gE, so any earlier embedder is no longer reachable by that name.
gE = graphEmbedder()

In [None]:
# Register ids from both tables as entity nodes and store the known matches.
gE.defineKeys(dblp2, 'id')
gE.defineKeys(acm, 'id')
gE.defineTruth(matchings)

In [None]:
# min_df=0: no document-frequency cut-off is requested for titles or authors.
gE.embeddText([dblp2,acm], 'id','title','title', min_df=0)
gE.embeddText([dblp2,acm], 'id','authors','authors', min_df=0)

In [None]:
# venue and year are linked by their raw value (year is not binned).
gE.embeddCategorical([dblp2,acm], 'id', 'venue', 'venue')
gE.embeddCategorical([dblp2,acm], 'id', 'year', 'year')

In [None]:
# Writes to the default file name (heteroGraph.csv in the working directory),
# the same file the other dataset sections of this notebook write to.
gE.saveGraph()

## DBLP-Scholar Dataset

In [None]:
# Relative locations of the two DBLP-Scholar publication tables and the match table.
pt1 = "../data/DBLP-Scholar/DBLP1.csv"
pt2 = "../data/DBLP-Scholar/Scholar.csv"
pt3 = "../data/DBLP-Scholar/DBLP-Scholar_perfectMapping.csv"

In [None]:
# NOTE(review): only DBLP1.csv is read with the python parser, presumably
# because of parsing/encoding quirks in that file -- confirm.
dblp1 = pd.read_csv(pt1, engine = 'python')
scholar = pd.read_csv(pt2)
# Rebinds `matchings`; it now holds the DBLP-Scholar match table.
matchings = pd.read_csv(pt3)

In [None]:
display(dblp1.head())
display(scholar.head())

In [None]:
# Fresh embedder; this rebinds gE, so any earlier embedder is no longer reachable by that name.
gE = graphEmbedder()

In [None]:
# Register ids from both tables as entity nodes and store the known matches.
gE.defineKeys(dblp1, 'id')
gE.defineKeys(scholar, 'id')
gE.defineTruth(matchings)

In [None]:
# title and authors pass min_df=0.01 to the vectorizer;
# venue is embedded as text as well, with no cut-off requested (min_df=0).
gE.embeddText([dblp1,scholar], 'id','title','title', min_df=0.01)
gE.embeddText([dblp1,scholar], 'id','authors','authors', min_df=0.01)
gE.embeddText([dblp1,scholar], 'id','venue','venue', min_df=0)

In [None]:
# year is linked by its raw value (not binned).
gE.embeddCategorical([dblp1,scholar], 'id', 'year', 'year')

In [None]:
# Writes to the default file name (heteroGraph.csv in the working directory),
# the same file the other dataset sections of this notebook write to.
gE.saveGraph()

## Amazon-Google Product Dataset

In [None]:
# Relative locations of the two product tables and the match table.
# The spelling "Amzon" in the third path is part of the file name being opened.
pt1 = "../data/Amazon-GoogleProducts/Amazon.csv"
pt2 = "../data/Amazon-GoogleProducts/GoogleProducts.csv"
pt3 = "../data/Amazon-GoogleProducts/Amzon_GoogleProducts_perfectMapping.csv"

In [None]:
# NOTE(review): both product tables are read with the python parser,
# presumably to cope with quoting/encoding quirks -- confirm.
amazon = pd.read_csv(pt1, engine = 'python')
google = pd.read_csv(pt2, engine = 'python')
# Rebinds `matchings`; it now holds the Amazon-Google match table.
matchings = pd.read_csv(pt3)

In [None]:
display(amazon.head())
display(google.head())

In [None]:
# Align column names: the text embedding below reads 'title' from both frames.
google = google.rename(columns={'name':'title'})

In [None]:
# Only the Google price column is cleaned here.
# NOTE(review): amazon.price is used as loaded; presumably it is already
# numeric -- confirm.
google.price = pd.to_numeric(google.price.apply(fixPrice))

In [None]:
# Fresh embedder; this rebinds gE, so any earlier embedder is no longer reachable by that name.
gE = graphEmbedder()

In [None]:
# Register ids from both tables as entity nodes and store the known matches.
gE.defineKeys(amazon, 'id')
gE.defineKeys(google, 'id')
gE.defineTruth(matchings)

In [None]:
# title passes min_df=0.01 to the vectorizer; description requests no cut-off.
gE.embeddText([amazon,google], 'id','title','title', min_df=0.01)
gE.embeddText([amazon,google], 'id','description','description', min_df=0)

In [None]:
gE.embeddCategorical([amazon,google], 'id','manufacturer','manufacturer')

In [None]:
# Defaults apply: bins=10 and equalBinSize=True, i.e. pd.qcut with q=10.
gE.embeddOrdinal([amazon,google], 'id','price','price')

In [None]:
# Writes to the default file name (heteroGraph.csv in the working directory),
# the same file the other dataset sections of this notebook write to.
gE.saveGraph()