# Task
將 node label 進行 lemmatization 與 stemming
然而有可能出現無法對應id 的問題
解方如下：
1. node: 先把 label lemma 後, id 不改以便對應edge table
2. edge: edge_id 對應 node_id, 但將 edge_id 置換成 label
3. 如此即便 id不同也會對應至相同label, 因已把 node label lemma了

In [1]:
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject a CSS rule that sets elements with class "container" to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Read Files

In [3]:
import os
import pandas as pd

# Dataset folders, resolved relative to the current working directory.
# os.path.join replaces hand-written '\\' separators so the same cell works
# on any OS; the trailing '' component keeps the trailing separator that the
# original strings ended with.
path = os.getcwd()
scientometric_coauthor_path = os.path.join(path, 'data', 'scientometrics', 'co-author', '')
scientometric_cooccur_path = os.path.join(path, 'data', 'scientometrics', 'co-occurrence', '')
jasist_coauthor_path = os.path.join(path, 'data', 'jasist', 'co-author', '')
jasist_cooccur_path = os.path.join(path, 'data', 'jasist', 'co-occurrence', '')

def list_files_paths(files_path):
    """List the entries of *files_path* and derive their data file paths.

    Every entry returned by ``os.listdir(files_path)`` is treated as a folder
    that holds a ``network.txt`` and a ``map.txt`` file. Both derived paths
    are printed for each entry.

    Paths are built with ``os.path.join`` so the function works with the
    platform's separator and whether or not *files_path* ends with one.

    Args:
        files_path: Directory whose entries are listed.

    Returns:
        A tuple ``(files, network_file_paths, mapfile_paths)``: the entry
        names as returned by ``os.listdir`` and, in the same order, the
        ``network.txt`` and ``map.txt`` path of each entry.
    """
    files = os.listdir(files_path)
    network_file_paths = []
    mapfile_paths = []
    for file in files:
        network_file = os.path.join(files_path, file, 'network.txt')
        map_file = os.path.join(files_path, file, 'map.txt')
        print(network_file)
        print(map_file)
        network_file_paths.append(network_file)
        mapfile_paths.append(map_file)
    return files, network_file_paths, mapfile_paths

# Choose the dataset to process by passing one of the *_path variables
# defined above; here the JASIST co-occurrence folder is listed.
files, network_file_paths, mapfile_paths = list_files_paths(jasist_cooccur_path)

C:\Users\Liser\Desktop\linchengwei_link_prediction\data\JASIST\co-occurrence\jasist_2010-2015\network.txt
C:\Users\Liser\Desktop\linchengwei_link_prediction\data\JASIST\co-occurrence\jasist_2010-2015\map.txt
C:\Users\Liser\Desktop\linchengwei_link_prediction\data\JASIST\co-occurrence\jasist_2010-2016\network.txt
C:\Users\Liser\Desktop\linchengwei_link_prediction\data\JASIST\co-occurrence\jasist_2010-2016\map.txt
C:\Users\Liser\Desktop\linchengwei_link_prediction\data\JASIST\co-occurrence\jasist_2010-2017\network.txt
C:\Users\Liser\Desktop\linchengwei_link_prediction\data\JASIST\co-occurrence\jasist_2010-2017\map.txt
C:\Users\Liser\Desktop\linchengwei_link_prediction\data\JASIST\co-occurrence\jasist_2010-2018\network.txt
C:\Users\Liser\Desktop\linchengwei_link_prediction\data\JASIST\co-occurrence\jasist_2010-2018\map.txt
C:\Users\Liser\Desktop\linchengwei_link_prediction\data\JASIST\co-occurrence\jasist_2010-2019\network.txt
C:\Users\Liser\Desktop\linchengwei_link_prediction\data\JASIST

In [5]:
def file_transform(file):
    """Read a tab-separated network file into three column lists.

    Each non-empty line is split on tabs; the first three fields are appended
    to the source, target and weight lists respectively (further fields are
    ignored). Any U+FEFF byte-order mark on the first line is removed.

    Empty lines are skipped explicitly. This keeps the three lists aligned
    whether or not the file ends with a newline and when it contains blank
    lines, instead of unconditionally dropping the last source entry.

    Args:
        file: Path of the UTF-8 encoded network file.

    Returns:
        A tuple ``(source_list, target_list, weight_list)`` of string lists.
    """
    source_list, target_list, weight_list = [], [], []
    columns = (source_list, target_list, weight_list)
    with open(file, 'r', encoding='utf-8') as handle:
        lines = handle.read().split('\n')
    for i, line in enumerate(lines):
        if i == 0:
            # The BOM, when present, sits on the first line only.
            line = line.replace('\ufeff', '')
        if not line:
            continue
        # zip stops after three fields, matching the three output columns.
        for column, ele in zip(columns, line.split('\t')):
            column.append(ele)
    return source_list, target_list, weight_list

def to_df(source_list, target_list, weight_list):
    """Assemble the three column lists into one edge DataFrame.

    Args:
        source_list: Values for the ``source`` column.
        target_list: Values for the ``target`` column.
        weight_list: Values for the ``weight`` column.

    Returns:
        A DataFrame with the columns ``source``, ``target`` and ``weight``,
        in that order.
    """
    named_columns = (
        ('source', source_list),
        ('target', target_list),
        ('weight', weight_list),
    )
    df = pd.DataFrame(columns=[name for name, _ in named_columns])
    # Fill the pre-declared columns one after another, in declaration order.
    for name, values in named_columns:
        df[name] = values
    return df

In [6]:
# Load every map.txt as a tab-separated node table, in the order of
# mapfile_paths. `sep` is passed by keyword: pandas only accepts the file
# path positionally in read_csv.
df_nodes = []
for f in mapfile_paths:
    df_node = pd.read_csv(f, sep='\t')
    df_nodes.append(df_node)
# df_node = df_nodes[0]

In [7]:
# Parse every network.txt into an edge table (source, target, weight), in the
# order of network_file_paths, so df_edges[i] is built from network_file_paths[i].
df_edges = []
for f in network_file_paths:
    source_list, target_list, weight_list = file_transform(f)
    df_iter = to_df(source_list, target_list, weight_list)
    df_edges.append(df_iter)
# df_edge = df_edges[0]

# Label Lemmatization

In [143]:
import spacy
# Load the spaCy pipeline named "en_core_web_sm"; label_lemma() calls `nlp`
# on each node label and reads the lemma of every token.
nlp = spacy.load("en_core_web_sm")

In [20]:
import nltk
from nltk.stem.porter import *
# Create the Porter stemmer used by label_lemma() and print what it does to a
# few inflections of "compute".
stemmer = PorterStemmer()
tokens = ['compute', 'computer', 'computed', 'computing']
for token in tokens:
    print(token, '-->', stemmer.stem(token))

compute --> comput
computer --> comput
computed --> comput
computing --> comput


In [1]:
def label_lemma(df_node):
    """Add lemmatized and stemmed versions of every node label.

    Each label is run through the module-level spaCy pipeline ``nlp``. Tokens
    for which ``token.is_alpha`` is false are dropped. ``LEMMA`` is the
    concatenation of the remaining tokens' lemmas and ``STEM`` the
    concatenation of the Porter stems (module-level ``stemmer``) of those
    lemmas.

    Every kept token is followed by one space, including the last one. That
    trailing space is retained on purpose so the produced strings stay
    identical to the ones generated by earlier runs.

    The new columns are built as lists and assigned by position, so the
    result is correct for any index of *df_node*, not only 0..n-1.

    Args:
        df_node: DataFrame with at least the columns ``id`` and ``label``.
            Labels are passed to ``nlp`` unchanged.

    Returns:
        A new DataFrame with the columns ``id``, ``label``, ``LEMMA`` and
        ``STEM``; ``id`` and ``label`` are copied from *df_node*.
    """
    df_new_label = pd.DataFrame(columns=['id', 'label', 'LEMMA', 'STEM'])
    df_new_label['id'] = df_node['id']
    df_new_label['label'] = df_node['label']

    lemma_column, stem_column = [], []
    for text in df_new_label['label']:
        lemmas = [token.lemma_ for token in nlp(text) if token.is_alpha]
        lemma_column.append(''.join(lemma + ' ' for lemma in lemmas))
        stem_column.append(''.join(stemmer.stem(lemma) + ' ' for lemma in lemmas))

    # `.values` strips the helper Series' index so assignment is positional.
    df_new_label['LEMMA'] = pd.Series(lemma_column, dtype=object).values
    df_new_label['STEM'] = pd.Series(stem_column, dtype=object).values
    return df_new_label

### 詞彙控制
0. label --> 4338
1. LEMMA --> 4094
2. STEM --> 4003

In [37]:
# Preview a window of 60 rows of the lemmatized node table, starting at
# start_i, to eyeball the LEMMA / STEM columns.
start_i = 500
end_i = start_i+60
df_new_label[start_i:end_i]

Unnamed: 0,id,label,LEMMA,STEM
500,501,china-related research,china relate research,china relat research
501,502,china-us scientific collaboration,china us scientific collaboration,china us scientif collabor
502,503,chinese academic books,chinese academic book,chines academ book
503,504,chinese herb,chinese herb,chines herb
504,505,chinese immigrant,chinese immigrant,chines immigr
505,506,chinese interpreting studies,chinese interpreting study,chines interpret studi
506,507,chinese lineage,chinese lineage,chines lineag
507,508,chinese medicine,chinese medicine,chines medicin
508,509,chinese patent,chinese patent,chines patent
509,510,chinese universities,chinese university,chines univers


## edge change from id to label

In [105]:
# Take the first edge table from df_edges for interactive inspection.
df_edge = df_edges[0]

In [119]:
# Collects the relabelled edge tables.
df_new_edges = []
def replace_id_to_label(df_new_label, df_edge):
    """Replace the node ids of an edge table with the nodes' STEM labels.

    The id -> STEM mapping is built once as a dict, so each edge is resolved
    with a constant-time lookup instead of a scan of the whole node table.
    Edge rows are read in order regardless of how *df_edge* is indexed.

    Args:
        df_new_label: Node table with the columns ``id`` and ``STEM``. When
            an id occurs more than once, its first row is used.
        df_edge: Edge table with the columns ``source``, ``target`` and
            ``weight``; ``source`` and ``target`` must be convertible with
            ``int()``.

    Returns:
        A new DataFrame with the columns ``source``, ``target`` and
        ``weight`` (object dtype, rows numbered from 0), where source and
        target hold STEM labels and weight is copied from *df_edge*.

    Raises:
        IndexError: If an edge references an id missing from *df_new_label*.
    """
    stem_by_id = {}
    for node_id, stem in zip(df_new_label['id'], df_new_label['STEM']):
        # setdefault keeps the first occurrence of a duplicated id.
        stem_by_id.setdefault(node_id, stem)

    sources, targets, weights = [], [], []
    rows = zip(df_edge['source'], df_edge['target'], df_edge['weight'])
    for raw_source, raw_target, weight in rows:
        try:
            source_label_STEM = stem_by_id[int(raw_source)]
            target_label_STEM = stem_by_id[int(raw_target)]
        except KeyError as missing:
            raise IndexError(
                'edge references node id {} that is not present in '
                'df_new_label'.format(missing.args[0])
            ) from None
        sources.append(source_label_STEM)
        targets.append(target_label_STEM)
        weights.append(weight)

    return pd.DataFrame(
        {'source': sources, 'target': targets, 'weight': weights},
        columns=['source', 'target', 'weight'],
        dtype=object,
    )

In [145]:
# For every dataset: lemmatize/stem its node labels, then rewrite its edge
# table so edges reference STEM labels instead of node ids. Results are kept
# in the same order as df_nodes / df_edges.
df_lemma_nodes, df_new_edges = [], []
for i, node_table in enumerate(df_nodes):
    df_new_label = label_lemma(node_table)
    df_new_edge = replace_id_to_label(df_new_label, df_edges[i])

    df_lemma_nodes += [df_new_label]
    df_new_edges += [df_new_edge]

In [144]:
# Show the dataset folder names; they are used below to name the output files.
files

['jasist_2010-2015',
 'jasist_2010-2016',
 'jasist_2010-2017',
 'jasist_2010-2018',
 'jasist_2010-2019']

In [150]:
# Write one node pickle and one edge pickle per dataset folder name.
# The output folder is built with os.path.join (trailing '' keeps the final
# separator) and created up front, because to_pickle does not create missing
# directories.
output_path = os.path.join(os.getcwd(), "output", "lemma files", "jasist_cooccur", "")
os.makedirs(output_path, exist_ok=True)
for i, v in enumerate(files):
    df_lemma_nodes[i].to_pickle(os.path.join(output_path, "{}_lemma_nodes.pkl".format(v)))
    df_new_edges[i].to_pickle(os.path.join(output_path, "{}_lemma_edges.pkl".format(v)))