# XML2DataFrame

In [None]:
#Convert XML data to DataFrame Data
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import xml.etree.ElementTree as ET

#parameter
dataset = 'physics'
base_path = '/data/pengyu/tag_rec/%s/'%dataset
data_path = '/data/pengyu/tag_rec/%s/Posts.xml'%dataset

def xml2df(data_path):
    '''
    Input: XML data path
    Output: Dataframe data that we need
    '''
    tree = ET.ElementTree(file=data_path)
    root = tree.getroot()
    body=[]
    title=[]
    tag=[]
    userid=[]
    for child_of_root in root:
#         print(child_of_root.attrib)
#         break
        title.append(np.nan if child_of_root.get('Title')==None else child_of_root.attrib['Title'])
        body.append(np.nan if child_of_root.get('Body')==None else child_of_root.attrib['Body'])
        tag.append(np.nan if child_of_root.get('Tags')==None else child_of_root.attrib['Tags'])
        userid.append(np.nan if child_of_root.get('OwnerUserId')==None else child_of_root.attrib['OwnerUserId'])
    df = pd.DataFrame(
        {
          'title':title,
          'body':body,
          'tag':tag,
          'userid':userid
        }
    )
    return df

df = xml2df(data_path)
df=df.dropna()
df['body'] = df['title']+df['body']
user_num = len(df['userid'].drop_duplicates())
print('User Number：', user_num)
df = df[['body','tag']]
df = shuffle(df, random_state=42)
df = df.iloc[:,:]
print('original data Number：', len(df))

# Remove Html Token

In [None]:
import re
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 

wnl = WordNetLemmatizer() 
sw=stopwords.words('english')

def src_remove_html(sentence):
    temp = sentence
    temp = re.sub(r'<[^>]+>', ' ',str(temp))#delete <*>
    temp = re.sub(r'\n', ' ',str(temp))#delete \n
    temp = re.sub(r'\ \d+\ ', ' ',str(temp))#remove int number
    temp = re.sub(r'\ \d+\.\d+\ ', ' ',str(temp))#remove float number
    temp = re.sub(r'\s+[a-zA-Z]\s+', ' ',str(temp))#remove singel letter  
    temp = re.sub(r'[\.,!?]\s', ' ',str(temp))#remove , .for sentence 
    temp = re.sub(r'\s[\(\[]|[\)\}]\s', ' ',str(temp))#remove ()[]for sentence 
    temp = re.sub(r'\:\s', ' ',str(temp))#remove :for sentence 
    temp = re.sub(r'\s+', ' ',str(temp))#merge many ' '
    temp = temp.lower()
    doc = temp.split()
    temp = [wnl.lemmatize(word) for word in doc if word not in sw]
    temp = ' '.join(temp)   
    return temp

def trg_remove_symbol(sentence):
    temp = sentence
    #temp = re.sub(r'[<>]', ' ',str(temp))
    temp = re.sub(r'><', ';',str(temp))
    temp = re.sub(r'[<>]', '',str(temp))
    return temp    

src = []
trg = []

for src_line, trg_line in df.values:
    src_line = src_remove_html(src_line)    
    trg_line = trg_remove_symbol(trg_line)
    #delete Na row
    if len(src_line)<2 or len(trg_line)<1:
        print('=========================================')
        print(src_line)
        print(trg_line)
        continue
    src.append(src_line)
    trg.append(trg_line)
    
assert len(src) == len(trg), \
    'the number of records in source and target are not the same'    
print('data length is %d'%len(trg))

# split and save

In [None]:
data_length = len(src)
train_length = int(data_length*.8)
valid_length = int(data_length*.9)
train_src, valid_src, test_src = src[:train_length], src[train_length:valid_length],\
                                 src[valid_length:]
train_trg, valid_trg, test_trg = trg[:train_length], trg[train_length:valid_length], \
                                 trg[valid_length:]

# i = 177
# print(df.iloc[i,0])
# print(src[i])
# print(tokenized_src[i])
# print(trg[i])
# print(tokenized_trg[i])

In [4]:
def save_data(data, name):
    with open(base_path+'%s.txt'%name, 'w') as f:
        f.writelines("%s\n" % line for line in data)    

save_data(train_src, 'train_src')
save_data(valid_src, 'valid_src')
save_data(test_src, 'test_src')
save_data(train_trg, 'train_trg')
save_data(valid_trg, 'valid_trg')
save_data(test_trg, 'test_trg')

# dictionary

In [5]:
import os
dataset = 'physics'
vocab_size = 50000
#tag_size = 
bow_vocab = 10000
max_src_len = 200
max_trg_len = 5

base_path = '/data/pengyu/tag_rec/%s/'%dataset

train_src = base_path + 'train_src.txt'
train_trg = base_path + 'train_trg.txt'
valid_src = base_path + 'valid_src.txt'
valid_trg = base_path + 'valid_trg.txt'
test_src = base_path + 'test_src.txt'
test_trg = base_path + 'test_trg.txt'



In [6]:
def read_src_trg_files(tag="train"):
    '''
    Read data according to the tag (train/valid/test), return a list of (src, trg) pairs
    '''
    if tag == "train":
        src_file = train_src
        trg_file = train_trg
    elif tag == "valid":
        src_file = valid_src
        trg_file = valid_trg
    else:
        assert tag == "test"
        src_file = test_src
        trg_file = test_trg

    tokenized_src = []
    tokenized_trg = []

    for src_line, trg_line in zip(open(src_file, 'r'), open(trg_file, 'r')):
        # process src and trg line
        src_word_list = src_line.strip().split(' ')
        trg_list = trg_line.strip().split(';')  # a list of target sequences
        #trg_word_list = [trg.strip().split(' ') for trg in trg_list]

        # Truncate the sequence if it is too long
        src_word_list = src_word_list[:max_src_len]
        trg_word_list = trg_list[:max_trg_len]

        # Append the lines to the data
        tokenized_src.append(src_word_list)
        tokenized_trg.append(trg_word_list)

    assert len(tokenized_src) == len(tokenized_trg), \
        'the number of records in source and target are not the same'

    tokenized_pairs = list(zip(tokenized_src, tokenized_trg))
    print("Finish reading %d lines of data from %s and %s" % (len(tokenized_src), src_file, trg_file))
    return tokenized_pairs

# Tokenize training data, return a list of tuple, (src_word_list, [trg_1_word_list, trg_2_word_list, ...])
tokenized_train_pairs = read_src_trg_files("train")

Finish reading 139085 lines of data from /data/pengyu/tag_rec/physics/train_src.txt and /data/pengyu/tag_rec/physics/train_trg.txt


In [7]:
from collections import Counter

def build_vocab(tokenized_src_trg_pairs):
    '''
    Build the vocabulary from the training (src, trg) pairs
    :param tokenized_src_trg_pairs: list of (src, trg) pairs
    :return: word2idx, idx2word, token_freq_counter
    '''
    token_freq_counter = Counter()
    token_freq_counter_tag = Counter()
    for src_word_list, trg_word_lists in tokenized_src_trg_pairs:
        token_freq_counter.update(src_word_list)
        token_freq_counter_tag.update(trg_word_lists)

    # Discard special tokens if already present
    special_tokens = ['<pad>', '<unk>']
    num_special_tokens = len(special_tokens)

    for s_t in special_tokens:
        if s_t in token_freq_counter:
            del token_freq_counter[s_t]

    word2idx = dict()
    idx2word = dict()
    for idx, word in enumerate(special_tokens):
        # '<pad>': 0, '<bos>': 1, '<eos>': 2, '<unk>': 3
        word2idx[word] = idx
        idx2word[idx] = word

    sorted_word2idx = sorted(token_freq_counter.items(), key=lambda x: x[1], reverse=True)

    sorted_words = [x[0] for x in sorted_word2idx]

    for idx, word in enumerate(sorted_words):
        word2idx[word] = idx + num_special_tokens

    for idx, word in enumerate(sorted_words):
        idx2word[idx + num_special_tokens] = word

    tag2idx = dict()
    idx2tag = dict()

    sorted_tag2idx = sorted(token_freq_counter_tag.items(), key=lambda x: x[1], reverse=True)

    sorted_tags = [x[0] for x in sorted_tag2idx]

    for idx, tag in enumerate(sorted_tags):
        tag2idx[tag] = idx

    for idx, tag in enumerate(sorted_tags):
        idx2tag[idx] = tag        
             
    return word2idx, idx2word, token_freq_counter, tag2idx, idx2tag

# Build vocabulary from training src and trg
print("Building vocabulary from training data")
word2idx, idx2word, token_freq_counter, tag2idx, idx2tag = build_vocab(tokenized_train_pairs)
print("Total vocab_size: %d, predefined vocab_size: %d" % (len(word2idx), vocab_size))
print("Total tag_size: %d" %len(tag2idx))

Building vocabulary from training data
Total vocab_size: 600758, predefined vocab_size: 50000
Total tag_size: 893


In [8]:
import gensim

def make_bow_dictionary(tokenized_src_trg_pairs, base_path, bow_vocab):
    '''
    Build bag-of-word dictionary from tokenized_src_trg_pairs
    :param tokenized_src_trg_pairs: a list of (src, trg) pairs
    :param data_dir: data address, for distinguishing Weibo/Twitter/StackExchange
    :param bow_vocab: the size the bow vocabulary
    :return: bow_dictionary, a gensim.corpora.Dictionary object
    '''
    doc_bow = []
    tgt_set = set()

    for src, tgt in tokenized_src_trg_pairs:
        cur_bow = []
        cur_bow.extend(src)
        cur_bow.extend(tgt)
        doc_bow.append(cur_bow)
        
    bow_dictionary = gensim.corpora.Dictionary(doc_bow)
    # Remove single letter or character tokens
    len_1_words = list(filter(lambda w: len(w) == 1, bow_dictionary.values()))
    bow_dictionary.filter_tokens(list(map(bow_dictionary.token2id.get, len_1_words)))

    def read_stopwords(fn):
        return set([line.strip() for line in open(fn, encoding='utf-8') if len(line.strip()) != 0])

    # Read stopwords from file (bow vocabulary should not contain stopwords)
    STOPWORDS = gensim.parsing.preprocessing.STOPWORDS
    #stopwords1 = read_stopwords(base_path+"stopwords/stopwords.en.txt")
    #stopwords2 = read_stopwords(base_path+"stopwords/stopwords.SE.txt")
    final_stopwords = set(STOPWORDS)#.union(stopwords1).union(stopwords2)

    bow_dictionary.filter_tokens(list(map(bow_dictionary.token2id.get, final_stopwords)))

    print("The original bow vocabulary: %d" % len(bow_dictionary))
    bow_dictionary.filter_extremes(no_below=3, keep_n=bow_vocab)
    bow_dictionary.compactify()
    bow_dictionary.id2token = dict([(id, t) for t, id in bow_dictionary.token2id.items()])
    # for debug
    sorted_dfs = sorted(bow_dictionary.dfs.items(), key=lambda x: x[1], reverse=True)
    sorted_dfs_token = [(bow_dictionary.id2token[id], cnt) for id, cnt in sorted_dfs]
    print('The top 50 non-stop-words: ', sorted_dfs_token[:200])
    return bow_dictionary

# Build bag-of-word dictionary from training data
print("Building bow dictionary from training data")
bow_dictionary = make_bow_dictionary(tokenized_train_pairs, base_path, bow_vocab)
print("Bow dict_size: %d after filtered" % len(bow_dictionary))

Building bow dictionary from training data
The original bow vocabulary: 600653
The top 50 non-stop-words:  [('question', 37446), ('know', 29736), ('like', 25994), ('energy', 24199), ('time', 24129), ("i'm", 22850), ('field', 22588), ('equation', 22317), ('understand', 20789), ('point', 19024), ('mean', 17707), ('given', 17707), ('quantum-mechanics', 17470), ('particle', 17378), ('force', 17363), ('way', 17356), ('case', 16653), ('state', 16480), ('mass', 16215), ('example', 16112), ('problem', 15481), ('homework-and-exercises', 15108), ('different', 15105), ('use', 14224), ('possible', 14176), ('answer', 13681), ('quantum', 13621), ('following', 13397), ('space', 13103), ('constant', 13006), ('$$', 12993), ('theory', 12900), ('light', 12879), ('think', 12760), ('physic', 12752), ('term', 12595), ('function', 12540), ('velocity', 12334), ('work', 12280), ('change', 12256), ('newtonian-mechanics', 11728), ('need', 11199), ('consider', 10968), ('correct', 10926), ('electromagnetism', 1082

In [9]:
import pickle

def save_dict(mydict, dict_name):
    file_name=base_path+dict_name
    f=open(file_name,'wb')
    pickle.dump(mydict,f)
    f.close()
    print('Dict %s Saved'%dict_name)
    
save_dict(word2idx, 'word2idx')
save_dict(idx2word, 'idx2word')
save_dict(tag2idx, 'tag2idx')
save_dict(idx2tag, 'idx2tag')
save_dict(token_freq_counter, 'token_freq_counter')
save_dict(bow_dictionary, 'bow_dictionary')

Dict word2idx Saved
Dict idx2word Saved
Dict tag2idx Saved
Dict idx2tag Saved
Dict token_freq_counter Saved
Dict bow_dictionary Saved


# generate final data

In [10]:
import pickle
vocab_size = 50000
#tag_size = 
bow_vocab = 10000
max_src_len = 200
max_trg_len = 5

def load_dict(dict_name):
    file_name=base_path+dict_name
    f=open(file_name,'rb')
    mydict=pickle.load(f)
    f.close()
    return mydict

bow_dictionary = load_dict('bow_dictionary')
word2idx = load_dict('word2idx')
tag2idx = load_dict('tag2idx')

In [11]:
import numpy as np

def read_src_trg_files(tag="train"):
    '''
    Read data according to the tag (train/valid/test), return a list of (src, trg) pairs
    '''
    if tag == "train":
        src_file = train_src
        trg_file = train_trg
    elif tag == "valid":
        src_file = valid_src
        trg_file = valid_trg
    else:
        assert tag == "test"
        src_file = test_src
        trg_file = test_trg

    tokenized_src = []
    tokenized_trg = []
    avg_post = []
    avg_tag = []

    for src_line, trg_line in zip(open(src_file, 'r'), open(trg_file, 'r')):
        # process src and trg line
        src_word_list = src_line.strip().split(' ')
        trg_list = trg_line.strip().split(';')  # a list of target sequences

        # Truncate the sequence if it is too long
        avg_post.append(len(src_word_list))
        avg_tag.append(len(trg_list))
        src_word_list = src_word_list[:max_src_len]
        trg_word_list = trg_list[:max_trg_len]

        # Append the lines to the data
        tokenized_src.append(src_word_list)
        tokenized_trg.append(trg_word_list)

    assert len(tokenized_src) == len(tokenized_trg), \
        'the number of records in source and target are not the same'

    tokenized_pairs = list(zip(tokenized_src, tokenized_trg))
    print("Finish reading %d lines of data from %s and %s" % (len(tokenized_src), src_file, trg_file))
    print('avg_post',np.mean(avg_post))
    print('avg_tag',np.mean(avg_tag))
    return tokenized_pairs

def build_dataset(src_trgs_pairs, word2idx, tag2idx, bow_dictionary, tag="train"):
    '''
    build train/valid/test dataset
    '''
    text = []
    label = []
    bow = [] 
    for idx, (source, targets) in enumerate(src_trgs_pairs):
        src = [word2idx[w] if w in word2idx and word2idx[w] < vocab_size
               else word2idx['<unk>'] for w in source]
        trg = [tag2idx[w] for w in targets if w in tag2idx]
        src_bow = bow_dictionary.doc2bow(source)
        text.append(src)
        label.append(trg)
        bow.append(src_bow)
        
    bow = BowFeature(bow, bow_dictionary)
    text = padding(text)
    label =  [encode_one_hot(inst, len(tag2idx), label_from=0) for inst in label] 
    return np.array(text), np.array(label), np.array(bow)

def padding(input_list):
    input_list_lens = [len(l) for l in input_list]
    max_seq_len = max(input_list_lens)
    padded_batch = word2idx['<pad>'] * np.ones((len(input_list), max_seq_len), dtype=np.int)

    for j in range(len(input_list)):
        current_len = input_list_lens[j]
        padded_batch[j][:current_len] = input_list[j]

    return padded_batch

def BowFeature(input_list, bow_dictionary):
    '''
    generate Bow Feature for train\val\test src
    '''
    bow_vocab = len(bow_dictionary)
    res_src_bow = np.zeros((len(input_list), bow_vocab), dtype=np.int)
    for idx, bow in enumerate(input_list):
        bow_k = [k for k, v in bow]
        bow_v = [v for k, v in bow]
        res_src_bow[idx, bow_k] = bow_v
    return res_src_bow

def encode_one_hot(inst, vocab_size, label_from):
    '''
    one hot for a value x, int, x>=1
    '''
    one_hots = np.zeros(vocab_size, dtype=np.float32)
    for value in inst:
        one_hots[value-label_from]=1
    return one_hots

In [12]:
# Build training set
tokenized_train_pairs = read_src_trg_files("train")
train_text, train_label, train_bow = build_dataset(tokenized_train_pairs, word2idx, tag2idx, bow_dictionary,"train")

tokenized_valid_pairs = read_src_trg_files('valid')
valid_text, valid_label, valid_bow = build_dataset(tokenized_valid_pairs, word2idx, tag2idx, bow_dictionary,"valid")

tokenized_test_pairs = read_src_trg_files('test')
test_text, test_label, test_bow = build_dataset(tokenized_test_pairs, word2idx, tag2idx, bow_dictionary,"test")

Finish reading 139085 lines of data from /data/pengyu/tag_rec/physics/train_src.txt and /data/pengyu/tag_rec/physics/train_trg.txt
avg_post 75.54165438400977
avg_tag 3.1780206348635724
Finish reading 17386 lines of data from /data/pengyu/tag_rec/physics/valid_src.txt and /data/pengyu/tag_rec/physics/valid_trg.txt
avg_post 75.41107787875302
avg_tag 3.176176233751294
Finish reading 17386 lines of data from /data/pengyu/tag_rec/physics/test_src.txt and /data/pengyu/tag_rec/physics/test_trg.txt
avg_post 76.05734499022202
avg_tag 3.156102611296445


# save final data

In [13]:

def save_data(name):
    data = eval(name)
    
    print('%s saved, shape:'%name, data.shape)
    
def save_data(data, name):
    path = save_path+'/%s.npy'%name
    np.save(path, data, allow_pickle=True) 
    print('%s Saved'%path)

save_path = base_path+ 'processed_data'
if not os.path.exists(save_path):
    os.mkdir(save_path)
    
save_data(train_text, 'train_text')
save_data(train_label, 'train_label')
save_data(train_bow, 'train_bow')
save_data(valid_text, 'valid_text')
save_data(valid_label, 'valid_label')
save_data(valid_bow, 'valid_bow')
save_data(test_text, 'test_text')
save_data(test_label, 'test_label')
save_data(test_bow, 'test_bow')

/data/pengyu/tag_rec/physics/processed_data/train_text.npy Saved
/data/pengyu/tag_rec/physics/processed_data/train_label.npy Saved
/data/pengyu/tag_rec/physics/processed_data/train_bow.npy Saved
/data/pengyu/tag_rec/physics/processed_data/valid_text.npy Saved
/data/pengyu/tag_rec/physics/processed_data/valid_label.npy Saved
/data/pengyu/tag_rec/physics/processed_data/valid_bow.npy Saved
/data/pengyu/tag_rec/physics/processed_data/test_text.npy Saved
/data/pengyu/tag_rec/physics/processed_data/test_label.npy Saved
/data/pengyu/tag_rec/physics/processed_data/test_bow.npy Saved


# TATR

In [None]:
#BoW feature
train_bow_data = data_utils.TensorDataset(torch.from_numpy(train_bow).type(torch.float32))
val_bow_data = data_utils.TensorDataset(torch.from_numpy(valid_bow).type(torch.float32))                                          
test_bow_data = data_utils.TensorDataset(torch.from_numpy(test_bow).type(torch.float32))

train_bow_loader = data_utils.DataLoader(train_bow_data, batch_size, shuffle=True, drop_last=True)
valid_bow_loader = data_utils.DataLoader(val_bow_data, batch_size, shuffle=True, drop_last=True)
test_bow_loader = data_utils.DataLoader(test_bow_data, batch_size, drop_last=True)

#Nomral feature and label
train_data = data_utils.TensorDataset(torch.from_numpy(train_bow).type(torch.float32),
                                      torch.from_numpy(train_text).type(torch.LongTensor),
                                      torch.from_numpy(train_label).type(torch.LongTensor))
val_data = data_utils.TensorDataset(torch.from_numpy(valid_bow).type(torch.float32),
                                    torch.from_numpy(valid_text).type(torch.LongTensor),
                                      torch.from_numpy(valid_label).type(torch.LongTensor))                                          
test_data = data_utils.TensorDataset(torch.from_numpy(test_bow).type(torch.float32),
                                     torch.from_numpy(test_text).type(torch.LongTensor),
                                     torch.from_numpy(test_label).type(torch.LongTensor))

train_loader = data_utils.DataLoader(train_data, batch_size, shuffle=True, drop_last=True)
val_loader = data_utils.DataLoader(val_data, batch_size, shuffle=True, drop_last=True)
test_loader = data_utils.DataLoader(test_data, batch_size, drop_last=True)

label_num = int(train_label.max())
vocab_size = int(train_text.max())+2# +2 Don't Know Why
fp('label_num')
fp('vocab_size')
print("load done")

return train_loader, val_loader, test_loader, label_num, vocab_size, train_bow_loader, \
valid_bow_loader, test_bow_loader, bow_dictionary