In [2]:
# 关于 python 里的map函数
'''
map会根据提供的函数对指定序列逐一进行映射
所以第一个参数function是映射函数，可以用lambda匿名函数实现
第二个参数就是iterable序列
返回新的序列
'''

def square(x):
    return x**2

map(square, [1,2,3,4,5])

<map at 0x7f2be7ab83d0>

In [4]:
# 这行代码就很简洁的实现了列表中每个元素的操作
# 所以map+lambda适合处理列表等序列化元素的操作
list(map(lambda x: x**2, [1,2,3,4,5,6]))

[1, 4, 9, 16, 25, 36]

In [5]:
list(map(lambda x: str(x), [1,2,3,4,5,6]))

['1', '2', '3', '4', '5', '6']

In [6]:
# 关于 * 在python中的作用，乘法就不说了
# 主要是 *在形参和实参中的作用

In [7]:
# 形参中表示可变长参数，即传入不定个数的位置参数，并且是以元组传入，**表示传入不定个数的关键字
def func(*args, **kwargs):
    print(args)
    print(kwargs)

func(1,2,3,4, name = 'Banksy')

(1, 2, 3, 4)
{'name': 'Banksy'}


In [8]:
# 实参中用于参数解包，比如传入一个列表或者元组，前面加*，就是自动解包
a = [1,2,3,4]
print(a)
print(*a)

[1, 2, 3, 4]
1 2 3 4


In [10]:
max(map(lambda x: x*2, [1,2,3,4,5]))

10

In [19]:
# 关于zip函数
# zip函数用于将可迭代的对象作为参数，将对象中的元素打包成一个元组，返回这些元组组成的列表
# 如果参数使用了*号操作符，就是将解压操作，其实也就是实参中的解包作用，将元组解包为列表
a = [1,2,3]
b = [4,5,6]
zip1 = zip(a,b)
print(list(zip1))
a1, b1 = zip(*zip(a,b))
print(list(a1))
print(list(b1))

# 这里的*很适合数据处理操作

[(1, 4), (2, 5), (3, 6)]
[1, 2, 3]
[4, 5, 6]


In [20]:
# list 的 expand 操作
a = [1,2,3,4,5]
b = [4,3,2,1]
b.extend(a)
print(b)

[4, 3, 2, 1, 1, 2, 3, 4, 5]


In [21]:
# 关于csv文件操作

In [40]:
import csv
data = [
    {'name', 'Banksy'},
    {'name', 'MU'},
    {'name', 'Ling'},
    {'name', 'Zhi'},
]
with open('1.csv', 'w') as f:
    writer = csv.writer(f)
    for i in data:
        writer.writerow(i)
    f.close()

In [41]:
import csv
csv_list = []
with open('1.csv', 'r') as f:
    reader = csv.reader(f)
    for i in reader:
        csv_list.extend([i])
    f.close()
print(csv_list)

[['Banksy', 'name'], ['MU', 'name'], ['name', 'Ling'], ['name', 'Zhi']]


将原txt数据读入成字典dict形式
data_dict.get(s)这里要用get函数，dict.get(key)是返回给定key的value，否则若为空就返回None，因为这里是大字典套小字典，所以我们要先给对象subject初始化一个dict
因为返回的data_dict是一个dict，所以如果要遍历这个dict，就用dict.items()，items是一个可迭代对象，常用于dict遍历中

In [21]:
import csv
def data_loader(data_path):
    data_dict = {}
    with open(data_path, 'r') as f:
        csv_reader = csv.reader(f, delimiter='\t')
        for s,r,o in csv_reader:
            try:
                data_dict[s][r].append(o)
            except KeyError:
                if data_dict.get(s) is None:
                    data_dict[s] = dict()
                data_dict[s][r] = [o]
    return data_dict

data = data_loader('./data/train.txt')
for s, ro in data.items():
    break
print(s)
print(type(ro))
print(len(ro))
for r, o in ro.items():
    print("relation:{0}\nobject:{1}".format(r,o))

/m/027rn
<class 'dict'>
12
relation:/location/country/form_of_government
object:['/m/06cx9', '/m/026wp']
relation:/olympics/olympic_participating_country/medals_won./olympics/olympic_medal_honor/olympics
object:['/m/0l6ny', '/m/0kbws', '/m/0kbvb', '/m/06sks6']
relation:/location/statistical_region/gdp_nominal_per_capita./measurement_unit/dated_money_value/currency
object:['/m/09nqf']
relation:/olympics/olympic_participating_country/medals_won./olympics/olympic_medal_honor/medal
object:['/m/02lq67', '/m/02lq5w', '/m/02lpp7']
relation:/base/aareas/schema/administrative_area/administrative_parent
object:['/m/02j71']
relation:/olympics/olympic_participating_country/athletes./olympics/olympic_athlete_affiliation/olympics
object:['/m/06sks6']
relation:/location/statistical_region/gdp_nominal./measurement_unit/dated_money_value/currency
object:['/m/09nqf']
relation:/location/statistical_region/gdp_real./measurement_unit/adjusted_money_value/adjustment_currency
object:['/m/09nqf']
relation:/ba

In [22]:
def build_dataset(data_dict):
    x, y = list(), list()
    e2index, index2e, r2index, index2r = dict(), dict(), dict(), dict()
    for s, ro in data_dict.items():
        try:
            _ = e2index[s]
        except KeyError:
            index = len(e2index)
            e2index[s] = index
            index2e[index] = s
        
        for r, os in ro.items():
            try:
                _ = r2index[r]
            except KeyError:
                index = len(r2index)
                r2index[r] = index
                index2r[index] = r
            
            for o in os:
                try:
                    _ = e2index[o]
                except KeyError:
                    index = len(e2index)
                    e2index[o] = index
                    index2e[index] = o
            
            x.append((s,r))
            y.append(os)
                    
    return x, y, e2index, index2e, r2index, index2r

x, y, e2index, index2e, r2index, index2r = build_dataset(data)


In [91]:
print(len(x))
print(x[1])
print(len(y))
print(y[1])
print('\n')

print(len(e2index))
print(type(e2index))
for entity, index in e2index.items():
    print("index:{1}\t\tentity:{0}".format(entity, index))
    break
print('\n')

print(len(index2r))
print(type(index2r))
for index, relation in index2r.items():
    print("index:{0}\t\trelation:{1}".format(index, relation))
    break

93372
('/m/027rn', '/olympics/olympic_participating_country/medals_won./olympics/olympic_medal_honor/olympics')
93372
['/m/0l6ny', '/m/0kbws', '/m/0kbvb', '/m/06sks6']


14505
<class 'dict'>
index:0		entity:/m/027rn


237
<class 'dict'>
index:0		relation:/location/country/form_of_government


In [23]:
import os
import pickle

def preprocess_train(data_path):
    data_dict = data_loader(data_path)
    x, y, e2index, index2e, r2index, index2r = build_dataset(data_dict)
    
    data = {
        'x': x,
        'y': y,
        'e2index': e2index,
        'index2e': index2e,
        'r2index': r2index,
        'index2r': index2r
    }
    
    print("#entities:{0} ".format(len(e2index)))
    print("#relations:{0} ".format(len(r2index)))
    
    save_data_path = os.path.splitext(data_path)[0] + '.pkl'
    pickle.dump(data, open(save_data_path, 'wb'))

preprocess_train('./data/train.txt')
    

#entities:14505 
#relations:237 


关于__getattr__ 函数的作用
如果对对象进行属性查询，没查到失败了，那么就会自动调用类的__getattr__函数
如果没有定义这个函数，就会抛出AttributeError

In [96]:
class A(object):
    def __init__(self, a, b):
        self.a = a
        self.b = b
    
    def mydefault(self, *args):
        print('default:' + str(args[0]))
 
    def __getattr__(self, name):
        print("other fn:", name)
        return self.mydefault

a1 = A(10, 20)
a1.asd(666) 

other fn: asd
default:666


In [111]:
import pickle
from util import AttributeDict

with open('./data/train.pkl', 'rb') as f:
    train_data = AttributeDict(pickle.load(f))

with open('./data/train.pkl', 'rb') as f:
    train_data1 = pickle.load(f)
    
print(type(train_data))
print(len(train_data))
print(len(train_data.x))
print(len(train_data.e2index))

print(type(train_data1))
print(len(train_data1.x))

<class 'util.AttributeDict'>
6
93372
14505
<class 'dict'>


关于 python 中 __getsttr__ 的使用
当调用对象的属性时，如果有该属性就调用，没有这个属性就调用 __getattr__ 函数
下面就是关于 util.py 中 AttributeDict的样例

In [116]:
class A:
    def __init__(self, name):
        self.name = name
    def __getitem__(self, key):
        return 'Hello! I am from Class A'
    
class F(A):
    def __init__(self, name):
        self.name = name
    __getattr__ = A.__getitem__
    
obj = F('Banksy Test')
print(obj.name)
print(obj.age)

Banksy Test
Hello! I am from Class A


In [24]:
import os

def preprocess_valid(train_path, valid_path):
    x, y = list(), list()
    with open(train_path, 'rb') as f:
        train_data = AttributeDict(pickle.load(f))
    
    data_dict = data_loader(valid_path)
    
    for s, ro in data_dict.items():
        try:
            _ = train_data.e2index[s]
        except KeyError:
            continue
        
        for r, objects in ro.items():
            try:
                _ = train_data.r2index[r]
            except KeyError:
                continue
            
            filtered_objects = list()
            
            for o in objects:
                try:
                    _ = train_data.e2index[o]
                    filtered_objects.append(o)
                except KeyError:
                    continue
            
            x.append((s,r))
            y.append(filtered_objects)
    
    data = {
        'x':x,
        'y':y,
    }
    
    save_file_path = os.path.splitext(valid_path)[0] + '.pkl'
    pickle.dump(data, open(save_file_path, 'wb'))

In [123]:
preprocess_valid('./data/train.pkl', './data/valid.txt')

In [130]:
with open('./data/valid.pkl', 'rb') as f:
    valid_data = AttributeDict(pickle.load(f))
    
print(len(valid_data))
print(type(valid_data))
print(len(valid_data.x))
print(len(valid_data.y))

2
<class 'util.AttributeDict'>
12072
12072


In [131]:
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description = 'Preprocess knowledge graph csv/txt train/valid data.')
    sub_parsers = parser.add_subparsers(help='mode', dest='mode')
    sub_parsers.required = True
    train_parser = sub_parsers.add_parser('train', help='Preprocess a training set')
    valid_parser = sub_parsers.add_parser('valid', help='Preprocess a valid or test set')
    
    train_parser.add_argument('train_path', type=str, help='Path to the raw train dataset (csv or txt file)')
    
    valid_parser.add_argument('train_path', type=str, help='Path to preprocessed train dataset (pkl file)')
    valid_parser.add_argument('valid_path', type=str, help='Path to raw valid dataset (csv or txt file)')
    
    return parser.parse_args()

In [132]:
def main():
    args = parse_args()
    if args.mode == 'train':
        preprocess_train(args.train_path)
    else:
        preprocess_valid(args.train_path, args.valid_path)



In [4]:
import torch
tensor1 = torch.LongTensor((3,5))
print(tensor1)

tensor([3, 5])


##### 关于 torch.autograd.grad() function
def grad(outputs, inputs, grad_outputs=None, retain_graph=None, ...)
下面的例子是对 y = x^2 进行求导， x y都是张量

torch.autograd.backward() 和 torch.autograd.grad() 都是torch里用来进行自动求导的
二者都需要对 标量求导还是向量求导进行区别，对向量求导时grad_outputs要设置为和torch一样size的向量，就相当于是一个weigh matrix权重矩阵


In [12]:
import torch
 
x = torch.randn(3, 4).requires_grad_(True)
for i in range(3):
    for j in range(4):
        x[i][j] = i + j
y = x ** 2

print("x: {0}".format(x))
print("y: {0}".format(y))

weight = torch.ones(y.size())
# 这里 weight matrix 就是和y一样的size

print("weight: {0}".format(weight))

dydx = torch.autograd.grad(outputs=y,
                           inputs=x,
                           grad_outputs=weight,
                           retain_graph=True,
                           create_graph=True,
                           only_inputs=True)
"""(x**2)' = 2*x """
print(dydx[0])

d2ydx2 = torch.autograd.grad(outputs=dydx[0],
                             inputs=x,
                             grad_outputs=weight,
                             retain_graph=True,
                             create_graph=True,
                             only_inputs=True)
print(d2ydx2[0])

x: tensor([[0., 1., 2., 3.],
        [1., 2., 3., 4.],
        [2., 3., 4., 5.]], grad_fn=<CopySlices>)
y: tensor([[ 0.,  1.,  4.,  9.],
        [ 1.,  4.,  9., 16.],
        [ 4.,  9., 16., 25.]], grad_fn=<PowBackward0>)
weight: tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])
tensor([[ 0.,  2.,  4.,  6.],
        [ 2.,  4.,  6.,  8.],
        [ 4.,  6.,  8., 10.]], grad_fn=<MulBackward0>)
tensor([[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]], grad_fn=<MulBackward0>)


##### 关于torch.tensor 和 torch.Tensor 的区别
tensor是一个函数，Tensor是一个类
两者最后都会产生一个tensor张量数据类型
tensor(data, dtype, device, requires_grad)

Tensor是一个class，是torch.FloatTensor的别称，所以可见，Tensor类的对象自动就是Float类型

而tensor函数产生的是int对象，进行张量运算时都需要特别指定dtype=torch.Float


In [14]:
t1 = torch.tensor([
    [0.1, 1.2],
    [2.3, 3.2],
    [2.3, 5.6]
])
t2 = torch.tensor([
    [1, 4],
    [3, 4]
], dtype=torch.float64, device=torch.device('cuda:0'), requires_grad=True)

print(t1)
print(t2)

tensor([[0.1000, 1.2000],
        [2.3000, 3.2000],
        [2.3000, 5.6000]])
tensor([[1., 4.],
        [3., 4.]], device='cuda:0', dtype=torch.float64, requires_grad=True)


In [18]:
data = [1,2,3]
T1 = torch.Tensor(data)
T1.requires_grad = True
print(T1)
print(type(T1))

tensor([1., 2., 3.], requires_grad=True)
<class 'torch.Tensor'>


In [20]:
# python iter() 用来生成迭代器
l1 = [1,2,3,4,5]
for i in iter(l1):
    print(i)

1
2
3
4
5


In [38]:
from torch.utils.data import DataLoader
from dataset import KnowledgeGraphDataset
from util import AttributeDict
import dataset

with open('./data/train.pkl', 'rb') as f:
    train_data = AttributeDict(pickle.load(f))

train_dataset = DataLoader(
    KnowledgeGraphDataset(train_data.x, train_data.y,
                          e2index=train_data.e2index,
                          r2index=train_data.r2index),
    collate_fn=dataset.collate_train, 
    batch_size=256, 
    num_workers=4,
    shuffle=True
)



In [43]:
print(len(train_dataset))


365


In [44]:
import time
from tqdm import tqdm
from tqdm._tqdm import trange

for i in tqdm(range(100)):
    time.sleep(0.01)

100%|██████████| 100/100 [00:01<00:00, 97.51it/s]


In [47]:
from tqdm import tqdm

pbar = tqdm(range(300))

for i in pbar:
    err = 'abc'
    pbar.set_description("Reconstruction loss: {0}".format(err))

Reconstruction loss: abc: 100%|██████████| 300/300 [00:00<00:00, 3400.87it/s]


In [None]:
from torch import optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from tqdm import tqdm, trange

from dataset import KnowledgeGraphDataset
import dataset

##### torch.nn.Embedding 
1. nn.Embedding 可以用于词嵌入，就是形成一个映射词典，维护一个这样的映射查找表
每个词都会有一个嵌入向量对应，输出就是嵌入向量集合;
embed = torch.nn.Embedding(num_vocabulary, embedding_dim)
输入为两个参数，词的数量，每个词映射成的维度dim

2. 然后，输入Embedding的word type必须是LongTensor类型
即， torch.nn.Embedding(torch.LongTensor(word))
3. embedding = nn.Embedding(num, dim) 这是声明一个 Embedding Table，也就是embedding就是一个embedding table，这其实就是实现了一个Embedding weight
4. 然后 embed_vector = embedding(input_data) 实际的操作就是将Embedding weight和input_data进行相乘，得到embedded vector，这里实际就是通过一个简单的多层神经网络实现的运算，no bias & no activation function，得到的embedded vector 本质上是一个稠密的向量

In [79]:
import numpy as np 
import torch
import torch.nn as nn
from torch.autograd import Variable

word2id = {'hello': 0, 'world': 1}
embeds = nn.Embedding(2,5)
hello_id = torch.LongTensor([word2id['hello']])
#hello_id = Variable(hello_id)
'''
以前的版本吧这里还需要，对Tensor进行一次Variable封装，但是现在不需要了，Variable本身已经集成到了Tensor里面去了，所以直接用前面的LongTensor就可以了
'''

hello_embed = embeds(hello_id)
print(hello_embed)


tensor([[ 0.6385, -0.6395, -0.7924,  0.2629, -0.0947]],
       grad_fn=<EmbeddingBackward>)


In [74]:
t1 = torch.Tensor(3,4).long()
print(t1.shape)
print(type(t1))

t2 = torch.LongTensor(3,4)
print(t2.shape)
print(type(t2))

torch.Size([3, 4])
<class 'torch.Tensor'>
torch.Size([3, 4])
<class 'torch.Tensor'>


In [98]:
import torch
import torch.nn as nn

word1 = torch.LongTensor([0, 1, 2])
word2 = torch.LongTensor([3, 5, 3])
embedding = nn.Embedding(6, 4)

print('\nembedding weight:')
print(embedding.weight)
print(embedding.weight.t())
# 这里的embedding.weight 是每次随机生成的

print('word1:')
print(embedding(word1))
print('word2:')
print(embedding(word2))


embedding weight:
Parameter containing:
tensor([[-0.1707,  0.5197,  0.2059,  0.4245],
        [-0.5471, -0.1096,  0.9400, -0.2641],
        [ 0.3535, -0.0306,  0.0616,  2.0480],
        [ 0.3477, -0.3344,  1.6420, -1.2653],
        [ 1.5042,  0.7121,  0.3164, -0.0639],
        [-0.3180, -1.4425, -1.6027, -0.3420]], requires_grad=True)
tensor([[-0.1707, -0.5471,  0.3535,  0.3477,  1.5042, -0.3180],
        [ 0.5197, -0.1096, -0.0306, -0.3344,  0.7121, -1.4425],
        [ 0.2059,  0.9400,  0.0616,  1.6420,  0.3164, -1.6027],
        [ 0.4245, -0.2641,  2.0480, -1.2653, -0.0639, -0.3420]],
       grad_fn=<TBackward>)
word1:
tensor([[-0.1707,  0.5197,  0.2059,  0.4245],
        [-0.5471, -0.1096,  0.9400, -0.2641],
        [ 0.3535, -0.0306,  0.0616,  2.0480]], grad_fn=<EmbeddingBackward>)
word2:
tensor([[ 0.3477, -0.3344,  1.6420, -1.2653],
        [-0.3180, -1.4425, -1.6027, -0.3420],
        [ 0.3477, -0.3344,  1.6420, -1.2653]], grad_fn=<EmbeddingBackward>)


##### 关于tensor调整size的方法，view、squeeze、resize
1. view() 方法
view必须保证形状调整前后元素总数一致，不会修改数据本身，且和原tensor共享内存，更改一个，另外一个随之改变
2. sequeeze是减少一个维度，unsequeeze是添加一个维度
3. resize() 也是修改tensor尺寸，但是和view不同在于，resize会自动重新分配内存，数据也可能会发生变化

In [92]:
import torch
a = torch.arange(0,6).view(2, 3)
print(a)

b = a.view(-1, 2)
print(b)

tensor([[0, 1, 2],
        [3, 4, 5]])
tensor([[0, 1],
        [2, 3],
        [4, 5]])


In [95]:
a = torch.arange(0, 6).view(2, 3)
print(a)
b = a.view(-1, 3)
print(b)
print(b.size())
c = b.unsqueeze(1)
print(c)
print(c.size())

tensor([[0, 1, 2],
        [3, 4, 5]])
tensor([[0, 1, 2],
        [3, 4, 5]])
torch.Size([2, 3])
tensor([[[0, 1, 2]],

        [[3, 4, 5]]])
torch.Size([2, 1, 3])


1. 关于 item() 可以从下面看出，对于一个tensor，如果单纯的获取对应元素，那么这个元素仍然属于tensor类型，但是如果是通过item()获取就得到了一个元素，而且精度更高
在求loss的时候，我们常通过loss.item()来获取具体loss值

In [100]:
t1 = torch.rand(2, 2)
print(t1)
print(t1[1][1])
print(t1[1][1].item())

tensor([[0.2525, 0.8337],
        [0.4825, 0.3356]])
tensor(0.3356)
0.3356214761734009


1. 关于torch scatter_ 这个常用来做one-hot编码，输入三个参数
* 第一个参数是dim，沿哪个维度进行操作，0是列向量，1是行向量
* 第二个参数是index，用来进行索引的，必须也是tensor类型
* 第三个参数是src，是一个标量或者张量

In [102]:
import torch
t1 = torch.zeros(3,5)
print(t1)
t1.scatter_(0, torch.tensor([[0, 0, 0, 0, 0], [2, 2, 2, 2, 2]]), 3)
print(t1)

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])
tensor([[3., 3., 3., 3., 3.],
        [0., 0., 0., 0., 0.],
        [3., 3., 3., 3., 3.]])


In [105]:
batch_size, dim_num = 4, 10
# 下面这行代码，在训练模型时很常用到，常用来生成one-hot编码，生成一个batch_size * 1 的label，dim=1的情况下
label = torch.LongTensor(batch_size, 1).random_() % dim_num
print(label)
t1_onehot = torch.zeros(batch_size, dim_num).scatter_(1, label, 1)
print(t1_onehot)


tensor([[9],
        [1],
        [9],
        [2]])
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]])


In [137]:
from util import AttributeDict
from dataset import KnowledgeGraphDataset
import dataset
from tqdm import tqdm

batchsize = 256

with open('./data/train.pkl', 'rb') as f:
    data = AttributeDict(pickle.load(f))
    
datasource = KnowledgeGraphDataset(data.x, data.y, e2index=data.e2index, r2index=data.r2index)
train_dataset = DataLoader(datasource, collate_fn=dataset.collate_train, batch_size=batchsize,  shuffle=True)

moving_loss = 0

y_multihot = torch.LongTensor(batch_size, len(data.e2index))

# for s, r, objects in iter(train_dataset):
#     print(s)
#     break


TypeError: 'NoneType' object is not subscriptable

In [115]:
print(len(data))
print(len(data.x))
print(len(data.e2index))
print(len(data.r2index))

6
93372
14505
237


In [117]:
print(len(datasource))
print(train_dataset)

93372
<torch.utils.data.dataloader.DataLoader object at 0x7fafcbb8b490>


In [119]:
y_multihot = torch.LongTensor(256, len(data.e2index))
print(y_multihot.size())

torch.Size([256, 14505])
