In [None]:
import numpy as np
import pandas as pd
import torch
import os
import torch.nn as nn
from torch.utils import data
from torch.optim import lr_scheduler
import requests
from bs4 import BeautifulSoup as bs
from transformers import AutoModel
from sentence_transformers import SentenceTransformer, util

检测当前设备，若显卡可用则设置device=cuda，否则设为cpu
example用于决定读取数据的大小

In [None]:
def try_gpu(i=0):
    """Return the i-th CUDA device when it exists, otherwise the CPU device.

    :param i: zero-based index of the requested GPU.
    :return: ``torch.device('cuda:<i>')`` if torch reports at least ``i + 1``
        CUDA devices, else ``torch.device('cpu')``.
    """
    enough_gpus = torch.cuda.device_count() >= i + 1
    return torch.device(f'cuda:{i}') if enough_gpus else torch.device('cpu')

# Pick the compute device once; later cells move tensors and models onto it.
device = try_gpu()
# Switch for working on a small sample; only referenced by commented-out code
# in the data-loading cell.
example = False
# Floating-point dtype constant.
# NOTE(review): not referenced anywhere else in the visible notebook.
typ = torch.float32
device

# 数据

## 设置标签

获取大小类，通过对网页的信息处理获得大小类标签相关信息

In [None]:
# Load the Hugging Face "datasets" listing page, preferring the local snapshot.
if os.path.exists(r'Hugging _dataset.html'):
    with open(r'Hugging _dataset.html','r',encoding='utf-8') as f:
        dataset_html = f.read()
else:
    # TLS certificate verification is left enabled (the request used to pass
    # verify=False), a timeout prevents the cell from hanging forever, and an
    # HTTP error status fails loudly instead of parsing an error page.
    dataset_web_respond = requests.get(url=r'https://huggingface.co/datasets', timeout=30)
    dataset_web_respond.raise_for_status()
    dataset_html = dataset_web_respond.text
soup = bs(dataset_html,'html.parser')
tags = soup.select('div.mb-20')[0].contents

# Keep only the child elements whose class list contains 'mb-3' (one per
# large class). Text nodes have no `attrs` and elements may have no class;
# both are skipped explicitly rather than through a blanket try/except.
btags = [
    node for node in tags
    if 'mb-3' in ((getattr(node, 'attrs', None) or {}).get('class') or [])
]

# cls:  large-class label -> list of its small-class labels (page order)
# scls: flat list of all small-class labels
cls = dict()
scls = []
for btag in btags:
    blabel = btag.div.string.rstrip('\t').rstrip('\n')
    temp = [a.span.string for a in btag.find_all('a')]
    scls.extend(temp)
    cls[blabel] = temp

# Total number of small classes across all large classes.
s_class_count = sum(len(v) for v in cls.values())
print('小类总数',s_class_count)

In [None]:
# Large-class labels in page order. `btags` (the per-large-class elements) was
# already extracted by the previous cell, so the page is not loaded and parsed
# a second time here.
bcls = [btag.div.string.rstrip('\t').rstrip('\n') for btag in btags]

print(len(bcls))

# One-hot vector per large class, keyed by the label with spaces replaced by
# underscores. The identity matrix is sized from the data instead of a
# hard-coded 7, so a change in the number of large classes cannot overflow it.
label = np.eye(len(bcls), dtype=np.float64)
bc2h = {name.replace(' ','_'): label[idx] for idx, name in enumerate(bcls)}
bc2h

### one_hot编码
发现小类一共46个，则可以用维度46的向量作为标签
通过得到的映射关系构建c2h字典，将小类名映射到标签

In [None]:
# Map every small-class name (spaces replaced by underscores) to one row of a
# 46x46 identity matrix, walking the large classes in dictionary order.
c2h = dict()
label = np.eye(46,46,dtype=np.float64)
small_names = [name for names in cls.values() for name in names]
for k, name in enumerate(small_names):
    c2h[name.replace(' ','_')] = label[k]
c2h['Graph_Machine_Learning'].shape

获取数据，读取上次爬取到的数据

In [None]:
# Load the previously scraped dataset table (tab-separated file).
df = pd.read_csv('data2.csv',sep='\t')
# Optional truncation to the first 10 rows for quick experiments (disabled).
# if example:
#     df = df[:10]
df

## 数据预处理
- 空缺值处理
- 文本向量化
- dataframe格式变换

空缺值处理

- 获取到的网页数据有部分缺失，使用-1替换
- 对于数据简介，其中有许多无意义的换行符号，手动进行处理
- 对于url,前面的https://huggingface.co/datasets/无意义，手动删除

In [None]:
# Remove the literal two-character sequences "\n" and "\t" from the last column
# and the common URL prefix from the first column. The vectorised .str
# accessor passes missing cells through untouched, whereas calling
# str.replace on each cell raised on NaN before fillna had a chance to run.
df.iloc[:,-1] = df.iloc[:,-1].str.replace('\\n','',regex=False).str.replace('\\t','',regex=False)
df.iloc[:,0] = df.iloc[:,0].str.replace('https://huggingface.co/datasets/','',regex=False)
# Missing values become -1; rebinding instead of inplace=True keeps the cell
# safe to re-run.
df = df.fillna(value=-1)
df.head(10)

文本向量化


借助网络上预处理好的模型将数据简介和数据名向量化，从而可以被处理


选择的模型可以将文本转为512维的向量

In [None]:
def generate_embeddings():
    """
    Load the pretrained text-embedding model.

    :return: the object returned by ``AutoModel.from_pretrained`` for
        'jinaai/jina-embeddings-v2-small-en'.
    """
    # NOTE(review): trust_remote_code=True lets the checkpoint's own Python
    # code run locally; keep it only for sources you trust.
    return AutoModel.from_pretrained('jinaai/jina-embeddings-v2-small-en', trust_remote_code=True)


embedding_model = generate_embeddings()

def m_embedding(text, max_length=3487):
    """
    Turn text into an embedding with the module-level ``embedding_model``.

    :param text: input forwarded unchanged to ``embedding_model.encode``.
    :param max_length: value forwarded as ``encode(max_length=...)``. The
        default keeps the previously hard-coded 3487, so existing callers
        behave exactly as before.
    :return: whatever ``embedding_model.encode`` returns.
        NOTE(review): the earlier docstring said "tensor"; confirm the
        actual type against the model's ``encode`` documentation.
    """
    return embedding_model.encode(text, max_length=max_length)

计算简介最大长度，作为参数传给编码器

In [None]:
# Longest description, measured in whitespace-separated words.
# Cells that are not strings (e.g. the -1 placeholder written by fillna) are
# skipped; previously they crashed the loop because ints have no .split().
m = max(
    (len(text.split()) for text in df.iloc[:,-1].to_list() if isinstance(text, str)),
    default=0,
)
print('max_length',m)

In [None]:
# Quick look at the table dimensions as (rows, columns).
df.shape

In [None]:
# Encode the first column and the last column (dataset name and description).
def _embed_if_text(cell):
    """Embed plain ``str`` cells with m_embedding; return any other value unchanged."""
    if type(cell) is str:
        return m_embedding(cell)
    return cell


x_encode = df.iloc[:,[0,-1]].applymap(_embed_if_text)

In [None]:
# Convert to numpy and flatten each row's two embeddings into one vector.
# np.append ravels and concatenates its two arguments; the per-row arrays are
# stacked directly, without the unused scratch list and the element-by-element
# conversion to Python lists that the loop used to perform.
x = x_encode.to_numpy()
x = np.array([np.append(row[0], row[-1]) for row in x])
x1 = x
print(x1.shape,x1.dtype)
x1

处理下载次数等数据信息

In [None]:
# Columns 1..3 of the table: the three per-dataset numeric statistics.
x_ = df.iloc[:,1:4]
def f(x):
    """Convert ``x`` to float; return -1 when the conversion raises.

    :param x: any value accepted (or rejected) by ``float()``.
    :return: ``float(x)`` on success, otherwise the integer -1.
    """
    try:
        return float(x)
    except Exception:
        return -1
    
# Apply f to every cell (unparseable values become -1), replace any remaining
# NaN with -1, and convert to a numpy array.
x2 = x_.applymap(f).fillna(value=-1).to_numpy()
print(x2.shape,x2.dtype)
x2

合并文本编码后的数据


长度为512*2+3

In [None]:
# Column-wise join: the 3 numeric statistics first, then the text embeddings.
x = np.concatenate((x2,x1), axis=1)
print(type(x),x.shape,x.dtype)
x

制作标签

使用之前得到的c2h小类转为one_hot编码

最终得到46维向量

In [None]:
# Display the table again before building the labels.
df

In [None]:
# Look up the one-hot vector of each row's small class ('sclass' column) in
# c2h and stack the vectors row-wise into a 2-D label matrix.
y = np.vstack([np.array(c2h[name]) for name in df.loc[:,'sclass']])
print(type(y),y.shape,y.dtype)
y

处理缺失值

由于部分数据集没有相关数据，这里把前 3 列中的 -1 和 0 都视为缺失值：先为这 3 列各增加一列缺失指示（缺失为 1，否则为 0），再将缺失值统一填充为 -1

In [None]:
# Treat -1 and 0 in the three statistic columns as missing.
# `stats` is a view into x, so the masked assignment edits x itself.
stats = x[:, :3]
stats[(stats == -1) | (stats == 0)] = np.nan

# Prepend one 0/1 missing-indicator column per statistic, then fill the
# missing statistics with -1.
x = pd.DataFrame(x)
missing_flags = x.iloc[:, :3].isna().astype(float)
x = pd.concat([missing_flags, x.fillna(-1)], axis=1)
x[:10]

格式转换，将数据转为可以用于训练的格式并存储起来

In [None]:
# NOTE(review): `by` is not assigned by any cell above this one in the visible
# notebook (it is first built much further down), so this line appears to
# depend on out-of-order execution — confirm.
by.shape

In [None]:
# Convert features and labels to tensors, move them to `device`, and save them.
# NOTE(review): `by` is not assigned by any cell above this one in the visible
# notebook; where it is built further down it is a DataFrame, which
# torch.from_numpy does not accept. Confirm where `by` should come from.
x = np.array(x)
x = torch.from_numpy(x)
y = torch.from_numpy(y)
by = torch.from_numpy(by)
x = x.to(device)
y = y.to(device)
by = by.to(device)
# Persisted as the 3-tuple (x, y, by).
torch.save((x,y,by),'bdata.data')
x.shape,y.shape,x.dtype,y.dtype

由于发现直接编码后的数据用来训练网络效果很差，所以继续将46个小类标签也编码，随后计算他们相关度，将512*2维转为46*2维

In [None]:
# Column layout of the feature matrix built by the cells above:
#   [0:3)      missing-value indicators
#   [3:6)      numeric statistics
#   [6:518)    first text embedding  (table column 0)
#   [518:1030) second text embedding (last table column)
N_STAT_COLS = 6
EMB_DIM = 512
N_FEATURES = N_STAT_COLS + 2 * EMB_DIM   # 1030
N_SMALL_CLASSES = 46


def _as_frame(values):
    """Wrap a tensor / array / DataFrame in a DataFrame (tensors are copied to host memory first)."""
    if torch.is_tensor(values):
        values = values.detach().cpu().numpy()
    return pd.DataFrame(values)


# Drop rows whose feature vectors are exact duplicates, keeping labels aligned.
x = _as_frame(x)
y = _as_frame(y)
by = _as_frame(by)
n = pd.concat([x,y,by],axis=1)
non_duplicate_indices = n.iloc[:, :N_FEATURES].drop_duplicates().index

n = n.loc[non_duplicate_indices]
x = torch.Tensor(n.iloc[:,:N_FEATURES].values)
y = torch.Tensor(n.iloc[:,N_FEATURES:N_FEATURES+N_SMALL_CLASSES].values)
by = torch.Tensor(n.iloc[:,N_FEATURES+N_SMALL_CLASSES:].values)


# Similarity function
cosine_fn = util.cos_sim

# Small-class name -> embedding of "<large class><small class>".
# (The stray `s_class_count += len(v)` that used to sit in this loop was
# removed: it inflated the counter computed earlier and nothing here uses it.)
s2e = dict()
for k,v in cls.items():
    for s in v:
        s2e[s] = m_embedding(k+s)

# One row per small class, in s2e order.
label_emb = torch.stack([
    torch.as_tensor(np.asarray(e), dtype=torch.float32).reshape(-1)
    for e in s2e.values()
])

# cos_sim(a, b)[i, j] is the cosine similarity of a[i] and b[j], so two matrix
# calls replace the per-cell double loop over a hard-coded 15347 rows. The
# slices start after the 6 statistic columns; slicing at :512 / 512: mixed
# statistics into the first embedding and made the second one 518 wide.
name_sim = cosine_fn(x[:, N_STAT_COLS:N_STAT_COLS + EMB_DIM], label_emb)
desc_sim = cosine_fn(x[:, N_STAT_COLS + EMB_DIM:], label_emb)
# Columns: similarities of the first embedding, then of the second.
# NOTE(review): the width is 2 * len(s2e); this equals 2 * 46 only while all
# small-class names are unique.
df = pd.DataFrame(torch.cat([name_sim, desc_sim], dim=1).numpy())

# Final features = 6 statistic columns + similarity columns.
# (Previously `.shape` was appended here, so `df` became a tuple and the shape
# was saved instead of the data.)
x = pd.DataFrame(x.numpy())
df = pd.concat([x.iloc[:,:N_STAT_COLS],df],axis=1)

my_array = df.to_numpy()
my_tensor = torch.tensor(my_array)
# NOTE(review): later cells read 'data.data' and unpack it as (x, y), while
# this writes a single tensor to 'data' — confirm which file/format is intended.
torch.save(my_tensor,'data')

In [None]:
# Load the saved (features, labels) pair and preview the labels.
# NOTE(review): no visible cell writes 'data.data' (the cell above saves to
# 'data') — confirm where this file is produced.
x,y = torch.load('data.data')
pd.DataFrame(y)

# 神经网络

数据装载和训练函数

数据装载函数可以返回训练集和测试集

训练函数中添加了保存模型和可视化

In [None]:
# Inspect the large-class labels held in `by`.
by

In [None]:
# Data loading
def load_array(data_array,batch_size,val_ratio=0.3):
    """
    Split tensors into random training / validation subsets and wrap them in loaders.

    :param data_array: tuple of tensors with the same first dimension, passed
        to ``TensorDataset`` (e.g. ``(features, labels)``).
    :param batch_size: batch size for both loaders.
    :param val_ratio: fraction of samples held out for validation; defaults to
        the previously hard-coded 0.3.
    :return: ``(train_loader, val_loader)``; only the training loader shuffles.
    :raises ValueError: if ``val_ratio`` is outside ``[0, 1)``.
    """
    if not 0 <= val_ratio < 1:
        raise ValueError(f'val_ratio must be in [0, 1), got {val_ratio}')
    dataset = data.TensorDataset(*data_array)
    # Validation gets the floor of the requested share, training the rest.
    val_size = int(len(dataset) * val_ratio)
    train_size = len(dataset) - val_size
    # Random, non-overlapping split
    train_dataset, val_dataset = data.random_split(dataset, [train_size, val_size])
    train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    return train_loader,val_loader


def train(data_loader,lr,net,epochs,optimizer=None,scheduler=None,device=device,flag='no_name'):
    """
    Train ``net`` and checkpoint the weights with the best validation accuracy.

    :param data_loader: ``(train_loader, val_loader)`` pair. Labels are
        expected to be one-hot: accuracy compares the argmax of prediction
        and label along dim 1.
    :param lr: learning rate; only used when ``optimizer`` is None.
    :param net: the ``nn.Module`` to train (moved to ``device``).
    :param epochs: maximum number of epochs.
    :param optimizer: optional optimizer. When None, Adam with
        ``weight_decay=1e-4`` and a ``StepLR(step_size=300, gamma=0.9)``
        scheduler are created (replacing any scheduler passed in).
    :param scheduler: optional LR scheduler, stepped once per epoch.
    :param device: device used for the model, the loss and every batch.
    :param flag: prefix of the checkpoint file ``flag + 'best_model.pkl'``.
    :return: the trained module in its final state; the best weights are in
        the checkpoint file. (Previously nothing was returned, although
        callers assign the result.)
    """
    train_loader, val_loader = data_loader
    m_net = net.to(device)
    loss_fn = nn.CrossEntropyLoss().to(device)
    if optimizer is None:
        optimizer = torch.optim.Adam(m_net.parameters(),lr=lr,weight_decay=1e-4)
        scheduler = lr_scheduler.StepLR(optimizer,step_size=300,gamma=0.9)

    checkpoint_path = flag + 'best_model.pkl'
    patience = 1000          # stop after this many epochs without a new best
    best_val_acc = -1
    train_loss,train_acc,valid_acc = [],[],[]
    no_increase = 0

    for epoch in range(epochs):
        # ---- training pass ----
        m_net.train()
        train_epoch_loss,train_epoch_acc,data_count = 0.0,0,0
        for xx,yy in train_loader:
            xx,yy = xx.to(device),yy.to(device)
            y_het = m_net(xx)
            l = loss_fn(y_het,yy)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            data_count += int(xx.shape[0])
            # .item() stores a plain float; appending the loss tensor itself
            # (as before) kept every batch's autograd graph alive.
            train_epoch_loss += l.item()
            train_epoch_acc += int(torch.sum(torch.argmax(y_het, dim=1) == torch.argmax(yy, dim=1)))
        train_loss.append(train_epoch_loss)
        train_acc.append(train_epoch_acc / max(data_count, 1))

        # ---- validation pass (no gradients needed) ----
        m_net.eval()
        valid_epoch_acc,data_count = 0,0
        with torch.no_grad():
            for xx,yy in val_loader:
                xx,yy = xx.to(device),yy.to(device)
                y_het = m_net(xx)
                data_count += int(xx.shape[0])
                valid_epoch_acc += int(torch.sum(torch.argmax(y_het, dim=1) == torch.argmax(yy, dim=1)))
        valid_acc.append(valid_epoch_acc / max(data_count, 1))

        if valid_acc[-1] > best_val_acc:
            # New best: reset the patience counter and checkpoint the weights.
            no_increase = 0
            best_val_acc = valid_acc[-1]
            torch.save(m_net.state_dict(),checkpoint_path)
            print(f'---best_model_release  epoch {epoch + 1},train_loss {train_loss[-1]:.3f},train_acc {train_acc[-1]:.3f},valid_acc {valid_acc[-1]:.3f}')
        elif best_val_acc - valid_acc[-1] > 0.2:
            # Accuracy collapsed: restore the best weights into the model.
            # The checkpoint is a state_dict, so it must be loaded INTO the
            # module; rebinding m_net to it made the next forward pass fail.
            m_net.load_state_dict(torch.load(checkpoint_path,map_location=device))
            no_increase += 1
            print('!!!train_fail,reload_model,T_T')
        else:
            no_increase += 1

        # Learning-rate schedule advances once per epoch (it used to be called
        # a single time after the loop, so the rate never decayed in training).
        if scheduler is not None:
            scheduler.step()

        if no_increase == patience:
            # No improvement for a long time: stop early.
            return m_net
        if epoch % 100 == 0:
            print(f'  epoch {epoch + 1},train_loss {train_loss[-1]:.3f},train_acc {train_acc[-1]:.3f},valid_acc {valid_acc[-1]:.3f}')

    return m_net


## 构建网络

In [None]:
# 组件1

def sblock(input_size,ouput_size):
    """Build one dense block: BatchNorm1d -> Linear -> ReLU.

    :param input_size: number of input features.
    :param ouput_size: number of output features of the linear layer.
    :return: an ``nn.Sequential`` holding the three layers in that order.
    """
    layers = [
        nn.BatchNorm1d(input_size),
        nn.Linear(input_size,ouput_size),
        nn.ReLU(),
    ]
    return nn.Sequential(*layers)

# 组件2
class inception(nn.Module):
    """
    Four parallel dense blocks fed with the same input; their outputs are
    concatenated along the feature dimension.
    """
    def __init__(self,in_size,c1_size,c2_size,c3_size,c4_size,**kwargs):
        """
        :param in_size: input feature count shared by all branches.
        :param c1_size: output width of branch 1 (likewise c2..c4).
        """
        super().__init__(**kwargs)
        self.l1 = sblock(in_size,c1_size)
        self.l2 = sblock(in_size,c2_size)
        self.l3 = sblock(in_size,c3_size)
        self.l4 = sblock(in_size,c4_size)

    def forward(self,x):
        """Run every branch on ``x`` and join the results on dim 1."""
        branch_outputs = [branch(x) for branch in (self.l1, self.l2, self.l3, self.l4)]
        return torch.cat(branch_outputs, dim=1)

# 组件3
class inception_re(nn.Module):
    """
    Grouped projection: the input columns are split into three consecutive
    groups, each group goes through its own dense block, and the three
    outputs are concatenated along the feature dimension.
    """
    def __init__(self,c1_size,c2_size,c3_size,out1_size,out2_size,out3_size,**kwargs):
        """
        :param c1_size: width of the first column group (likewise c2, c3).
        :param out1_size: output width of the first group's block (likewise out2, out3).
        """
        super(inception_re,self).__init__(**kwargs)
        # Cumulative end offsets of the three groups in the input.
        self.c1 = c1_size
        self.c2 = self.c1 + c2_size
        # Was `c3_size + c2_size`, which is not a cumulative offset like c1/c2.
        # The attribute is not read by forward, so outputs are unaffected.
        self.c3 = self.c2 + c3_size

        self.l1 = sblock(c1_size,out1_size)
        self.l2 = sblock(c2_size,out2_size)
        self.l3 = sblock(c3_size,out3_size)

    def forward(self,x):
        """Split ``x`` by column group, transform each group, concatenate on dim 1."""
        return torch.cat(
            (
                self.l1(x[:,:self.c1]),
                self.l2(x[:,self.c1:self.c2]),
                # Open-ended on purpose: an input wider than c1+c2+c3 still
                # fails inside l3 instead of being silently truncated.
                self.l3(x[:,self.c2:])
            ),dim=1)

    
# 初始化参数
def init_xavier(m):
    """
    Apply Xavier-uniform initialisation to the weight of ``nn.Linear`` modules.

    Intended for ``module.apply(init_xavier)``; every other module type is
    left untouched.

    :param m: a module visited by ``apply``.
    :return: None
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.xavier_uniform_(m.weight)


In [None]:
# 瞎写的
class Go_net(nn.Module):
    """
    Small-class network: grouped projection -> four parallel dense branches
    -> two dense blocks, ending in 46 outputs.

    Each stage is stored both as a named attribute and inside
    ``self.sequential``, so the state_dict exposes both sets of keys.
    """
    def __init__(self,input_size=98):
        """
        :param input_size: accepted for interface compatibility; this variant
            does not read it (the first stage is fixed to 6 + 46 + 46 inputs).
        """
        super().__init__()
        # 6 + 46 + 46 input columns -> 8 + 124 + 124 = 256 features
        self.inception_re1 = inception_re(6,46,46,8,124,124)
        # 256 -> 4 branches x 256 = 1024 features
        self.inception1 = inception(256,256,256,256,256)
        self.sblock5 = sblock(1024,128)
        self.sblock7 = sblock(128,46)

        stages = (self.inception_re1, self.inception1, self.sblock5, self.sblock7)
        self.sequential = nn.Sequential(*stages)

    def forward(self,x):
        """
        Forward pass.

        :param x: batch of feature rows.
        :return: output of the last dense block.
        """
        out = self.sequential(x)
        return out


In [None]:
# 参数及网络初始化
batch_size = 1024
lr = 0.0001
epoch = 20000
m_Go_net = Go_net().to(torch.float32).to(device)

m_net = m_Go_net
m_net_name = 'm_Go_net'
m_net.apply(init_xavier)

x = x.to(torch.float32).to(device)
y = y.to(torch.float32).to(device)
m_net.to(device)
dataset = load_array((x,y),batch_size)

# 优化器
optimizer = torch.optim.Adam(m_net.parameters(),lr=lr,weight_decay=1e-4)
scheduler = lr_scheduler.StepLR(optimizer,step_size=100,gamma=0.9)

# 训练
ok_model = train(data_loader=dataset,lr=lr,net=m_net,epochs=epoch,optimizer=optimizer,scheduler=scheduler,device=device,flag=m_net_name)

In [None]:
# Class index (argmax of the one-hot vector) -> "<large class> <small class key>".
c2i = dict()
for key, onehot in c2h.items():
    # c2h keys use underscores; cls stores the names with spaces.
    plain_name = key.replace('_',' ')
    for big_name, small_names in cls.items():
        if plain_name in small_names:
            c2i[onehot.argmax()] = big_name + ' ' + key

In [None]:
# Small-class evaluation: load the checkpointed weights and measure accuracy.
m_Go_net_test = Go_net()
m_Go_net_test.load_state_dict(torch.load('go_re/m_Go_netbest_model.pkl',map_location=device))
m_Go_net_test.to(torch.bfloat16).to(device)

# NOTE(review): this loads the complete saved (features, labels) pair, so the
# figure printed below is measured on everything in 'data.data', not on a
# held-out split — confirm that is intended.
val_data,val_label = torch.load('data.data')

dataset = data.TensorDataset(val_data,val_label)
val_loader = data.DataLoader(dataset, batch_size=1, shuffle=True)


def test(data):
    """Run the loaded network on ``data`` in eval mode without gradients and return its output."""
    with torch.no_grad():
        m_Go_net_test.eval()
        # Cast the input to bfloat16 to match the model's dtype
        data = data.to(torch.bfloat16)
        return m_Go_net_test(data)
    

# Count samples whose predicted class name equals the labelled class name.
# The loop rebinds the notebook-level names x and y to the last sample.
t = 0
acc = 0
for x,y in val_loader:
    t+=1
    if c2i[int(test(x).argmax(1))] == c2i[int(y.argmax(1))]:
        acc += 1
print(acc/t)

In [None]:
class Go_net(nn.Module):
    """
    Large-class network: four parallel dense branches -> two dense blocks,
    ending in 7 outputs.

    NOTE(review): this class reuses the name of the Go_net defined in an
    earlier cell and replaces it from this cell onwards.
    Each stage is stored both as a named attribute and inside
    ``self.sequential``, so the state_dict exposes both sets of keys.
    """
    def __init__(self,input_size=98):
        """
        :param input_size: number of input features fed to the parallel branches.
        """
        super().__init__()
        # input_size -> 4 branches x 128 = 512 features
        self.inception1 = inception(input_size,128,128,128,128)
        self.sblock5 = sblock(512,256)
        self.sblock7 = sblock(256,7)

        stages = (self.inception1, self.sblock5, self.sblock7)
        self.sequential = nn.Sequential(*stages)

    def forward(self,x):
        """
        Forward pass.

        :param x: batch of feature rows.
        :return: output of the last dense block.
        """
        out = self.sequential(x)
        return out


def sblock(input_size,ouput_size):
    """Build one dense block: BatchNorm1d -> Dropout(0.4) -> Linear -> ReLU.

    NOTE(review): this reuses the name of the sblock defined in an earlier
    cell (which has no Dropout) and replaces it from this cell onwards.

    :param input_size: number of input features.
    :param ouput_size: number of output features of the linear layer.
    :return: an ``nn.Sequential`` holding the four layers in that order.
    """
    layers = [
        nn.BatchNorm1d(input_size),
        nn.Dropout(0.4),
        nn.Linear(input_size,ouput_size),
        nn.ReLU(),
    ]
    return nn.Sequential(*layers)


# Large-class evaluation: load the checkpointed weights and measure accuracy.
m_Go_net_test = Go_net()
m_Go_net_test.load_state_dict(torch.load('go_re_bc/m_Go_netbest_model.pkl',map_location=device))
m_Go_net_test.to(torch.bfloat16).to(device)
# 'bdatae' holds a 2-item sequence: features first, large-class labels second.
t = torch.load('bdatae')
val_data,val_label = t[0],t[1]

dataset = data.TensorDataset(val_data,val_label)
val_loader = data.DataLoader(dataset, batch_size=1, shuffle=True)


def test(data):
    """Run the loaded network on ``data`` in eval mode without gradients and return its output."""
    with torch.no_grad():
        m_Go_net_test.eval()
        # Cast the input to bfloat16 to match the model's dtype
        data = data.to(torch.bfloat16)
        return m_Go_net_test(data)
    

# Count samples whose predicted class index equals the labelled class index.
# `t` is reused here as the sample counter; the loop rebinds x and y.
t = 0
acc = 0
for x,y in val_loader:
    t+=1
    if int(test(x).argmax(1)) == int(y.argmax(1)):
        acc += 1
print(acc/t)

In [None]:
# Reload the saved (features, labels) pair and wrap both in DataFrames so the
# label columns can be summed by group in a later cell.
x,y = torch.load('data.data')
x = pd.DataFrame(x.data)
y = pd.DataFrame(y.data)

In [None]:
# Display the small-class label table.
y

In [None]:
# Collapse the small-class label columns into 7 large-class columns by
# summing each contiguous column range [start, stop).
group_bounds = [(0, 1), (1, 18), (18, 33), (33, 39), (39, 43), (43, 45), (45, None)]
by = pd.concat(
    [y.iloc[:, start:stop].sum(axis=1) for start, stop in group_bounds],
    axis=1)
by


In [None]:
# Persist the features together with the large-class labels as a 2-item list.
torch.save([x,by],'bdatae')


In [None]:
# numpy_array = by.values
# by = torch.tensor(numpy_array)

In [None]:
# Check the dimensions of the large-class label table.
by.shape

In [None]:
# Load a saved pair and unpack it into x and byy.
# NOTE(review): the visible save cell writes 'bdatae' (no '.data' suffix) and
# the training cell below reads `by`, not `byy` — confirm the intended file
# name and variable.
z = torch.load('bdatae.data')
x,byy = z[0],z[1]
x.shape,byy.shape

In [None]:
# 支持向量机模型
class svm_net(nn.Module):
    """
    Plain feed-forward classifier made of four dense blocks
    (input -> 2048 -> 512 -> 256 -> 7).

    NOTE(review): despite the name, nothing in this class implements a
    support-vector machine; it is a stack of sblock layers.
    """
    def __init__(self,input_size=1030):
        """
        :param input_size: number of input features of the first block.
        """
        super().__init__()
        self.sblock1 = sblock(input_size,2048)
        self.sblock2 = sblock(2048,512)
        self.sblock4 = sblock(512,256)
        self.sblock5 = sblock(256,7)

    def forward(self,x):
        """Pass ``x`` through the four blocks in order and return the result."""
        for block in (self.sblock1, self.sblock2, self.sblock4, self.sblock5):
            x = block(x)
        return x
    
    
    # 参数及网络初始化
# Hyper-parameters and network initialisation for the large-class model
batch_size = 2048
lr = 0.01
epoch = 20000   # maximum number of epochs passed to train()
# NOTE(review): m_Go_net is created here but not used below (m_net is the svm_net).
m_Go_net = Go_net().to(torch.float32).to(device)

# 98 input features instead of the class default of 1030.
m_svm_net = svm_net(98)

m_net = m_svm_net
m_net_name = 'm_svm_net'   # used by train() as the checkpoint file-name prefix
m_net.apply(init_xavier)
# x,by = torch.load('bdata.data')
# NOTE(review): in top-to-bottom order `by` is the DataFrame produced by
# pd.concat and the cell converting it to a tensor is commented out, while
# `.to(torch.float32)` is a tensor method — confirm which object is meant.
x = x.to(torch.float32).to(device)
by = by.to(torch.float32).to(device)
m_net.to(device)
dataset = load_array((x,by),batch_size)

# Optimizer and step-wise learning-rate schedule
optimizer = torch.optim.Adam(m_net.parameters(),lr=lr,weight_decay=1e-4)
scheduler = lr_scheduler.StepLR(optimizer,step_size=100,gamma=0.9)

ok_model = train(data_loader=dataset,lr=lr,net=m_net,epochs=epoch,optimizer=optimizer,scheduler=scheduler,device=device,flag=m_net_name)

In [None]:

# Same call as at the end of the previous cell: trains the same m_net object
# again, with the same optimizer, scheduler and loaders.
ok_model = train(data_loader=dataset,lr=lr,net=m_net,epochs=epoch,optimizer=optimizer,scheduler=scheduler,device=device,flag=m_net_name)