In [5]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# 准备数据
texts = ["I love this movie", "This movie is great", "I dislike this movie", "This movie is terrible"]
labels = [1, 1, 0, 0]

# 文本预处理
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
vocab_size = len(tokenizer.word_index) + 1
max_seq_length = max([len(seq) for seq in sequences])

# 序列填充
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length)

# 划分训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2)

# 构建Bi-LSTM模型
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_seq_length))
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
model.add(Bidirectional(LSTM(units=32)))
model.add(Dense(1, activation='sigmoid'))

# 编译模型
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 训练模型
model.fit(np.array(x_train), np.array(y_train), batch_size=16, epochs=10, validation_data=(np.array(x_test), np.array(y_test)))

# 使用模型进行预测
test_sequences = tokenizer.texts_to_sequences(["This movie is amazing"])
test_data = pad_sequences(test_sequences, maxlen=max_seq_length)
prediction = model.predict(np.array(test_data))
print(prediction)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[[0.55445606]]


In [15]:
import pandas as pd
import networkx as nx
import numpy as np
from gensim.models import Word2Vec

# 读取train_90.csv文件
df = pd.read_csv('train_90.csv')

# 读取边文件
edges = pd.read_csv('edge_90.csv')

# 创建一个空的有向图
G = nx.DiGraph()

# 遍历每一行数据
for i in range(len(df)):
    geohash_id = df.loc[i, 'geohash_id']
    date_id = df.loc[i, 'date_id']

    # 添加节点
    G.add_node(geohash_id)

    # 添加节点属性
    node_attrs = {f'F_{j+1}': df.loc[i, f'F_{j+1}'] for j in range(35)}
    G.nodes[geohash_id].update(node_attrs)

# 添加边到图中
for index, row in edges.iterrows():
    G.add_edge(row['geohash6_point1'], row['geohash6_point2'], weight_f1=row['F_1'], weight_f2=row['F_2'])

# 添加节点属性
active_index = {row.geohash_id: row.active_index for row in df.itertuples(index=False)}
consume_index = {row.geohash_id: row.consume_index for row in df.itertuples(index=False)}
nx.set_node_attributes(G, active_index, 'active_index')
nx.set_node_attributes(G, consume_index, 'consume_index')

# 打印图的节点和边数量
print("图中的节点数量：", G.number_of_nodes())
print("图中的边数量：", G.number_of_edges())

图中的节点数量： 1155
图中的边数量： 458013


In [51]:
# 增加获取邻居节点的函数
def get_neighbors(G, node):
    return list(G.neighbors(node))

# 创建空的特征对应值
features_values = {}
for node in G.nodes():
    # 确认'F_1'到'F_35'在df中都存在对应的列，如无需要进行删除修改
    # 这里我们只获取存在的值
    features_values[node] = [G.nodes[node][f'F_{j+1}'] for j in range(35) if f'F_{j+1}' in G.nodes[node]]

# 初始化空的序列列表
sequences = []
for node in G.nodes():
    # 对于每个节点，得到它的邻居节点并将其与自身一同作为序列输入
    sequences.append([node] + get_neighbors(G, node))

# 使用序列训练word2vec模型
model = Word2Vec(sequences, vector_size=35, window=5, min_count=1, workers=4)

# 获取每个节点的向量表示
node_vectors = {}
for node in G:
    node_vectors[node] = model.wv[node]

#用node_vectors作为预测模型的输入
print("转换向量成功")

转换向量成功


In [50]:
import torch
import torch.nn as nn
import random
import torch.optim as optim
import torch.nn.functional as F

class BiLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # 2 for bidirection

    def forward(self, x):
        x = x.unsqueeze(dim=0)  # 增加批处理维度
        h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_dim).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])    # 取最后一个时间步
        return out.squeeze(dim=0)   # 移除批处理维度



# 转化栅格数据为tensor
# 这里是一个假设的例子，根据你实际的node_vectors来生成输入数据
# 获得向量和标签列表，确保顺序一致
nodes = list(node_vectors.keys())

labels = {node: random.randint(0, 1) for node in nodes}

X_list = [node_vectors[node] for node in nodes]
y_list = [labels[node] for node in nodes]  # 假设你有一个和nodes对应的标签字典labels

# 转化数据为tensor
X = torch.tensor(X_list, dtype=torch.float32)
y = torch.tensor(y_list, dtype=torch.float32)

# 初始化模型
input_dim = len(node_vectors[nodes[0]])  #输入维度取决于序列的特征数目
hidden_dim = 128
num_layers = 3
output_dim = 1    # 输出维度取决于你的任务，例如二分类任务这里为1

model = BiLSTM(input_dim, hidden_dim, num_layers, output_dim)

# 设置损失函数和优化器
criterion = torch.nn.MSELoss()    # 使用均方误差作为损失函数
optimizer = torch.optim.Adam(model.parameters(), lr=0.001) ## 优化方式为Adam梯度下降方法，学习率为0.001

# 训练模型
for epoch in range(1000):    # 你设定的训练轮数
    model.train()
    outputs = model(X)

    optimizer.zero_grad()
    loss = criterion(outputs, y)   # 计算损失
    loss.backward()                # 反向传播
    optimizer.step()               # 更新权重
    if (epoch+1) % 100 == 0:
        print ('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, 50, loss.item()))

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [100/50], Loss: 0.2497
Epoch [200/50], Loss: 0.2497
Epoch [300/50], Loss: 0.2497
Epoch [400/50], Loss: 0.2497
Epoch [500/50], Loss: 0.2497


KeyboardInterrupt: 