In [31]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Dropout, concatenate
from tensorflow.keras.models import Model
import tensorflow_hub as hub

class TextCNN:
    """TextCNN sentence encoder on top of BERT token embeddings.

    Token ids are embedded by a BERT encoder loaded from TF Hub, passed
    through one ``Conv1D`` + global max-pool branch per entry of
    ``filter_sizes``, concatenated, regularised with dropout and projected
    by a sigmoid ``Dense`` layer with ``num_classes`` units.

    Args:
        num_classes: Number of units of the final Dense layer.
        vocab_size: Stored for interface compatibility; ``build`` does not
            read it (the vocabulary is fixed by the BERT checkpoint).
        embedding_dim: Expected width of the token embeddings. Must match
            the hidden size of the BERT encoder (768 for the default handle).
        filter_sizes: Iterable of convolution kernel widths; one branch is
            created per entry.
        num_filters: Number of filters in every convolution branch.
        dropout_rate: Dropout rate applied to the concatenated features.
    """

    # TF2 SavedModel BERT encoder. The legacy TF1-format handle
    # ("google/bert_uncased_L-12_H-768_A-12/1") cannot be called with a bare
    # tensor of token ids and cannot be fine-tuned through hub.KerasLayer.
    BERT_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

    def __init__(self, num_classes, vocab_size, embedding_dim, filter_sizes, num_filters, dropout_rate):
        self.num_classes = num_classes
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        # Materialise so that generators / tuples can be iterated repeatedly.
        self.filter_sizes = list(filter_sizes)
        self.num_filters = num_filters
        self.dropout_rate = dropout_rate

    def build(self, sequence_length):
        """Create the Keras encoder model.

        Args:
            sequence_length: Number of token ids per example.

        Returns:
            A ``Model`` mapping an int32 tensor of shape
            ``(batch, sequence_length)`` to ``(batch, num_classes)``.

        Raises:
            ValueError: If ``filter_sizes`` is empty, if ``sequence_length``
                is shorter than the widest kernel, or if ``embedding_dim``
                does not match the encoder's hidden size.
        """
        # Validate before touching TensorFlow so mistakes fail fast and clearly.
        if not self.filter_sizes:
            raise ValueError("filter_sizes must contain at least one kernel size")
        widest = max(self.filter_sizes)
        if sequence_length < widest:
            raise ValueError(
                "sequence_length (%d) must be >= the largest filter size (%d)"
                % (sequence_length, widest)
            )

        inputs = Input(shape=(sequence_length,), dtype='int32')

        # The TF2 BERT encoder expects a dict of three int32 tensors.
        # NOTE(review): assumes token id 0 is padding and that every input is
        # a single segment -- confirm against the tokenizer that produces ids.
        input_mask = tf.keras.layers.Lambda(
            lambda ids: tf.cast(tf.not_equal(ids, 0), tf.int32)
        )(inputs)
        input_type_ids = tf.keras.layers.Lambda(
            lambda ids: tf.zeros_like(ids)
        )(inputs)
        bert = hub.KerasLayer(self.BERT_HANDLE, trainable=True)
        bert_outputs = bert({
            "input_word_ids": inputs,
            "input_mask": input_mask,
            "input_type_ids": input_type_ids,
        })
        # Per-token embeddings: (batch, sequence_length, hidden_size).
        embedding = bert_outputs["sequence_output"]

        hidden_size = embedding.shape[-1]
        if hidden_size is not None and hidden_size != self.embedding_dim:
            raise ValueError(
                "embedding_dim (%s) does not match the encoder hidden size (%s)"
                % (self.embedding_dim, hidden_size)
            )

        # Conv1D consumes the 3-D (batch, steps, channels) tensor directly;
        # no extra channel axis is needed. Max-pooling over the whole time
        # axis yields one (batch, num_filters) vector per branch.
        pooled = []
        for kernel_size in self.filter_sizes:
            conv = Conv1D(self.num_filters, kernel_size, activation='relu')(embedding)
            pooled.append(tf.keras.layers.GlobalMaxPooling1D()(conv))

        # Works for any number of branches, not just three.
        features = pooled[0] if len(pooled) == 1 else concatenate(pooled, axis=-1)
        dropout = Dropout(self.dropout_rate)(features)
        output = tf.keras.layers.Dense(self.num_classes, activation='sigmoid')(dropout)

        return Model(inputs, output)


class SiameseNetwork:
    """Pair classifier that runs both inputs through one shared encoder.

    Args:
        textcnn: Object exposing ``build(sequence_length)`` that returns a
            callable Keras model used to encode each side of the pair.
    """

    def __init__(self, textcnn):
        self.textcnn = textcnn

    def build(self, sequence_length):
        """Create the two-input Keras model.

        Args:
            sequence_length: Number of token ids per input sequence.

        Returns:
            A ``Model`` taking ``[left, right]`` int32 inputs of shape
            ``(batch, sequence_length)`` and producing a ``(batch, 1)``
            sigmoid prediction.
        """
        left_input = Input(shape=(sequence_length,), dtype='int32')
        right_input = Input(shape=(sequence_length,), dtype='int32')

        # Build the encoder exactly once and apply the SAME model to both
        # branches. Calling textcnn.build() per branch would create two
        # independent networks with unshared weights, which is not Siamese.
        encoder = self.textcnn.build(sequence_length)
        encoded_left = encoder(left_input)
        encoded_right = encoder(right_input)

        similarity = tf.keras.layers.Lambda(
            self.manhattan_distance,
            output_shape=self.euclidean_distance_output_shape,
        )([encoded_left, encoded_right])

        prediction = tf.keras.layers.Dense(1, activation='sigmoid')(similarity)
        return Model([left_input, right_input], prediction)

    def manhattan_distance(self, inputs):
        """Return ``exp(-sum(|x - y|))`` along axis 1, keeping that axis.

        Args:
            inputs: Two-element sequence ``(x, y)`` of tensors.

        Returns:
            A tensor whose axis 1 has size 1; 1.0 means the two encodings
            are identical, values approach 0 as the L1 distance grows.
        """
        x, y = inputs
        l1 = tf.reduce_sum(tf.abs(x - y), axis=1, keepdims=True)
        return tf.exp(-l1)

    def euclidean_distance_output_shape(self, shapes):
        """Output shape for the distance Lambda layer: ``(batch, 1)``.

        The name is kept for backward compatibility even though the layer
        it serves computes a Manhattan-based similarity.

        Args:
            shapes: Two-element sequence with the shapes of both inputs.
        """
        left_shape, _ = shapes
        return (left_shape[0], 1)




In [11]:
# Load the pairs table from disk.
# low_memory=False is passed to keep pandas from showing its warning
# (translated from the original inline comment).
csv_file = "/content/pairs.csv"
csv_data = pd.read_csv(filepath_or_buffer=csv_file, low_memory=False)
pairs = pd.DataFrame(data=csv_data)

In [None]:
# 60 / 40 split of the pairs table.
# A fixed random_state makes the split identical on every
# "Restart Kernel -> Run All", so results are reproducible.
part_60 = pairs.sample(frac=0.6, random_state=42)

# Everything that was not sampled above forms the remaining 40 %.
part_40 = pairs.drop(part_60.index)


In [18]:
# Alias the two partitions as train / test.
# NOTE(review): both names are reassigned by the very next cell, so this
# cell currently has no lasting effect.
train, test = part_60, part_40

In [19]:
# Model inputs as two-element lists: [values of 'bug1', values of 'bug2'].
# NOTE(review): each entry is a 1-D array of column values, while the model
# declares int32 inputs of shape (sequence_length,) -- confirm these columns
# are tokenised / padded before they reach model.fit.
train = [part_60[column].values for column in ('bug1', 'bug2')]
test = [part_40[column].values for column in ('bug1', 'bug2')]

In [20]:
# Targets: the 'label' column of each partition as a NumPy array.
train_label, test_label = part_60['label'].values, part_40['label'].values

In [26]:
# Replace every -1 label with 0 and leave all other values untouched.
train_label, test_label = (
    np.where(labels == -1, 0, labels)
    for labels in (train_label, test_label)
)


In [32]:
def main():
    """Build, compile and train the Siamese TextCNN model.

    Reads the module-level ``train``, ``train_label``, ``test`` and
    ``test_label`` variables prepared by the earlier cells; ``test`` /
    ``test_label`` are passed to ``fit`` as validation data.
    """
    # Encoder (TextCNN) hyper-parameters
    encoder_settings = dict(
        num_classes=2,
        vocab_size=10000,
        embedding_dim=768,
        filter_sizes=[3, 4, 5],
        num_filters=100,
        dropout_rate=0.5,
    )

    # Siamese network / training hyper-parameters
    sequence_length = 30
    batch_size = 128
    epochs = 20

    encoder = TextCNN(**encoder_settings)
    model = SiameseNetwork(encoder).build(sequence_length)

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    model.fit(
        train,
        train_label,
        validation_data=(test, test_label),
        epochs=epochs,
        batch_size=batch_size,
    )


# Run training when this cell / script is executed directly.
if __name__ == '__main__':
    main()



ValueError: ignored