In [1]:
import pandas as pd
import sklearn as sk
from sklearn.cluster import KMeans
import numpy as np
from random import randint
from sklearn.metrics.pairwise import cosine_similarity
import time
import gensim
import os
import collections
import smart_open
import random

import networkx as nx 
import turicreate as tc
import tqdm
import operator



### 1.Load Original network ...+ Relation Network

In [2]:
QuestionList = []
EmbeddingList = []

# Read node2vec representation
with open("../data/vec_all_node2vec.txt") as f:
    for i, line in enumerate(f):
        if i == 0:
            continue
        if i % 10000 == 0:
            print(i)
        QuestionId = int(line.split()[0])
        QuestionList.append(QuestionId)
        #EmbeddingList.append(line.split()[1:])
        EmbeddingList.append([float(x) for x in line.split()[1:]])
        #print(EmbeddingList[0])
        #Embedding = line.split()[1:]
        #print(QuestionId, Embedding)
embedding = np.asarray(EmbeddingList)

# Read-in tri_dnr representation
tri_dnr = []
with open("../data/triDNR_rep.txt","r") as f:
    for i, line in enumerate(f):
        if i % 10000 == 0:
            print(i)
        QuestionId = int(line.split()[0])
        #val_list.append(QuestionId)
        tri_dnr.append([float(x) for x in line.split()[1:]])
        #Embedding = line.split()[1:]
        #print(QuestionId, Embedding)
tri_dnr = np.asarray(tri_dnr)

# Read-in text data
def read_in_text_data(para, QuestionList):
    data_list = [""] * len(QuestionList)
    if para == "body":
        with open("../src/Body_data.txt","r") as fp:
            lines = fp.read()
    if para == "title":
        with open("../src/Title_data.txt","r") as fp:
            lines = fp.read()
    if para == "tag":
        with open("../src/Tags_data.txt","r") as fp:
            lines = fp.read()
    line_list = lines.split("!!!---!!!-------------------------------------------------------------\n")
    data_list = [""] * len(QuestionList)
    line_list.pop(0)
    for data in line_list:
        questionId = int(data.split("\n")[0])
        question_index = QuestionList.index(questionId)
        data_list[question_index] = data.split("\n")[1][1:-1]
        
    return data_list

titleList = read_in_text_data("title", QuestionList)
tagList = read_in_text_data("tag", QuestionList)


# Read-in duplication pairs
rows = []
with open("../data/duplicat_pairs.txt", "r") as fp:
    for i, line in enumerate(fp):
        Q_1 = int(line.split()[0])
        Q_2 = int(line.split()[1])
        rows.append((Q_1, Q_2))
postId_list = [x[0] for x in rows]
matched_list = [x[1] for x in rows]
coverd = set(QuestionList) & set(postId_list)
coverd_match = []
for i in coverd:
    index = postId_list.index(i)
    match = matched_list[index]
    if match in QuestionList:
        coverd_match.append((i,match))
print(len(coverd_match))
print(len(QuestionList))

10000
20000
30000
40000
0
10000
20000
30000
40000
2222
44412


In [3]:
import pprint, pickle

def generating_title_text_features(method="TF-IDF"):
    if method == "d2c":
        with open("../data/title_d2v.pkl",'rb') as f:
            vec = pickle.load(f)
    elif method == "w2c":
        with open("../data/title_w2v.pkl",'rb') as f:
            vec = pickle.load(f)
    elif method == "TF-IDF":
        with open("../data/pca_tfidf.pkl",'rb') as f:
            vec = pickle.load(f)
    else:
        raise ValueError('Not defiend method: {}'.format(method))

    return vec

### 2. Sample True/False pairwise link

In [4]:
def generating_feature(feature1, feature2):
    f1 = np.concatenate((feature1,feature2))
    f2 = feature1 + feature2
    f3 = feature1 - feature2
    f4 = feature1 *feature2
    return np.concatenate([f1,f2,f3,f4], axis=0)

In [5]:
# k : negative sampling ratio, feature_method for selecting node representation
import random
def generate_data(k=5, feature_method="triDnr"):
    if feature_method == "triDnr":
        features = tri_dnr
    elif feature_method == "node2vec":
        features = embedding
    else:
        features = generating_text_features(feature_method)
    
    features = np.asanyarray(features)
    #postive_sample_index = [QuestionList.index(index[0]) for index in coverd_match]
    #postive_sample = list(features[postive_sample_index])
    postive_sample = list()
    negative_sample = list()
    
    for index in coverd_match:
        origin = index[0]
        duplication = index[1]
        
        original_feature = features[QuestionList.index(origin)]
        duplication_feature = features[QuestionList.index(duplication)]
        postive_sample.append(generating_feature(original_feature,duplication_feature))
        
        # select K negative sample (by question Id)
        false_nodes = set()
        
        while len(false_nodes) < k:
            false_node = random.choice(QuestionList)
            if false_node in false_nodes or false_node == duplication:
                continue
            else:
                false_nodes.add(false_node)
                                
        for node in false_nodes:
            node_feature = features[QuestionList.index(node)]
            negative_sample.append(generating_feature(original_feature, node_feature))
    
    #print(len(postive_sample[0]))
    postive_df = pd.DataFrame(data=postive_sample)
    postive_df['label'] = 1
    
    negative_df = pd.DataFrame(data=negative_sample)
    negative_df['label'] = 0
    
    return postive_df.append(negative_df).reset_index(drop=True)


In [6]:
data = generate_data()
print(data.isnull().values.any())
data.tail()

False


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,label
13327,-0.048329,0.204892,-0.398776,-0.345779,-0.246285,-0.92692,0.307822,0.056094,-0.356226,-0.51482,...,0.015278,0.111212,-0.080474,-0.012642,-0.070426,0.017351,0.011926,0.162779,-0.060195,0
13328,-0.048329,0.204892,-0.398776,-0.345779,-0.246285,-0.92692,0.307822,0.056094,-0.356226,-0.51482,...,-0.027648,0.206616,0.220193,-0.081247,0.705522,0.022888,-0.010088,-0.096915,-0.005339,0
13329,-0.048329,0.204892,-0.398776,-0.345779,-0.246285,-0.92692,0.307822,0.056094,-0.356226,-0.51482,...,-0.043687,0.183477,0.089562,-0.046716,-0.008383,0.064136,0.028604,0.106633,0.091731,0
13330,-0.048329,0.204892,-0.398776,-0.345779,-0.246285,-0.92692,0.307822,0.056094,-0.356226,-0.51482,...,-0.025514,0.082782,-0.084981,-0.113158,-0.316173,0.123388,0.005101,0.289243,-0.063169,0
13331,-0.048329,0.204892,-0.398776,-0.345779,-0.246285,-0.92692,0.307822,0.056094,-0.356226,-0.51482,...,-0.051965,-0.038213,-0.162514,0.041413,0.037231,-0.012559,0.018455,0.339,0.01978,0


In [9]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.4)

for train_index, validate_index in split.split(data, data['label']):
    train_data = data.loc[train_index]
    validate_data = data.loc[validate_index]

'''
train_label = train_data['label']
train_data.drop('label', axis=1, inplace = True)
train_label.reset_index(drop=True, inplace=True)
train_data.reset_index(drop=True, inplace=True)

validate_label = validate_data['label']
validate_data.drop('label', axis=1, inplace = True)
validate_label.reset_index(drop=True, inplace=True)
validate_data.reset_index(drop=True, inplace=True)

all_train = np.concatenate([train_data, validate_data])
all_label = np.concatenate([train_label, validate_label])
'''
#train_data.shape, train_label.shape, all_train.shape, all_label.shape

"\ntrain_label = train_data['label']\ntrain_data.drop('label', axis=1, inplace = True)\ntrain_label.reset_index(drop=True, inplace=True)\ntrain_data.reset_index(drop=True, inplace=True)\n\nvalidate_label = validate_data['label']\nvalidate_data.drop('label', axis=1, inplace = True)\nvalidate_label.reset_index(drop=True, inplace=True)\nvalidate_data.reset_index(drop=True, inplace=True)\n\nall_train = np.concatenate([train_data, validate_data])\nall_label = np.concatenate([train_label, validate_label])\n"

### 3. Using MLP classifier on different title features: 
> Node2Vec, Line, Title/Body-Doc2Vec/Word2Vec/TF-IDF, TriDnr(Dim:100)

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torchvision
import torchvision.transforms as transforms
from torchvision.utils import make_grid

In [19]:
class dataframeDataset(Dataset):
    def __init__(self, data,  transform=None):
        self.data = data
        self.transform = transform
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        
        index_data = self.data.iloc[index, :-1]
        label = self.data.iloc[index, -1]
        
        if self.transform is not None:
            index_data = self.transform(index_data)
            label = self.transform(label)
        return  index_data, label

In [20]:
training = dataframeDataset(train_data, transform=torch.tensor)

In [21]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(MLP, self).__init__()
        self.input_size = input_size
        self.hidden_size  = hidden_size
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(self.hidden_size, 2)
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, x):
        hidden = self.fc1(x)
        relu = self.relu(hidden)
        output = self.fc2(relu)
        output = self.sigmoid(output)
        return output

In [26]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
#cudnn.benchmark = True

N, D_in, H, D_out = 64, 500, 64, 2


params = {'batch_size': 64,
          'shuffle': True,
          'num_workers': 6}
max_epochs = 30

mean_train_losses = []
mean_valid_losses = []
valid_acc_list = []

# Load Dataframe by Dataloader
training = dataframeDataset(train_data, transform=torch.tensor)
validation = dataframeDataset(validate_data, transform=torch.tensor)
training_generator = DataLoader(training, **params)
validation_generator = DataLoader(validation, **params)

# Construct our model by instantiating the class defined above
model = MLP(D_in, H).to(device)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

for epoch in range(max_epochs):
    train_losses = []
    valid_losses = []
    # Training
    for local_batch, local_labels in training_generator:
        # Transfer to GPU
        local_batch, local_labels = local_batch.to(device), local_labels.to(device)

        # Model computations
        optimizer.zero_grad()
        # Forward pass
        y_pred = model(local_batch)
        # Compute Loss
        loss = criterion(y_pred.squeeze(), local_labels)
        train_losses.append(loss.item())
        #print('Epoch {}: train loss: {}'.format(epoch, loss.item()))
        # Backward pass
        loss.backward()
        optimizer.step()
    model.eval()
    correct = 0
    total = 0
    # Validation
    with torch.set_grad_enabled(False):
        for local_batch, local_labels in validation_generator:
            # Transfer to GPU
            local_batch, local_labels = local_batch.to(device), local_labels.to(device)
            y_pred = model(local_batch)
            # Compute Loss
            loss = criterion(y_pred.squeeze(), local_labels)
            
            valid_losses.append(loss.item())
            
            _, predicted = torch.max(y_pred.data, 1)
            correct += (predicted == local_labels).sum().item()
            total += local_labels.size(0)
            
    mean_train_losses.append(np.mean(train_losses))
    mean_valid_losses.append(np.mean(valid_losses))
    
    accuracy = 100*correct/total
    valid_acc_list.append(accuracy)
    print('epoch : {}, train loss : {:.4f}, valid loss : {:.4f}, valid acc : {:.2f}%'\
         .format(epoch+1, np.mean(train_losses), np.mean(valid_losses), accuracy))

epoch : 1, train loss : 0.6720, valid loss : 0.6690, valid acc : 83.14%
epoch : 2, train loss : 0.6664, valid loss : 0.6633, valid acc : 83.29%
epoch : 3, train loss : 0.6608, valid loss : 0.6576, valid acc : 83.33%
epoch : 4, train loss : 0.6550, valid loss : 0.6518, valid acc : 83.33%
epoch : 5, train loss : 0.6491, valid loss : 0.6458, valid acc : 83.33%
epoch : 6, train loss : 0.6430, valid loss : 0.6398, valid acc : 83.33%
epoch : 7, train loss : 0.6369, valid loss : 0.6335, valid acc : 83.33%
epoch : 8, train loss : 0.6306, valid loss : 0.6271, valid acc : 83.33%
epoch : 9, train loss : 0.6243, valid loss : 0.6205, valid acc : 83.33%
epoch : 10, train loss : 0.6179, valid loss : 0.6146, valid acc : 83.33%
epoch : 11, train loss : 0.6114, valid loss : 0.6078, valid acc : 83.33%


KeyboardInterrupt: 

### Using SVM classifier

In [43]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.4)
data = data.sample(frac=1).reset_index(drop=True)

for train_index, validate_index in split.split(data, data['label']):
    train_data = data.loc[train_index]
    validate_data = data.loc[validate_index]


train_label = train_data['label']
train_data.drop('label', axis=1, inplace = True)
train_label.reset_index(drop=True, inplace=True)
train_data.reset_index(drop=True, inplace=True)

validate_label = validate_data['label']
validate_data.drop('label', axis=1, inplace = True)
validate_label.reset_index(drop=True, inplace=True)
validate_data.reset_index(drop=True, inplace=True)

all_train = np.concatenate([train_data, validate_data])
all_label = np.concatenate([train_label, validate_label])

In [44]:
train_label[train_label==1]

8       1
10      1
13      1
16      1
17      1
       ..
7945    1
7968    1
7977    1
7980    1
7982    1
Name: label, Length: 1333, dtype: int64

In [28]:
from sklearn.svm import SVC
clf = SVC(C=1.0, kernel='rbf', gamma=0.1)
clf.fit(train_data, train_label)
clf.score(train_data, train_label)
clf.score(validate_data, validate_label)

0.8333020813800862

### Using tensorflow 2-hidden layer mlp

In [31]:
import tensorflow as tf
import keras

Using TensorFlow backend.


In [45]:
def NN_model_on_train_set(D_in=500, n_neurons_1=150, n_neurons_2 = 70, learning_rate = 0.01, n_epochs = 30, batch_size = 50):
    # here we build a two layers NN model and test on validation set, you may improve it to a CV version
    # n_neurons_1 : number of neurons in the first layer
    # n_neurons_2  : number of neurons in the second layer
    # learning_rate : the learning rate of BGD
    # n_epochs : times of training the model
    # batch_size : since we adopted BGD, then we need to define the size of a size
    # initialize variables
    X = tf.placeholder(tf.float32, shape=(None, D_in), name='X')
    y = tf.placeholder(tf.int64, shape=(None), name = 'y')

    # weights
    W1 = tf.Variable(tf.truncated_normal((D_in, n_neurons_1),stddev = 0.01), name = 'layer_1')
    W2 = tf.Variable(tf.truncated_normal((n_neurons_1, n_neurons_2),stddev = 0.01), name = 'layer_2')
    W3 = tf.Variable(tf.truncated_normal((n_neurons_2 , 2),stddev = 0.01), name = 'output_layer')

    # biases
    b1 = tf.Variable(tf.zeros([n_neurons_1]), name='b_1')
    b2 = tf.Variable(tf.zeros([n_neurons_2]), name='b_2')
    b3 = tf.Variable(tf.zeros([2]), name='b_3')

    # the output of each layer
    Z1 = tf.nn.relu(tf.matmul(X,W1) + b1)
    Z2 = tf.nn.relu(tf.matmul(Z1, W2) + b2)
    output = tf.matmul(Z2, W3) + b3

    # define loss function. Cross-entropy was adopted rather than MSE
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = y, logits = output)
    loss = tf.reduce_mean(xentropy, name='loss')

    # optimizer
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

    # define accuracy
    correct = tf.nn.in_top_k(output, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    # run everything
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        init.run()
        for epoch in range(n_epochs):
            for iteration in range(len(train_data) // batch_size):
                # 因为要batchSize=50个50个的取，取到末尾时可能不够，所以用个if判定一下
                if (iteration + 1) * batch_size <= len(train_data):
                    X_batch = np.array(train_data[iteration * batch_size : iteration * batch_size + batch_size])
                    y_batch = np.array(train_label[iteration * batch_size : iteration * batch_size + batch_size])
                else:
                    X_batch = np.array(train_data[iteration * batch_size : ])
                    y_batch = np.array(train_label[iteration * batch_size : ])
                sess.run(training_op, feed_dict={X:X_batch, y:y_batch})
            # train error
            acc_train = accuracy.eval(feed_dict={X:X_batch, y:y_batch})
            # test error
            acc_test = accuracy.eval(feed_dict={X:np.array(validate_data),
                                               y:np.array(validate_label)})
            print(epoch, 'Train accuracy:', acc_train, 'Test accuracy:', acc_test)

In [46]:
NN_model_on_train_set()

0 Train accuracy: 0.84 Test accuracy: 0.8333021
1 Train accuracy: 0.84 Test accuracy: 0.8333021
2 Train accuracy: 0.84 Test accuracy: 0.8333021
3 Train accuracy: 0.84 Test accuracy: 0.8333021
4 Train accuracy: 0.84 Test accuracy: 0.8333021
5 Train accuracy: 0.84 Test accuracy: 0.8333021
6 Train accuracy: 0.84 Test accuracy: 0.8333021
7 Train accuracy: 0.84 Test accuracy: 0.8333021
8 Train accuracy: 0.84 Test accuracy: 0.8333021
9 Train accuracy: 0.84 Test accuracy: 0.8333021
10 Train accuracy: 0.84 Test accuracy: 0.8333021
11 Train accuracy: 0.84 Test accuracy: 0.8333021
12 Train accuracy: 0.84 Test accuracy: 0.8333021
13 Train accuracy: 0.84 Test accuracy: 0.8333021
14 Train accuracy: 0.84 Test accuracy: 0.8333021
15 Train accuracy: 0.84 Test accuracy: 0.8333021


KeyboardInterrupt: 