In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


# 训练集格式：source node ID, target node ID, label (1: 边存在，0: 边不存在)
train_df = pd.read_csv("train.txt", sep=" ", header=None, names=["node1", "node2", "label"])
# 测试集格式：source node ID, target node ID
test_df = pd.read_csv("test.txt", sep=" ", header=None, names=["node1", "node2"])







In [2]:
import pandas as pd
import numpy as np
import networkx as nx
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


node_info = pd.read_csv("node_information.csv", header=None)


# First column is ID，other 932 are features
node_info.columns = ["node_id"] + [f"f{i}" for i in range(1, node_info.shape[1])]
node_features = {}
for _, row in node_info.iterrows():
    node_features[row["node_id"]] = row[1:].values.astype(float)

In [3]:
G = nx.Graph()
positive_edges = train_df[train_df["label"] == 1][["node1", "node2"]].values.tolist()
G.add_edges_from(positive_edges)


In [4]:
def edge_features(n1, n2):
    # Get node features (use all-zero vector if node is missing)
    sample = next(iter(node_features.values()))
    f1 = node_features.get(n1, np.zeros(len(sample)))
    f2 = node_features.get(n2, np.zeros(len(sample)))

    # basic features
    diff = np.abs(f1 - f2)
    prod = f1 * f2
    summ = f1 + f2
    euclidean = np.array([np.linalg.norm(f1 - f2)])
    manhattan = np.array([np.sum(np.abs(f1 - f2))])
    norm_f1 = np.linalg.norm(f1)
    norm_f2 = np.linalg.norm(f2)
    cosine = np.array([np.dot(f1, f2) / (norm_f1 * norm_f2 + 1e-8)])
    base_feat = np.concatenate([diff, prod, summ, euclidean, manhattan, cosine])

    # Graph structural features
    degree1 = G.degree(n1) if n1 in G else 0
    degree2 = G.degree(n2) if n2 in G else 0
    # common neighbors
    common_neighbors = len(list(nx.common_neighbors(G, n1, n2))) if (n1 in G and n2 in G) else 0
    # Jaccard coefficient
    jaccard_gen = nx.jaccard_coefficient(G, [(n1, n2)])
    jaccard_score = next(jaccard_gen)[2] if jaccard_gen is not None else 0
    graph_feat = np.array([degree1, degree2, common_neighbors, jaccard_score])

    # Join all features
    return np.concatenate([base_feat, graph_feat])

# Generate edge features for training and test sets
X_all = np.array([edge_features(row.node1, row.node2) for _, row in train_df.iterrows()])
y_all = train_df["label"].values
X_test_all = np.array([edge_features(row.node1, row.node2) for _, row in test_df.iterrows()])


In [5]:
print(X_all.shape)

(10496, 2803)


In [8]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

print("X_all shape:", X_all.shape)  # (10496, 2803)
print("y_all shape:", y_all.shape)  # (10496,)

print("X_test_all shape:", X_test_all.shape)

X_train, X_val, y_train, y_val = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

print("X_val shape:", X_val.shape)      # 20% val set

# Build XGBoost DMatrix
dtrain = xgb.DMatrix(X_all, label=y_all) # use all train data to train our model
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test_all)  

print("dval 创建成功")

# XGBoost model
params = {
    "objective": "binary:logistic",  
    "eval_metric": "auc",            
    "eta": 0.01,                       
    "max_depth": 6,                   
    "subsample": 0.8,                  
    "colsample_bytree": 0.8,           
    "seed": 42
}

# Train XGBoost model
watchlist = [(dtrain, "train"), (dval, "eval")]
bst = xgb.train(params, dtrain, num_boost_round=200, evals=watchlist, early_stopping_rounds=10)

# Predict
y_pred_val = bst.predict(dval)
y_pred_val_labels = (y_pred_val > 0.5).astype(int)

accuracy = accuracy_score(y_val, y_pred_val_labels)
auc_score = roc_auc_score(y_val, y_pred_val)
print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation AUC: {auc_score:.4f}")

# Predict
y_test_pred = bst.predict(dtest)

submission = pd.DataFrame({
    "ID": range(len(y_test_pred)),
    "Predicted": y_test_pred
})

# save the results
submission.to_csv("submission1.csv", index=False)
print("Test predictions saved to submission1.csv!")


X_all shape: (10496, 2803)
y_all shape: (10496,)
X_test_all shape: (3498, 2803)
X_train shape: (8396, 2803)
X_val shape: (2100, 2803)
dval 创建成功
[0]	train-auc:0.80631	eval-auc:0.82084
[1]	train-auc:0.84534	eval-auc:0.86039
[2]	train-auc:0.84312	eval-auc:0.85814
[3]	train-auc:0.84853	eval-auc:0.86451
[4]	train-auc:0.85114	eval-auc:0.86686
[5]	train-auc:0.85174	eval-auc:0.86751
[6]	train-auc:0.85241	eval-auc:0.86820
[7]	train-auc:0.85167	eval-auc:0.86760
[8]	train-auc:0.85267	eval-auc:0.86888
[9]	train-auc:0.85299	eval-auc:0.86900
[10]	train-auc:0.85346	eval-auc:0.86931
[11]	train-auc:0.85392	eval-auc:0.86992
[12]	train-auc:0.85413	eval-auc:0.87033
[13]	train-auc:0.85422	eval-auc:0.87061
[14]	train-auc:0.85395	eval-auc:0.87031
[15]	train-auc:0.85446	eval-auc:0.87079
[16]	train-auc:0.85470	eval-auc:0.87100
[17]	train-auc:0.85456	eval-auc:0.87069
[18]	train-auc:0.85460	eval-auc:0.87062
[19]	train-auc:0.85431	eval-auc:0.87044
[20]	train-auc:0.85430	eval-auc:0.87049
[21]	train-auc:0.85450	eva