In [1]:
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import random
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# --- 1. Load Data ---
# train.csv supplies the edge list (columns node1, node2) the graph is built
# from; test.csv supplies the node pairs that are scored at the end.
# NOTE(review): the paths are absolute and machine-specific, so on any other
# machine the fallback branch below is what actually runs.
try:
    train_df = pd.read_csv(r'C:\Users\Dspike\Documents\NTUST\2ndsemester\Social Media Analytics\train.csv')
    test_df = pd.read_csv(r'C:\Users\Dspike\Documents\NTUST\2ndsemester\Social Media Analytics\test.csv')
    print("Files loaded successfully.")
except FileNotFoundError:
    # Fall back to a tiny hand-written graph so the remaining steps can still
    # be exercised end to end. The message names the files actually requested
    # above (train.csv, not training.csv).
    print("Error: train.csv or test.csv not found. Creating dummy data for demonstration...")
    train_df = pd.DataFrame({
        'node1': [0, 0, 1, 1, 2, 3, 4, 6],
        'node2': [1, 2, 3, 4, 5, 4, 5, 7]
    })
    test_df = pd.DataFrame({
        'node1': [0, 1, 2, 5, 6, 7, 0, 3],
        'node2': [3, 5, 4, 7, 0, 1, 6, 6]
    })
    # The dummy test set gets a sequential id column for the submission file.
    test_df['predict_nodepair_id'] = range(len(test_df))

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

# --- 2. Build Graph ---
# Every training row becomes one undirected edge; nodes are created implicitly.
edge_array = train_df[['node1', 'node2']].values
G = nx.Graph()
G.add_edges_from(edge_array)
print(f"Graph built with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

# Summary statistics of the training graph
node_degrees = [degree for _node, degree in G.degree()]
print("Average degree:", np.mean(node_degrees))
print("Clustering coefficient:", nx.average_clustering(G))

# Node inventory reused later when sampling random node pairs
nodes_list = list(G)
num_nodes = len(nodes_list)

# --- 3. Generate Negative Training Samples ---
# Negatives are node pairs drawn uniformly at random that are not edges of G.
num_positive_samples = len(train_df)
num_negative_samples = 2 * num_positive_samples  # 2:1 negative:positive ratio

# Dedicated, seeded generator so the sampled negatives (and every metric that
# depends on them) are reproducible, in line with the random_state=42 used
# for the split and the models in this script.
rng = random.Random(42)

negative_samples = []
sampled_pairs = set()  # both orientations of every pair already accepted
existing_edges = set(G.edges()) | {(v, u) for u, v in G.edges()}  # Include both directions

# An undirected graph on n nodes has n*(n-1)/2 distinct pairs; the ones that
# are not edges are the only possible negatives. Capping the target here
# guarantees the rejection loop below terminates even though each pair is
# accepted at most once.
max_non_edges = max(num_nodes * (num_nodes - 1) // 2 - G.number_of_edges(), 0)
target_negatives = min(num_negative_samples, max_non_edges)

while len(negative_samples) < target_negatives:
    u, v = rng.sample(nodes_list, 2)
    # existing_edges and sampled_pairs hold both orientations, so a single
    # lookup per set covers (u, v) and (v, u).
    if (u, v) in existing_edges or (u, v) in sampled_pairs:
        continue
    negative_samples.append((u, v))
    sampled_pairs.update(((u, v), (v, u)))
    # Report progress only when a sample was actually added; otherwise the
    # same count would be printed again on every rejected draw.
    if len(negative_samples) % 1000 == 0:
        print(f"Generated {len(negative_samples)} negative samples...")

if len(negative_samples) < num_negative_samples:
    print("Warning: Approaching max possible non-edges. Stopping negative sampling.")

print(f"Generated {len(negative_samples)} negative samples.")

# --- 4. Feature Engineering Function ---
def calculate_features(u, v, graph):
    """Compute neighbourhood-based link-prediction features for the pair (u, v).

    Args:
        u: First node of the candidate pair.
        v: Second node of the candidate pair.
        graph: Graph the pair is measured against (a networkx graph in this
            script).

    Returns:
        A dict with the keys 'jaccard', 'adamic_adar', 'common_neighbors',
        'degree_u' and 'degree_v'. Every value is 0 when either node is not
        in ``graph`` or when networkx raises ``NetworkXError``.
    """
    zero_features = {
        'jaccard': 0,
        'adamic_adar': 0,
        'common_neighbors': 0,
        'degree_u': 0,
        'degree_v': 0,
    }

    # Check membership explicitly instead of relying on the except clause
    # below: an unknown node is not guaranteed to surface as NetworkXError.
    # NOTE(review): depending on the installed networkx version it may be
    # NodeNotFound or KeyError instead -- confirm if the guard is ever removed.
    if u not in graph or v not in graph:
        return zero_features

    # Other candidate features that were tried and are currently disabled:
    # preferential attachment, resource allocation index, shortest path
    # length, and absolute degree difference.
    features = {}
    try:
        # Jaccard Coefficient (networkx yields (u, v, score) triples)
        jaccard_triple = next(iter(nx.jaccard_coefficient(graph, [(u, v)])), None)
        features['jaccard'] = jaccard_triple[2] if jaccard_triple is not None else 0

        # Adamic-Adar Index
        adamic_adar_triple = next(iter(nx.adamic_adar_index(graph, [(u, v)])), None)
        features['adamic_adar'] = adamic_adar_triple[2] if adamic_adar_triple is not None else 0

        # Common Neighbors
        features['common_neighbors'] = len(list(nx.common_neighbors(graph, u, v)))

        # Node Degrees
        features['degree_u'] = graph.degree(u)
        features['degree_v'] = graph.degree(v)
    except nx.NetworkXError:
        # Best-effort fallback kept from the original behaviour: discard any
        # partially filled values and report an all-zero feature vector.
        return zero_features

    return features

# --- 5. Prepare Training Data ---
# Build one feature row per labelled node pair: label 1 for observed edges,
# label 0 for the sampled non-edges.
X_train_list = []
y_train = []

# Positive examples
print("Calculating features for positive training examples...")
for _, row in train_df.iterrows():
    u, v = row['node1'], row['node2']
    if G.has_node(u) and G.has_node(v):
        # Hide the edge being scored while its features are computed.
        # Otherwise each positive pair is measured with its own edge present
        # (both degrees one higher, larger neighbour union), which leaks the
        # label into the features and does not match how unseen pairs are
        # scored, where the candidate edge is absent from G.
        edge_present = G.has_edge(u, v)
        if edge_present:
            G.remove_edge(u, v)
        try:
            X_train_list.append(calculate_features(u, v, G))
        finally:
            # Always restore the edge so G is unchanged for later steps.
            if edge_present:
                G.add_edge(u, v)
        y_train.append(1)

# Negative examples
print("Calculating features for negative training examples...")
for u, v in negative_samples:
    if G.has_node(u) and G.has_node(v):
        X_train_list.append(calculate_features(u, v, G))
        y_train.append(0)

X_train_df = pd.DataFrame(X_train_list)
y_train = np.array(y_train)
print("Training features prepared.")
print(X_train_df.head())
print(f"Training features shape: {X_train_df.shape}")
print(f"Training labels length: {len(y_train)}")

# --- 6. Split Data into Training and Validation Sets ---
# Stratified hold-out: 30% of the labelled pairs are kept aside for validation,
# with the class ratio preserved in both parts.
print("Splitting data into 70% training and 30% validation...")
split_parts = train_test_split(
    X_train_df,
    y_train,
    test_size=0.3,
    random_state=42,
    stratify=y_train,
)
X_train_split, X_val_split, y_train_split, y_val_split = split_parts
print(f"Training split shape: {X_train_split.shape}")
print(f"Validation split shape: {X_val_split.shape}")

# --- 7. Train Classifier on Split Data ---
# Fit XGBoost on the 70% split, then report accuracy, F1 and ROC AUC on both
# the training split and the held-out validation split.
if len(X_train_split) != len(y_train_split):
    print(f"Error: Mismatch between training features ({len(X_train_split)}) and labels ({len(y_train_split)}).")
else:
    print("Training XGBClassifier on 70% training split...")
    classifier = XGBClassifier(
        n_estimators=400,
        objective='binary:logistic',
        learning_rate=0.15,
        max_depth=5,  # Reduced to prevent overfitting
        min_child_weight=7,  # Increased regularization
        random_state=42,
        n_jobs=-1
    )
    classifier.fit(X_train_split, y_train_split)

    # Calculate training metrics
    print("Calculating training metrics...")
    train_predictions = classifier.predict(X_train_split)
    train_accuracy = accuracy_score(y_train_split, train_predictions)
    train_f1 = f1_score(y_train_split, train_predictions)
    # AUC is computed from the positive-class probability, not the hard label.
    train_auc = roc_auc_score(y_train_split, classifier.predict_proba(X_train_split)[:, 1])
    print(f"Training accuracy: {train_accuracy:.4f}")
    print(f"Training F1-score: {train_f1:.4f}")
    print(f"Training AUC: {train_auc:.4f}")

    # Calculate validation metrics
    print("Calculating validation metrics...")
    val_predictions = classifier.predict(X_val_split)
    val_accuracy = accuracy_score(y_val_split, val_predictions)
    val_f1 = f1_score(y_val_split, val_predictions)
    val_auc = roc_auc_score(y_val_split, classifier.predict_proba(X_val_split)[:, 1])
    print(f"Validation accuracy: {val_accuracy:.4f}")
    print(f"Validation F1-score: {val_f1:.4f}")
    # Report the AUC value itself (this line previously printed val_f1).
    print(f"Validation AUC: {val_auc:.4f}")

    # Print feature importances
    print("Feature importances:", dict(zip(X_train_df.columns, classifier.feature_importances_)))

# --- 8. Cross-Validation on Full Training Data ---
# Imported here so this section carries the one extra name it needs.
from sklearn.model_selection import cross_validate

print("Performing 5-fold cross-validation on full training data...")
classifier_cv = XGBClassifier(
    n_estimators=400,
    objective='binary:logistic',
    learning_rate=0.15,
    max_depth=5,
    min_child_weight=7,
    random_state=42,
    n_jobs=-1
)
# Score both metrics in a single cross-validation pass so each fold is fitted
# once, instead of refitting all five folds separately for every metric.
cv_results = cross_validate(
    classifier_cv, X_train_df, y_train, cv=5, scoring=['accuracy', 'f1']
)
cv_scores = cv_results['test_accuracy']
cv_f1_scores = cv_results['test_f1']
print(f"Cross-validation accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
print(f"Cross-validation F1-score: {np.mean(cv_f1_scores):.4f} ± {np.std(cv_f1_scores):.4f}")

# --- 9. Train Ensemble for Kaggle Submission ---
# Two independent models are fitted on all labelled pairs (no hold-out); their
# probabilities are combined later at prediction time.
print("Training ensemble (XGBoost + RandomForest) on full training data...")

# Gradient-boosted trees
xgb_classifier = XGBClassifier(
    objective='binary:logistic',
    n_estimators=400,
    learning_rate=0.15,
    max_depth=5,
    min_child_weight=7,
    random_state=42,
    n_jobs=-1,
)

# Bagged trees
rf_classifier = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    min_samples_leaf=7,
    random_state=42,
    n_jobs=-1,
)

# Fit in the same order as before: XGBoost first, then the random forest.
for ensemble_member in (xgb_classifier, rf_classifier):
    ensemble_member.fit(X_train_df, y_train)
print("Ensemble classifiers trained.")

# --- 10. Prepare Test Data ---
# One feature row per test pair, in the order the pairs appear in test_df.
print("Calculating features for test examples...")

# Feature row used when at least one endpoint never appears in the graph.
missing_node_features = {
    'jaccard': 0,
    'adamic_adar': 0,
    'common_neighbors': 0,
    'degree_u': 0,
    'degree_v': 0
}

X_test_list = []
for u, v in zip(test_df['node1'], test_df['node2']):
    both_known = G.has_node(u) and G.has_node(v)
    if not both_known:
        # Append a fresh copy so rows never share one dict object.
        X_test_list.append(dict(missing_node_features))
        continue
    X_test_list.append(calculate_features(u, v, G))

X_test_df = pd.DataFrame(X_test_list)
print("Test features prepared.")
print(f"Test features shape: {X_test_df.shape}")

# --- 11. Predict on Test Data (Ensemble) ---
# Average the positive-class probabilities of both models with equal weight,
# then label a pair as an edge when the average exceeds 0.5.
print("Predicting on test data with ensemble...")
positive_class = 1
xgb_probs = xgb_classifier.predict_proba(X_test_df)[:, positive_class]
rf_probs = rf_classifier.predict_proba(X_test_df)[:, positive_class]
ensemble_probs = (xgb_probs + rf_probs) * 0.5
predictions = np.greater(ensemble_probs, 0.5).astype(int)

# --- 12. Format Output ---
# Write one row per test pair: its id and the predicted 0/1 label.
print("Formatting submission file...")
id_column = 'predict_nodepair_id'
if id_column not in test_df.columns:
    # No ids supplied with the test set: number the pairs by position.
    test_df[id_column] = range(len(test_df))

submission_df = test_df[[id_column]].assign(ans=predictions)

output_filename = 'submission.csv'
submission_df.to_csv(output_filename, index=False)
print(f"Submission file '{output_filename}' created successfully.")
print(submission_df.head())

Files loaded successfully.
Training data shape: (21090, 2)
Test data shape: (8000, 3)
Graph built with 10980 nodes and 21090 edges.
Average degree: 3.841530054644809
Clustering coefficient: 0.13466795034502793
Generated 1000 negative samples...
Generated 2000 negative samples...
Generated 3000 negative samples...
Generated 4000 negative samples...
Generated 5000 negative samples...
Generated 6000 negative samples...
Generated 7000 negative samples...
Generated 8000 negative samples...
Generated 9000 negative samples...
Generated 10000 negative samples...
Generated 11000 negative samples...
Generated 12000 negative samples...
Generated 13000 negative samples...
Generated 14000 negative samples...
Generated 15000 negative samples...
Generated 16000 negative samples...
Generated 17000 negative samples...
Generated 18000 negative samples...
Generated 19000 negative samples...
Generated 20000 negative samples...
Generated 21000 negative samples...
Generated 22000 negative samples...
Generat