In [1]:
import heapq
import random

import networkx as nx
import pandas as pd

In [2]:
# Load the known interacting pairs and give the two columns fixed names.
positive_data = pd.read_csv(filepath_or_buffer='data/sequence_data.csv')
positive_data.columns = ['protein1', 'protein2']
# Last expression of the cell: show the first rows as a sanity check.
positive_data.head(n=5)

Unnamed: 0,protein1,protein2
0,MAGARSRDPWGASGICYLFGSLLVELLFSRAVAFNLDVMGALRKEG...,MEPWPLLLLFSLCSAGLVLGSEHETRLVAKLFKDYSSVVRPVEDHR...
1,MLKTESSGERTTLRSASPHRNAYRTEFQALKSTFDKPKSDGEQKTK...,MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...
2,MMQKLLKCSRLVLALALILVLESSVQGYPTRRARYQWVRCNPDSNS...,MDKFWWHAAWGLCLVPLSLAQIDLNITCRFAGVFHVEKNGRYSISR...
3,MELDLSPPHLSSSPEDLCPAPGTPPGTPRPPDTPLPEEVKRSQPLL...,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...
4,MSQRPRAPRSALWLLAPPLLRWAPPLLTVLHSDLFQALLDILDYYE...,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...


In [3]:
# Every distinct protein that appears on either side of a known interaction.
unique_proteins = set(positive_data['protein1']) | set(positive_data['protein2'])

In [4]:
# Known interactions as a graph: each protein is a node, each positive pair an edge.
positive_graph = nx.from_pandas_edgelist(
    positive_data,
    source='protein1',
    target='protein2',
)

In [5]:
# Candidate negatives: the complement graph joins every pair of proteins that is
# NOT joined in positive_graph.
# NOTE(review): the number of complement edges grows quadratically with the number
# of proteins, so this step can dominate memory and runtime on large inputs.
ng = nx.complement(positive_graph)

In [6]:
# Flatten the complement graph's edges into a two-column table of candidate pairs.
ng_data = pd.DataFrame(
    data=ng.edges(),
    columns=['protein1', 'protein2'],
)

In [7]:
# Random Pair selection (RP)
def random_pair_selection(ng_data, positive_data, random_state=None):
    """Draw a uniform random sample of candidate negative pairs.

    Parameters
    ----------
    ng_data : pandas.DataFrame
        Candidate pairs, one per row.
    positive_data : sized object
        Only its length is used: it sets how many rows are drawn, so the
        negative set has as many rows as ``positive_data``.
    random_state : optional
        Forwarded to ``DataFrame.sample``. Pass an int (or a NumPy random
        generator) to make the draw reproducible; the default ``None`` keeps
        the unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        ``len(positive_data)`` rows of ``ng_data``, sampled without
        replacement and keeping their original index labels.

    Raises
    ------
    ValueError
        Raised by pandas when ``ng_data`` has fewer rows than
        ``positive_data``, because sampling is done without replacement.
    """
    n_negatives = len(positive_data)
    return ng_data.sample(n=n_negatives, random_state=random_state)

# Highest combined-degree pair selection (labelled "Same Degree distribution", SD)
def same_degree_distribution(ng, positive_graph):
    """Pick the candidate negative pairs with the largest combined degree.

    Each edge of ``ng`` is scored by the sum of its two endpoints' degrees in
    ``positive_graph``; the highest-scoring edges are kept.

    NOTE(review): despite the name, nothing here matches the degree
    distribution of the positive set; it simply keeps the top-scoring pairs.
    Confirm this is the intended sampling strategy.

    Parameters
    ----------
    ng : graph
        Graph whose ``edges()`` are the candidate negative pairs.
    positive_graph : graph
        Graph of known interactions; supplies the degrees and, through its
        edge count, the number of pairs to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``protein1`` and ``protein2``, at most
        ``len(positive_graph.edges())`` rows, ordered by decreasing combined
        degree. Ties keep the order in which ``ng.edges()`` yields them.

    Raises
    ------
    KeyError
        If an endpoint of a candidate edge has no degree in ``positive_graph``.
    """
    # Materialise the degrees once so every lookup in the key is a plain dict access.
    degree_of = dict(positive_graph.degree())
    n_negatives = len(positive_graph.edges())

    # nlargest yields the same rows as sorted(..., reverse=True)[:n] without
    # fully sorting the (potentially huge) candidate edge list.
    top_pairs = heapq.nlargest(
        n_negatives,
        ng.edges(),
        key=lambda pair: degree_of[pair[0]] + degree_of[pair[1]],
    )
    return pd.DataFrame(top_pairs, columns=['protein1', 'protein2'])

# Feature extraction: sum of degrees for each protein pair
def feature_extraction(ng_data, positive_graph):
    """Score every pair by the combined degree of its two proteins.

    Parameters
    ----------
    ng_data : pandas.DataFrame
        Pairs to score; must have ``protein1`` and ``protein2`` columns.
    positive_graph : graph
        Graph whose ``degree()`` supplies the per-protein degrees.

    Returns
    -------
    pandas.Series
        One value per row of ``ng_data`` (same index): the degree of
        ``protein1`` plus the degree of ``protein2``. An empty input gives an
        empty Series.

    Raises
    ------
    KeyError
        If a protein in ``ng_data`` has no degree in ``positive_graph``.
    """
    degree_of = dict(positive_graph.degree())
    # Map each column separately instead of a row-wise apply: same values,
    # far less per-row overhead. Going through __getitem__ (rather than
    # passing the dict itself) keeps unknown proteins a KeyError instead of
    # silently turning them into NaN.
    first_degrees = ng_data['protein1'].map(degree_of.__getitem__)
    second_degrees = ng_data['protein2'].map(degree_of.__getitem__)
    return first_degrees + second_degrees

# Most Close (MC) and Most Distant (MD) selection
def most_close_distant(ng_data, positive_data, positive_graph, strategy='close'):
    """Select candidate pairs by how far their feature lies from the mean.

    Every row of ``ng_data`` is scored with ``feature_extraction``; its
    distance is the absolute difference between that score and the mean score
    over all of ``ng_data``.

    Parameters
    ----------
    ng_data : pandas.DataFrame
        Candidate pairs with ``protein1`` and ``protein2`` columns. It is not
        modified.
    positive_data : sized object
        Only its length is used: it caps the number of rows returned.
    positive_graph : graph
        Passed through to ``feature_extraction``.
    strategy : {'close', 'distant'}, default 'close'
        ``'close'`` keeps the rows nearest to the mean, ``'distant'`` the rows
        furthest from it.

    Returns
    -------
    pandas.DataFrame
        Up to ``len(positive_data)`` rows of ``ng_data``, ordered by the
        chosen strategy, without the helper ``distance`` column.

    Raises
    ------
    ValueError
        If ``strategy`` is neither ``'close'`` nor ``'distant'``.
    """
    # Reject a bad strategy before paying for the feature pass.
    if strategy not in ('close', 'distant'):
        raise ValueError("Invalid strategy. Choose 'close' or 'distant'.")

    features = feature_extraction(ng_data, positive_graph)
    distance = abs(features - features.mean())

    # assign() works on a copy, so the caller's ng_data never gains a
    # 'distance' column that would leak into other selection strategies.
    ranked = ng_data.assign(distance=distance).sort_values(
        by='distance',
        ascending=(strategy == 'close'),
    )
    return ranked.head(len(positive_data)).drop(columns=['distance'])

In [9]:
# Only the SD strategy is run here; the other strategies are left disabled.
#negative_data_rp = random_pair_selection(ng_data, positive_data)
negative_data_sd = same_degree_distribution(ng=ng, positive_graph=positive_graph)
#negative_data_mc = most_close_distant(ng_data, positive_data, positive_graph, strategy='close')
#negative_data_md = most_close_distant(ng_data, positive_data, positive_graph, strategy='distant')

In [11]:
# Preview the first rows of the selected negative pairs.
negative_data_sd.head(n=5)

Unnamed: 0,protein1,protein2
0,MGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQTPSKPASADG...,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...
1,MEAIAKYDFKATADDELSFKRGDILKVLNEECDQNWYKAELNGKDG...,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...
2,MAENVVEPGPPSAKRPKLSSPALSASASDGTDFGSLFDLEHDLPDE...,MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLS...
3,MAENLLDGPPNPKRAKLSSPGFSANDSTDFGSLFDLENDLPDELIP...,MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLS...
4,MEAAVAAPRPRLLLLVLAAAAAAAAALLPGATALQCFCHLCTKDNF...,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...


In [16]:
# Persist the selected negative pairs; index=False keeps the row index out of the file.
negative_data_sd.to_csv(
    path_or_buf='data/negative_data.csv',
    index=False,
)