## Feature Selection and KNN

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn import metrics

def select_features_and_predict(train_data_path, target, processed_proteins, num_best_features, num_neighbors=None):
    """Greedily select the features that best predict ``target`` on the training set.

    Forward selection: starting from an empty set, each round fits one
    ``LogisticRegression`` per remaining candidate (on the already selected
    columns plus that candidate) and keeps the candidate with the highest
    training accuracy.

    Args:
        train_data_path: Path of a CSV file; its first column is used as the index.
        target: Name of the label column in that file.
        processed_proteins: Iterable of column names allowed as candidates;
            only names that are also columns of the CSV are considered.
        num_best_features: Maximum number of features to select.
        num_neighbors: Accepted for backward compatibility; this function
            does not read it.

    Returns:
        The selected column names, in the order they were picked. The list is
        shorter than ``num_best_features`` when fewer candidates are available.

    Raises:
        ValueError: If ``target`` is not a column of the CSV file.
    """
    # Load data
    train = pd.read_csv(train_data_path, index_col=0)

    if target not in train.columns:
        raise ValueError(f"target column {target!r} not found in {train_data_path!r}")

    # Candidates are kept in CSV column order so that ties between equally
    # accurate features are broken the same way on every run (iterating a set
    # of strings is not reproducible across interpreter sessions).
    allowed = set(processed_proteins)
    features = [column for column in train.columns if column != target and column in allowed]
    print("Matched items:", len(features))

    labels = train[target]
    label_values = labels.to_numpy()
    best = []

    # Feature selection loop; stops early once every candidate has been used.
    while len(best) < num_best_features and features:
        # Start below any reachable accuracy so a candidate is always chosen,
        # even when every model scores 0.
        max_acc = -1.0
        max_column = None
        for new_column in features:
            columns = best + [new_column]
            model = LogisticRegression()
            model.fit(train[columns], labels)
            target_predicted = model.predict(train[columns])
            # Fraction of correctly predicted training rows. Computed directly
            # rather than through the module-level name `metrics`, which the
            # evaluation cell of this notebook rebinds to its own results.
            acc = float((target_predicted == label_values).mean())
            if acc > max_acc:
                max_acc = acc
                max_column = new_column

        best.append(max_column)
        features.remove(max_column)
        print('Best columns:', best)
        print('Accuracy:', max_acc)

    return best

# best_features = select_features_and_predict('data/train.csv', 'group', processed_proteins, 4, 4)

In [None]:


# Select the 4 most predictive protein columns for the 'group' label.
# The function signature also declares `num_neighbors`; it is passed here (same
# value as the example call above) so the call matches the signature, although
# the function body does not read it.
best_features = select_features_and_predict('data/train.csv', 'group', processed_proteins, 4, 4)


In [None]:
# Translate the selected feature names into positions in the embedding tensor.
# NOTE(review): `map_names_to_tensor_indices` is defined outside this cell; the
# message below suggests these are PyTorch Geometric node indices — confirm.
indices = map_names_to_tensor_indices(best_features, pipeline, existing_proteins_list)
print("Index of the desired protein in the PyTorch Geometric tensor:", indices)
# Load the saved embedding, detach it from the autograd graph and convert it
# to a NumPy array.
Z = torch.load('clusterGCN_embedding_relu_normalize.pt').detach().numpy()
# Keep only the embedding rows that belong to the selected features.
# Note: `indices` is reassigned later by the nearest-neighbour cell.
best_feature_embedding = Z[indices]

In [None]:
# Show the selected feature names as the cell output.
best_features

In [None]:
import numpy as np
# Kept so that the name stays available to any later cell that relies on it;
# the similarity computation below no longer needs it.
from sklearn.metrics.pairwise import euclidean_distances

# Flatten every selected-feature embedding into a single row vector.
# NOTE(review): the neighbour search below is fitted and queried on
# `protein_embeddings`, not on `best_feature_embedding` — confirm whether the
# selected-feature embeddings were meant to be the query.
best_feature_embedding = best_feature_embedding.reshape(best_feature_embedding.shape[0], -1)
# best_feature_embedding = best_feature_embedding.reshape(1, -1)
# best_feature_embedding = normalize(best_feature_embedding)
knn = NearestNeighbors(n_neighbors=3, algorithm='auto', metric='euclidean')
knn.fit(protein_embeddings)

# For every protein embedding: its 3 nearest rows and the Euclidean distances
# to them. Because the query set is the fitted set, a row can be returned as
# its own neighbour.
distances, indices = knn.kneighbors(protein_embeddings)
print("Nearest Neighbors Indices:", indices)
print("Distances:", distances)

# Calculate the mean similarity score using Euclidean distance.
# The distances returned by `kneighbors` are reused instead of recomputing
# each pair; similarity is 1 / (1 + distance), and a row's match with itself
# is excluded from its mean.
row_ids = np.arange(indices.shape[0])[:, None]
is_other_row = indices != row_ids
pair_similarity = 1.0 / (1.0 + distances)
similarity_scores = [
    row[keep].mean() for row, keep in zip(pair_similarity, is_other_row)
]

average_similarity = np.mean(similarity_scores)
print(f'Average Similarity Score (Euclidean): {average_similarity:.4f}')


## Load Samples

In [1]:
import pandas as pd

# Expression table: read the workbook and discard its first column.
bio_df = pd.read_excel('data/expression_data.xlsx', engine='openpyxl')
first_column = bio_df.columns[0]
bio_df = bio_df.drop(columns=first_column)

# The remaining column labels are the protein identifiers.
proteins = bio_df.columns.tolist()

# Tab-separated UniProt-to-Reactome mapping table.
uniprot_to_reactome = pd.read_csv('data/MMU_Uniprot2Reactome.txt', sep='\t')

# Distinct 'V1' values of the mapping that also occur as expression columns.
in_expression_data = uniprot_to_reactome['V1'].isin(proteins)
existing_proteins = uniprot_to_reactome.loc[in_expression_data, 'V1'].unique()

existing_proteins_list = existing_proteins.tolist()
# existing_proteins_list

# Expression values restricted to the first 30 matched proteins.
first_thirty = existing_proteins_list[:30]
proteins_df = bio_df[first_thirty]

## Evaluation

In [None]:
from utils_functions import *
from sklearn.cluster import AgglomerativeClustering
from train import GAEPipeline
from train import GCNEncoder
# Evaluation setup: load the graph, generate embeddings from the saved models
# and score them with a 3-cluster KMeans.
model_dir = 'latest_models_normalize/best_models_kmeans_k=3/64/'  # Directory where model files are saved
# NOTE(review): `result_dir` is not referenced again in the visible notebook.
result_dir = 'evaluation_results'
n_clusters = 3 # Number of clusters for KMeans

# Pipeline configured for the 'clusterGCN' sampling method.
pipeline = GAEPipeline(in_channels=15, out_channels=32, sampling_method='clusterGCN', hidden_channels = 20, num_layers = 1, dropout_rate=0.05)
graph = pipeline.load_graph_from_pickle('combined_graph_latest.pkl')

data = pipeline.preprocess_graph(graph)
models = generate_embeddings_from_models(model_dir, data, pipeline)
protein_indices_in_pyg = get_protein_indices_in_pyg(graph, existing_proteins_list)
# NOTE(review): `KMeans` is not imported explicitly in this cell; presumably it
# comes from the wildcard import of `utils_functions` — confirm.
kmeans = KMeans(n_clusters=n_clusters, random_state=2)
#agg_clustering = AgglomerativeClustering(n_clusters=n_clusters)
# NOTE: this rebinds the module-level name `metrics`, which the first cell
# imports from sklearn; code that expects the sklearn module under that name
# needs the import re-run after this cell.
metrics, emd_metrics,  frobenius_norm_metrics = evaluate_embeddings(models, n_clusters,protein_indices_in_pyg, pipeline, kmeans)



## Graph Data Summary

In [None]:
# Summary statistics of the combined graph.
G = pipeline.load_graph_from_pickle('combined_graph_latest.pkl')

# Number of Nodes (Vertices)
num_nodes = G.number_of_nodes()

# Number of Edges
num_edges = G.number_of_edges()

# Is the Graph Directed?
is_directed = G.is_directed()

# Average Degree (0.0 for a graph without nodes, instead of dividing by zero)
degrees = [degree for _, degree in G.degree()]
average_degree = sum(degrees) / len(degrees) if degrees else 0.0

# Graph Density
density = nx.density(G)

# Diameter of Graph
# `nx.is_connected` is only implemented for undirected graphs and raises on an
# empty graph, so directed graphs are checked for strong connectivity and the
# empty graph is handled separately. A graph that is not (strongly) connected
# is reported with an infinite diameter.
if num_nodes == 0:
    diameter = float('inf')
else:
    if is_directed:
        connected = nx.is_strongly_connected(G)
    else:
        connected = nx.is_connected(G)
    diameter = nx.diameter(G) if connected else float('inf')

print(f"Number of Nodes (Vertices): {num_nodes}")
print(f"Number of Edges: {num_edges}")
print(f"Average Degree: {average_degree}")
print(f"Graph Density: {density}")
print(f"Diameter of Graph: {diameter}")
print(f"Is the Graph Directed?: {is_directed}")



In [None]:
# Print the entry at index 5 of the Frobenius-norm results produced by the
# evaluation cell.
print(frobenius_norm_metrics[5])
# df = pd.DataFrame(emd_metrics)
# for index, row in df.iterrows():
#     print(f"EMD between {row['model1']} and {row['model2']}: {row['emd']:.3f}")