In [3]:
import multiprocessing
import os
import sys
import warnings

# Make the project-local packages importable before importing from them.
sys.path.append('../Data_Feature')
sys.path.append('../Data_processing')

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from bioservices import KEGG
from pyvis.network import Network
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, matthews_corrcoef, make_scorer, confusion_matrix
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from data_processing import KOProcessor  # Assuming your data processing script is saved as 'data_processing.py'

In [None]:
# Load and preprocess data
# Dataset folder. Defaults to the original location; set the KO_DATASETS_DIR
# environment variable to run the notebook on another machine without editing it.
DATASETS_DIR = (
    os.environ.get('KO_DATASETS_DIR')
    or 'C:/Users/eliah/Documents/Master/Eliah-Masters/Datasets'
).rstrip('/\\')

# Variance cut-off used for the unsupervised feature filter at the end of this cell.
VARIANCE_THRESHOLD = 0.04

terms_zip_path = f'{DATASETS_DIR}/terms_KO.zip'
terms_csv_path = 'terms_KO.csv'
traits_reduced_zip_path = f'{DATASETS_DIR}/reducedDataset.zip'
traits_reduced_csv_path = 'reducedDataset.csv'
traits_assembled_zip_path = f'{DATASETS_DIR}/assembledDataset.zip'
traits_assembled_csv_path = 'assembledDataset.csv'

processor = KOProcessor(
    terms_zip_path,
    terms_csv_path,
    traits_reduced_zip_path,
    traits_reduced_csv_path,
    traits_assembled_zip_path=traits_assembled_zip_path,
    traits_assembled_csv_path=traits_assembled_csv_path
)

# Load and preprocess KO terms and traits
target_trait = "trophy"

# Each loader signals failure by returning None; fail fast with a clear message
# instead of letting a None propagate into the preprocessing calls below.
ko_terms = processor.load_terms()
if ko_terms is None:
    raise FileNotFoundError("KO terms could not be loaded. Please check the file paths.")

reduced_traits_data = processor.load_reduced_traits_data()
if reduced_traits_data is None:
    raise FileNotFoundError("Reduced traits data could not be loaded. Please check the file paths.")

# Debug: Print columns of reduced_traits_data
print("Columns in reduced_traits_data:", reduced_traits_data.columns.tolist())

# Uses assembled dataset if data not in reduced
# The assembled table is optional here: it is only inspected, not passed on.
traits_assembled = processor.load_assembled_traits_data()
if traits_assembled is not None:
    print("Columns in assembled_traits_data:", traits_assembled.columns.tolist())

#Feature_preprocess = processor.preprocess_features(ko_terms,reduced_traits_data)

# Preprocess KO terms and traits (trophy, gram, oxygen)
X_terms = processor.preprocess_terms(ko_terms)
y_traits = processor.preprocess_traits(reduced_traits_data, trait_column=target_trait, use_assembled_if_missing=True)

# Check if y_traits was processed correctly
if y_traits is None:
    raise ValueError(f"Traits data for {target_trait} could not be processed. Please check the log for errors.")

# Align features and labels
X_aligned, Y_aligned = processor.align_data(X_terms, y_traits)

# Feature Selection: Variance Threshold
# X_aligned is rebound to the filtered matrix returned by fit_transform.
# `selector` is kept as a global because a later cell maps its support mask
# back onto X_terms.columns to recover the names of the surviving KO terms.
selector = VarianceThreshold(threshold=VARIANCE_THRESHOLD)
X_aligned = selector.fit_transform(X_aligned)

Data loaded successfully:
   Unnamed: 0   key      KO
0           0  1000  K00001
1           1  1000  K13954
2           2  1000  K00003
3           3  1000  K00013
4           4  1000  K00014
Data loaded successfully:
   key        ID                      speciesStrain  \
0    1  592010.0   Abiotrophia defectiva ATCC 49176   
1    1     219.0     Abiotrophia defectiva DSM 9849   
2    2  159837.0       Abyssibacter profundi OUC007   
3    3       NaN  Acanthopleuribacter pedis FYK2218   
4    4  258515.0   Acetanaerobacterium elongatum Z7   

                 speciesStrainComp                genus            genusComp  \
0    abiotrophiadefectivaatcc49176          Abiotrophia          abiotrophia   
1      abiotrophiadefectivadsm9849          Abiotrophia          abiotrophia   
2       abyssibacterprofundiouc007         Abyssibacter         abyssibacter   
3  acanthopleuribacterpedisfyk2218  Acanthopleuribacter  acanthopleuribacter   
4   acetanaerobacteriumelongatumz7  Acetanaerobac

In [6]:
# Updated train_and_evaluate function using Leave-One-Out Cross-Validation for multilabel
from sklearn.multioutput import MultiOutputClassifier

def _multilabel_f_classif(X, y):
    """ANOVA F-test score function that also accepts a multi-column label matrix.

    ``f_classif`` validates ``y`` as a 1-D target, so handing it a label matrix
    with several columns raises. For a 1-D or single-column ``y`` this simply
    delegates to ``f_classif``. For several label columns it scores every
    feature against each column separately and keeps, per feature, the largest
    F-score, so a feature that is informative for any label can be selected.

    Returns:
        ``(F, pvalues)`` from ``f_classif`` for a single target, otherwise a
        1-D array of per-feature scores (``SelectKBest`` accepts both forms).
    """
    y_arr = np.asarray(y)
    if y_arr.ndim == 1 or y_arr.shape[1] == 1:
        return f_classif(X, y_arr.ravel())

    per_label_scores = []
    for label_idx in range(y_arr.shape[1]):
        f_scores, _ = f_classif(X, y_arr[:, label_idx])
        # An undefined score (e.g. a label column with a single class in this
        # training fold) must not poison the maximum taken below.
        per_label_scores.append(np.nan_to_num(f_scores, nan=0.0))
    return np.max(per_label_scores, axis=0)


def _plot_confusion_matrix(cm, trophic_level):
    """Draw one confusion matrix as an annotated heatmap."""
    fig, ax = plt.subplots(figsize=(8, 6))
    # The 'No'/'Yes' tick labels only fit a 2x2 matrix; if a label column ends
    # up with a different number of classes, fall back to automatic ticks
    # instead of failing on a tick/label count mismatch.
    tick_labels = ['No', 'Yes'] if cm.shape == (2, 2) else 'auto'
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(f'Confusion Matrix for {trophic_level} Classifier')
    plt.show()
    plt.close(fig)


def train_and_evaluate(X_aligned, Y_aligned, cv=None, n_jobs=-1):
    """Grid-search a multilabel pipeline and report per-label metrics.

    The pipeline drops constant features, keeps the ``k`` best features by
    ANOVA F-score and fits one classifier per label column through
    ``MultiOutputClassifier``. Random forest, SVC, logistic regression and
    Bernoulli naive Bayes are compared. The winning configuration is then
    re-evaluated with ``cross_val_predict`` and, for every label column, MCC,
    macro F1 and a confusion matrix are printed, plotted and collected.

    Args:
        X_aligned: Feature matrix, one row per sample.
        Y_aligned: Label table with one column per label. It must provide
            ``.columns`` and ``.iloc`` because both are used below.
        cv: Cross-validation splitter or fold count. ``None`` keeps the
            original behaviour (leave-one-out). It is also handed to
            ``cross_val_predict``, so every sample must land in a test fold
            exactly once.
        n_jobs: Parallel workers for the search and the prediction pass;
            ``-1`` uses all available cores.

    Returns:
        dict mapping each label column to
        ``{'mcc': float, 'f1_score': float, 'confusion_matrix': ndarray}``.
    """
    results = {}

    # Constant features are removed BEFORE the F-test so they never reach it;
    # the step names are unchanged, so the parameter-grid keys stay valid.
    pipeline = Pipeline([
        ('variance_threshold', VarianceThreshold(threshold=0.0)),
        ('select_k', SelectKBest(_multilabel_f_classif)),
        ('estimator', MultiOutputClassifier(RandomForestClassifier()))
    ])

    # Define a parameter grid to search over
    k_candidates = [10, 100, 1000]
    param_grid = [
        {
            'select_k__k': k_candidates,
            'estimator__estimator': [RandomForestClassifier(random_state=42)],
            'estimator__estimator__n_estimators': [100, 200],
            'estimator__estimator__max_depth': [5, 10, None]
        },
        {
            'select_k__k': k_candidates,
            'estimator__estimator': [SVC(random_state=42)],
            'estimator__estimator__C': [0.1, 1, 10],
            'estimator__estimator__kernel': ['linear', 'rbf'],
            'estimator__estimator__gamma': ['scale', 'auto']
        },
        {
            'select_k__k': k_candidates,
            'estimator__estimator': [LogisticRegression(max_iter=1000)],
            'estimator__estimator__C': [0.1, 1, 10.0]
        },
        {
            'select_k__k': k_candidates,
            'estimator__estimator': [BernoulliNB()],
            'estimator__estimator__alpha': [0.1, 1.0, 10.0],
            'estimator__estimator__binarize': [0.0]
        }
    ]

    # Leave-one-out is the default; pass e.g. cv=5 for a much cheaper run.
    if cv is None:
        cv = LeaveOneOut()
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, n_jobs=n_jobs, verbose=1)
    grid_search.fit(X_aligned, Y_aligned)

    print("Best parameters found for multilabel classification:", grid_search.best_params_)
    print("Best cross-validation score for multilabel classification: {:.3f}".format(grid_search.best_score_))

    # Out-of-fold predictions of the selected configuration.
    # NOTE: these come from the same data that was used to pick the
    # configuration, so the metrics below are optimistic estimates.
    best_model = grid_search.best_estimator_
    Y_pred = np.asarray(cross_val_predict(best_model, X_aligned, Y_aligned, cv=cv, n_jobs=n_jobs))
    if Y_pred.ndim == 1:
        # Keep the column indexing below valid for a single-label target.
        Y_pred = Y_pred.reshape(-1, 1)

    f1_scores = []
    mcc_scores = []

    # Evaluate performance for each trophic level
    for i, trophic_level in enumerate(Y_aligned.columns):
        Y_true = Y_aligned.iloc[:, i]
        Y_trait_pred = Y_pred[:, i]
        mcc = matthews_corrcoef(Y_true, Y_trait_pred)
        f1 = f1_score(Y_true, Y_trait_pred, average='macro')
        f1_scores.append(f1)
        mcc_scores.append(mcc)
        print(f"Trophic Level: {trophic_level}")
        print(f"Matthews Correlation Coefficient: {mcc:.3f}")
        print(f"F1 Score: {f1:.3f}")

        # Display confusion matrix
        cm = confusion_matrix(Y_true, Y_trait_pred)
        _plot_confusion_matrix(cm, trophic_level)

        # Store results
        results[trophic_level] = {
            'mcc': mcc,
            'f1_score': f1,
            'confusion_matrix': cm
        }

    # Print final results
    for trophic_level, result in results.items():
        print(f"Trophic Level: {trophic_level}")
        print(f"MCC: {result['mcc']:.3f}")
        print(f"F1 Score: {result['f1_score']:.3f}")
        print(f"Confusion Matrix:\n{result['confusion_matrix']}\n")

    # F1 Score vs. MCC Graph
    fig, ax = plt.subplots(figsize=(10, 6))
    x = np.arange(len(Y_aligned.columns))
    width = 0.35
    ax.bar(x - width/2, f1_scores, width, label='F1 Score', color='blue')
    ax.bar(x + width/2, mcc_scores, width, label='MCC', color='green')
    ax.set_xlabel('Traits')
    ax.set_ylabel('Score')
    ax.set_title('F1 Score vs. MCC for each Trait')
    ax.set_xticks(x)
    ax.set_xticklabels(Y_aligned.columns, rotation=45)
    ax.legend()
    fig.tight_layout()
    plt.show()

    return results

# Run the function with your data
# The per-label metrics are bound to a name so they can be reused in later
# cells and are not echoed a second time as the cell's output.
trophy_results = train_and_evaluate(X_aligned, Y_aligned)


Fitting 594 folds for each of 72 candidates, totalling 42768 fits


KeyboardInterrupt: 

In [None]:

# KEGG Pathway Mapping
def map_ko_to_pathways(ko_terms, kegg=None):
    """Map KO identifiers to the KEGG pathways they are linked to.

    One ``link("pathway", ko)`` request is issued per KO term. The lookup is
    best-effort: a term whose request raises, or whose response is not text,
    is reported on stdout and skipped so the remaining terms are still mapped.

    Args:
        ko_terms: Iterable of KO identifiers; each one is passed unchanged as
            the second argument of ``kegg.link``.
        kegg: Optional client exposing ``link(target, source)``. When omitted
            a new ``KEGG()`` client is created, which is the original
            behaviour. Passing one allows reuse of an existing client.

    Returns:
        dict mapping a pathway id (the second tab-separated field of each
        response line) to the set of KO terms that produced it.
    """
    if kegg is None:
        kegg = KEGG()

    pathways = {}
    for ko in ko_terms:
        try:
            gene_links = kegg.link("pathway", ko)
        except Exception as e:
            print(f"Error processing {ko}: {e}")
            continue

        if not isinstance(gene_links, str):
            # NOTE(review): bioservices appears to return a non-string value
            # (e.g. an HTTP status code) when a request fails - confirm.
            # Report it explicitly rather than via an AttributeError.
            if gene_links:
                print(f"Error processing {ko}: unexpected KEGG response {gene_links!r}")
            continue

        for entry in gene_links.strip().split("\n"):
            fields = entry.split("\t")
            # Lines without a tab (including an empty response) carry no link.
            if len(fields) >= 2:
                pathways.setdefault(fields[1], set()).add(ko)
    return pathways

# Names of the KO terms that survived the variance filter.
# NOTE(review): the mask comes from a selector fitted on the aligned matrix but
# is applied to X_terms.columns - this assumes alignment left the feature
# columns (and their order) unchanged; confirm against align_data.
selected_important_features = X_terms.columns[selector.get_support()]
pathways = map_ko_to_pathways(selected_important_features)

# Creating the adjacency matrix with translated KO terms, including original KO term
translated_kos = {ko: f"Translated_{ko}" for ko in selected_important_features}  # Placeholder for actual translation function

kegg = KEGG()


def _fetch_pathway_name(pathway_id):
    """Return the first KEGG NAME of ``pathway_id``, or the id itself if unavailable."""
    try:
        parsed_info = kegg.parse(kegg.get(pathway_id))
    except Exception as e:
        # Best-effort, like the KO lookup: one failed request must not abort the cell.
        print(f"Error fetching name for {pathway_id}: {e}")
        return pathway_id
    if isinstance(parsed_info, dict) and parsed_info.get('NAME'):
        return parsed_info['NAME'][0]
    return pathway_id


# Resolve every pathway name once; the matrix and the network both reuse this
# mapping instead of querying KEGG twice per pathway.
pathway_names = {pathway_id: _fetch_pathway_name(pathway_id) for pathway_id in pathways}

selected_ko_set = set(selected_important_features)
pathway_matrix = pd.DataFrame(
    index=[f"{translated_kos[ko]} ({ko})" for ko in selected_important_features],
    columns=list(pathways),
    data=0
)
for pathway, kos in pathways.items():
    for ko in kos:
        if ko in selected_ko_set:
            pathway_matrix.loc[f"{translated_kos[ko]} ({ko})", pathway] = 1

# Rename pathway ids to readable names in one step (no in-place mutation).
# Two ids that resolve to the same name yield two columns with that name.
pathway_matrix = pathway_matrix.rename(columns=pathway_names)

print("Pathway matrix after renaming:\n", pathway_matrix)

# Heatmap visualization
if pathway_matrix.empty:
    # Nothing was mapped (e.g. every KEGG lookup failed); an empty frame cannot be drawn.
    print("No KO-to-pathway links available; skipping heatmap.")
else:
    fig, ax = plt.subplots()
    sns.heatmap(pathway_matrix, annot=True, cmap="Greys", cbar=False, ax=ax)
    ax.set_title('Adjacency Matrix of KO Terms and Pathways (Multilabel)')
    ax.set_xlabel('Pathways')
    ax.set_ylabel('KO Terms')
    plt.show()

# Network Visualization
G = nx.Graph()

# Define a list of general pathways to exclude
excluded_pathways = ["metabolic pathways"]  # You can add more general terms here

# Add nodes and edges with renamed pathway names
for ko in selected_important_features:
    translated_label = f"{translated_kos[ko]} ({ko})"
    G.add_node(ko, title=translated_label, label=translated_label, color='red', size=20)

for pathway_id, kos in pathways.items():
    pathway_name = pathway_names[pathway_id]
    # Exclusion is matched case-insensitively on the display name.
    if pathway_name.lower() not in excluded_pathways:
        G.add_node(pathway_name, title=pathway_name, label=pathway_name, color='blue', size=30)
        for ko in kos:
            G.add_edge(ko, pathway_name)

# Pyvis network visualization
nt = Network("800px", "1200px", notebook=True, heading='Interactive Network of KO Terms and Pathways (Multilabel)', bgcolor="#ffffff", font_color="black", cdn_resources='remote')
nt.from_nx(G)
nt.toggle_physics(True)
nt.show_buttons(filter_=['physics'])

# `nt.save_graph` without a call was a no-op; actually write the HTML file.
network_html_path = "ko_pathway_network.html"
nt.save_graph(network_html_path)
print(f"Interactive network written to {network_html_path}")
