In [1]:
import sys
sys.path.append('../Data_Feature')
sys.path.append('../Data_processing')
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, matthews_corrcoef, make_scorer, confusion_matrix
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
import matplotlib.pyplot as plt
import seaborn as sns
from data_processing import KOProcessor, TraitManager  # Assuming your data processing script is saved as 'data_processing.py'
import networkx as nx
from pyvis.network import Network
from bioservices import KEGG
import multiprocessing
import warnings

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


c:\Users\eliah\Documents\Master\Eliah-Masters\Pipelines


In [None]:

# Load and preprocess data
# NOTE(review): DATASET_DIR is a machine-specific absolute path; point it at
# your local copy of the datasets before running this cell.
DATASET_DIR = 'C:/Users/eliah/Documents/Master/Eliah-Masters/Datasets'

terms_zip_path = f'{DATASET_DIR}/terms_KO.zip'
terms_csv_path = 'terms_KO.csv'
traits_reduced_zip_path = f'{DATASET_DIR}/reducedDataset.zip'
traits_reduced_csv_path = 'reducedDataset.csv'
traits_assembled_zip_path = f'{DATASET_DIR}/assembledDataset.zip'
traits_assembled_csv_path = 'assembledDataset.csv'


def _require_result(value, step):
    """Return `value`, or raise a readable error if a processing step produced None.

    Without this check a None result is handed to the next step and only
    surfaces later as an opaque
    "AttributeError: 'NoneType' object has no attribute ...".
    """
    if value is None:
        raise ValueError(
            f"{step} returned None; check the input files and the requested trait column."
        )
    return value


processor = KOProcessor(
    terms_zip_path,
    terms_csv_path,
    traits_reduced_zip_path,
    traits_reduced_csv_path,
    traits_assembled_zip_path=traits_assembled_zip_path,
    traits_assembled_csv_path=traits_assembled_csv_path
)

# Load and preprocess KO terms and traits
trait_column = 'gram'
ko_terms = _require_result(processor.load_terms(), 'load_terms()')
reduced_traits_data = _require_result(
    processor.load_reduced_traits_data(), 'load_reduced_traits_data()'
)

X_terms = _require_result(processor.preprocess_terms(ko_terms), 'preprocess_terms()')
y_traits = _require_result(
    processor.preprocess_traits(
        reduced_traits_data,
        trait_column=trait_column,
        use_assembled_if_missing=True
    ),
    f"preprocess_traits(trait_column='{trait_column}')"
)

# Align features and labels
X_aligned, Y_aligned = processor.align_data(X_terms, y_traits)
_require_result(X_aligned, 'align_data() features')
_require_result(Y_aligned, 'align_data() labels')

# Feature Selection: Variance Threshold
# `selector` is kept because a later cell indexes X_terms.columns with its
# support mask.
# NOTE(review): that later lookup assumes X_aligned has the same columns, in
# the same order, as X_terms -- confirm against align_data().
selector = VarianceThreshold(threshold=0.01)
X_aligned = selector.fit_transform(X_aligned)

Data loaded successfully:
   Unnamed: 0   key      KO
0           0  1000  K00001
1           1  1000  K13954
2           2  1000  K00003
3           3  1000  K00013
4           4  1000  K00014
Data loaded successfully:
   key        ID                      speciesStrain  \
0    1  592010.0   Abiotrophia defectiva ATCC 49176   
1    1     219.0     Abiotrophia defectiva DSM 9849   
2    2  159837.0       Abyssibacter profundi OUC007   
3    3       NaN  Acanthopleuribacter pedis FYK2218   
4    4  258515.0   Acetanaerobacterium elongatum Z7   

                 speciesStrainComp                genus            genusComp  \
0    abiotrophiadefectivaatcc49176          Abiotrophia          abiotrophia   
1      abiotrophiadefectivadsm9849          Abiotrophia          abiotrophia   
2       abyssibacterprofundiouc007         Abyssibacter         abyssibacter   
3  acanthopleuribacterpedisfyk2218  Acanthopleuribacter  acanthopleuribacter   
4   acetanaerobacteriumelongatumz7  Acetanaerobac

AttributeError: 'NoneType' object has no attribute 'index'

In [None]:
# Updated train_and_evaluate function using Leave-One-Out Cross-Validation
def train_and_evaluate(X_aligned, Y_aligned, trait_name=None):
    """Grid-search several classifiers with Leave-One-Out CV and report the winner.

    A pipeline of SelectKBest(f_classif) -> VarianceThreshold(0.0) -> estimator
    is tuned with GridSearchCV over four estimator families (random forest,
    SVC, logistic regression, Bernoulli naive Bayes). The best configuration
    is then re-run through `cross_val_predict` with the same Leave-One-Out
    splitter to obtain per-sample predictions, from which the Matthews
    correlation coefficient, the confusion matrix and the macro F1 score are
    computed and plotted.

    Note: the predictions are produced for a configuration that was selected
    on the same samples (this is not nested cross-validation), so the reported
    metrics should be read as optimistic.

    Parameters
    ----------
    X_aligned : array-like
        Feature matrix passed directly to `GridSearchCV.fit`.
    Y_aligned : array-like or pandas object
        Labels for the trait; converted to a flat 1-D numpy array.
    trait_name : str, optional
        Name used in log messages, plot titles and as the key of the returned
        dict. Defaults to the notebook-level `trait_column` variable.

    Returns
    -------
    dict
        `{trait_name: {'best_params', 'best_score', 'mcc', 'confusion_matrix'}}`.
    """
    if trait_name is None:
        # Backward compatible: fall back to the global set in the loading cell.
        trait_name = trait_column

    results = {}
    print(f"Processing trait: {trait_name}")

    # Binary labels for the current trait. np.asarray accepts pandas objects,
    # numpy arrays and plain lists alike.
    Y_current = np.asarray(Y_aligned).ravel()

    # Define a pipeline; the 'estimator' step is a placeholder that every
    # entry of the parameter grid replaces.
    pipeline = Pipeline([
        ('select_k', SelectKBest(f_classif)),
        ('variance_threshold', VarianceThreshold(threshold=0.0)),
        ('estimator', RandomForestClassifier())
    ])

    # Define a parameter grid to search over
    k_candidates = [10, 100, 1000]
    param_grid = [
        {
            'select_k__k': k_candidates,
            'estimator': [RandomForestClassifier(random_state=42)],
            'estimator__n_estimators': [100, 200],
            'estimator__max_depth': [5, 10, None]
        },
        {
            'select_k__k': k_candidates,
            'estimator': [SVC(random_state=42)],
            'estimator__C': [0.1, 1, 10],
            'estimator__kernel': ['linear', 'rbf'],
            'estimator__gamma': ['scale', 'auto']
        },
        {
            'select_k__k': k_candidates,
            'estimator': [LogisticRegression(max_iter=1000)],
            'estimator__C': [0.01, 0.1, 1, 10, 100]
        },
        {
            'select_k__k': k_candidates,
            'estimator': [BernoulliNB()],
            'estimator__alpha': [0.01, 0.1, 1.0, 10.0],
            'estimator__binarize': [0.0]
        }
    ]

    # Grid search with Leave-One-Out cross-validation using multiple CPU cores.
    # No `scoring` is passed, so each estimator's default `score` is used.
    loo = LeaveOneOut()
    grid_search = GridSearchCV(pipeline, param_grid, cv=loo, n_jobs=-1, verbose=1)
    grid_search.fit(X_aligned, Y_current)

    print(f"Best parameters found for {trait_name}:", grid_search.best_params_)
    print(f"Best cross-validation score for {trait_name}: {grid_search.best_score_:.3f}")

    # Use cross_val_predict to get one out-of-fold prediction per sample
    best_model = grid_search.best_estimator_
    Y_pred = cross_val_predict(best_model, X_aligned, Y_current, cv=loo, n_jobs=-1)

    mcc = matthews_corrcoef(Y_current, Y_pred)
    print(f"Matthews Correlation Coefficient for {trait_name}: {mcc:.3f}")

    # Display confusion matrix
    # NOTE(review): tick labels assume exactly two classes whose sorted order
    # is (negative, positive) -- confirm against the label encoding.
    cm = confusion_matrix(Y_current, Y_pred)
    class_names = ['Negative', 'Positive']
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(f'Confusion Matrix for {trait_name} Classifier')
    plt.show()

    # Store results
    results[trait_name] = {
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'mcc': mcc,
        'confusion_matrix': cm
    }

    # Print final results
    summary = results[trait_name]
    print(f"Trait: {trait_name}")
    print(f"Best Parameters: {summary['best_params']}")
    print(f"Best Cross-Validation Score: {summary['best_score']:.3f}")
    print(f"MCC: {summary['mcc']:.3f}")
    print(f"Confusion Matrix:\n{summary['confusion_matrix']}\n")

    # F1 Score vs. MCC Graph
    f1 = f1_score(Y_current, Y_pred, average='macro')
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(['F1 Score', 'MCC'], [f1, mcc], color=['blue', 'green'])
    ax.set_ylabel('Score')
    ax.set_title('F1 Score vs. MCC')
    plt.show()

    # Hand the collected metrics back so later cells can reuse them instead of
    # re-running the (expensive) search.
    return results

# Launch model selection and evaluation on the aligned feature matrix and labels
train_and_evaluate(X_aligned=X_aligned, Y_aligned=Y_aligned)

In [None]:
# KEGG Pathway Mapping
def map_ko_to_pathways(ko_terms, kegg=None):
    """Group KO terms by the KEGG pathways they are linked to.

    For every KO term, `kegg.link("pathway", ko)` is queried and its response
    is read as newline-separated, tab-delimited records whose second field is
    the pathway id. Lookups are best-effort: a term whose request raises, or
    whose response is not text, is reported with a printed message and skipped
    so one bad term does not abort the whole mapping.

    Parameters
    ----------
    ko_terms : iterable
        KO identifiers to look up; each must be hashable.
    kegg : object, optional
        Client exposing `link(target, source)`. A new `KEGG()` client is
        created when omitted, which keeps existing calls working. Passing one
        in allows client reuse and offline testing.

    Returns
    -------
    dict
        Maps pathway id (str) to the set of KO terms linked to it. Terms with
        no usable response do not appear.
    """
    if kegg is None:
        kegg = KEGG()

    pathways = {}
    for ko in ko_terms:
        try:
            gene_links = kegg.link("pathway", ko)
        except Exception as e:
            print(f"Error processing {ko}: {e}")
            continue

        if not gene_links:
            # Empty response: nothing linked to this term.
            continue
        if not isinstance(gene_links, str):
            # The client handed back something other than text (for example a
            # numeric status code); report it explicitly instead of failing on
            # a string method further down.
            print(f"Error processing {ko}: unexpected KEGG response {gene_links!r}")
            continue

        for entry in gene_links.strip().split("\n"):
            split_entry = entry.split("\t")
            if len(split_entry) >= 2:
                # Field 0 is the KO id echoed back by KEGG; field 1 is the pathway id.
                pathways.setdefault(split_entry[1], set()).add(ko)
    return pathways

# KO terms that survived the variance-threshold selector fitted earlier.
# NOTE(review): this is every retained feature, not a ranked "importance"
# subset, so the KEGG lookups below issue one request per retained KO term.
selected_important_features = X_terms.columns[selector.get_support()]
pathways = map_ko_to_pathways(selected_important_features)

# Creating the adjacency matrix with translated KO terms, including original KO term
translated_kos = {ko: f"Translated_{ko}" for ko in selected_important_features}  # Placeholder for actual translation function
ko_labels = {ko: f"{translated_kos[ko]} ({ko})" for ko in selected_important_features}
pathway_matrix = pd.DataFrame(
    index=[ko_labels[ko] for ko in selected_important_features],
    columns=list(pathways.keys()),
    data=0
)
for pathway, kos in pathways.items():
    for ko in kos:
        if ko in ko_labels:
            pathway_matrix.loc[ko_labels[ko], pathway] = 1

# Fetch pathway names once and reuse them for both the heatmap columns and the
# network nodes, so each pathway costs a single KEGG request.
kegg = KEGG()


def _pathway_display_name(pathway_id):
    """Return the first KEGG NAME entry for `pathway_id`, or the id itself on any failure."""
    try:
        parsed_info = kegg.parse(kegg.get(pathway_id))
    except Exception as e:
        print(f"Could not fetch name for {pathway_id}: {e}")
        return pathway_id
    # Only trust the parsed record if it is a mapping with a non-empty NAME entry.
    if isinstance(parsed_info, dict) and parsed_info.get('NAME'):
        return parsed_info['NAME'][0]
    return pathway_id


pathway_names = {pathway_id: _pathway_display_name(pathway_id) for pathway_id in pathways}

# Rename pathway columns for readability
pathway_matrix = pathway_matrix.rename(columns=pathway_names)

print("Pathway matrix after renaming:\n", pathway_matrix)

# Heatmap visualization
if pathway_matrix.empty:
    # Nothing to draw when no KO term could be mapped to a pathway.
    print("No KO-to-pathway links found; skipping heatmap.")
else:
    sns.heatmap(pathway_matrix, annot=True, cmap="Greys", cbar=False)
    plt.title(f'Adjacency Matrix of KO Terms and Pathways ({trait_column})')
    plt.xlabel('Pathways')
    plt.ylabel('KO Terms')
    plt.show()

# Network Visualization
G = nx.Graph()

# Define a list of general pathways to exclude
excluded_pathways = ["metabolic pathways"]  # You can add more general terms here

# Add nodes and edges with renamed pathway names
for ko in selected_important_features:
    translated_label = ko_labels[ko]
    G.add_node(ko, title=translated_label, label=translated_label, color='red', size=20)

for pathway_id, kos in pathways.items():
    pathway_name = pathway_names[pathway_id]
    # Compare case-insensitively against the exclusion list.
    if pathway_name.lower() not in excluded_pathways:
        G.add_node(pathway_name, title=pathway_name, label=pathway_name, color='blue', size=30)
        for ko in kos:
            G.add_edge(ko, pathway_name)

# Pyvis network visualization
nt = Network("800px", "1200px", notebook=True, heading=f'Interactive Network of KO Terms and Pathways ({trait_column})', bgcolor="#ffffff", font_color="black", cdn_resources='remote')
nt.from_nx(G)
nt.toggle_physics(True)
nt.show_buttons(filter_=['physics'])
nt.save_graph(f"ko_network_{trait_column}.html")
