In [1]:
import sys
import warnings
import os
import seaborn as sns
from scipy.stats import pearsonr
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.transforms as transforms
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold, KFold, LeaveOneOut, cross_val_predict
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score, ConfusionMatrixDisplay, make_scorer, matthews_corrcoef, roc_curve, auc
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder 
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
import plotly.figure_factory as ff
import networkx as nx
from pyvis.network import Network
import zipfile 
from bioservices import KEGG
from K_func import translate_ko_terms
import numpy as np
from data_processing import KOProcessor  # Assumes the data-processing script is saved as 'data_processing.py'

ModuleNotFoundError: No module named 'K_func'

In [None]:

# Load and preprocess data
# All dataset archives live in one directory. Set the KO_DATASETS_DIR
# environment variable to point somewhere else when running on another
# machine; when it is unset (or empty) the original location is used, so the
# resulting paths are identical to the previously hardcoded ones.
DATASETS_DIR = (
    os.environ.get('KO_DATASETS_DIR')
    or 'C:/Users/eliah/Documents/Master/Eliah-Masters/Datasets'
).rstrip('/\\')

# Each dataset is described by the zip archive path and the CSV name that is
# passed alongside it to KOProcessor.
terms_zip_path = f'{DATASETS_DIR}/terms_KO.zip'
terms_csv_path = 'terms_KO.csv'
traits_reduced_zip_path = f'{DATASETS_DIR}/reducedDataset.zip'
traits_reduced_csv_path = 'reducedDataset.csv'
traits_assembled_zip_path = f'{DATASETS_DIR}/assembledDataset.zip'
traits_assembled_csv_path = 'assembledDataset.csv'

processor = KOProcessor(
    terms_zip_path,
    terms_csv_path,
    traits_reduced_zip_path,
    traits_reduced_csv_path,
    traits_assembled_zip_path=traits_assembled_zip_path,
    traits_assembled_csv_path=traits_assembled_csv_path
)

# Load and preprocess KO terms and traits
# NOTE(review): the exact output types of the KOProcessor methods are defined
# in data_processing.py; the code below relies on X_terms exposing `.columns`
# and on y supporting `.iloc` -- confirm against that module.
target_trait = 'trophy'
ko_terms = processor.load_terms()
reduced_traits_data = processor.load_reduced_traits_data()

X_terms = processor.preprocess_terms(ko_terms)
y_traits = processor.preprocess_traits(reduced_traits_data, trait_column=target_trait, use_assembled_if_missing=True)

# Align features and labels
X, y = processor.align_data(X_terms, y_traits)

# Feature Selection: Variance Threshold
# Features whose variance does not exceed this value are dropped.
VARIANCE_THRESHOLD = 0.04
selector = VarianceThreshold(threshold=VARIANCE_THRESHOLD)
# NOTE(review): the selector is fit on every sample before the leave-one-out
# split below. VarianceThreshold does not look at the labels, but each
# held-out sample still takes part in deciding which features are kept.
# With scikit-learn's default output settings, X is a NumPy array from here on.
X = selector.fit_transform(X)

# Define the model and parameter grid
# Fixed random_state keeps the forest reproducible between runs.
model = RandomForestClassifier(random_state=42)
# 2 * 3 * 2 * 2 = 24 hyper-parameter combinations are searched per fit.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
# Train and evaluate model with Leave-One-Out Cross-Validation
def _n_rows(data):
    """Return the number of rows (samples) in an array-like or pandas object."""
    shape = getattr(data, 'shape', None)
    return shape[0] if shape else len(data)


def _take_rows(data, positions):
    """Select rows by integer position from a pandas object, array or list."""
    if hasattr(data, 'iloc'):
        # pandas objects: plain [] would index by label / column instead.
        return data.iloc[positions]
    if isinstance(data, (list, tuple)):
        data = np.asarray(data)
    return data[positions]


def train_and_evaluate(X, y, model, param_grid):
    """Evaluate a grid-searched model with leave-one-out cross-validation.

    For every sample, a ``GridSearchCV`` (3-fold, macro-F1 scoring, all CPU
    cores) is fit on the remaining samples and its best estimator predicts the
    held-out sample. This means one full grid search per sample, which is
    expensive for large datasets or grids.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix, one row per sample.
    y : array-like or pandas.Series
        Labels, positionally aligned with the rows of ``X``.
    model : estimator
        Estimator handed to ``GridSearchCV``.
    param_grid : dict
        Hyper-parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    tuple of (list, list, list)
        ``y_true`` (held-out labels), ``y_pred`` (predictions for them) and
        ``best_models`` (the best estimator of each iteration), all in sample
        order.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """
    n_samples = _n_rows(X)
    n_labels = _n_rows(y)
    if n_samples != n_labels:
        # Positional indexing would otherwise silently ignore surplus labels.
        raise ValueError(
            f'X and y must have the same number of samples, got {n_samples} and {n_labels}'
        )

    # The scorer does not depend on the fold, so build it once.
    scorer = make_scorer(f1_score, average='macro')
    loo = LeaveOneOut()
    best_models = []
    y_true, y_pred = [], []

    for train_index, test_index in loo.split(X):
        X_train, X_test = _take_rows(X, train_index), _take_rows(X, test_index)
        y_train, y_test = _take_rows(y, train_index), _take_rows(y, test_index)

        grid_search = GridSearchCV(model, param_grid, scoring=scorer, cv=3, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_

        # Each test split holds exactly one sample, hence the [0].
        y_pred.append(best_model.predict(X_test)[0])
        y_true.append(getattr(y_test, 'values', y_test)[0])

        best_models.append(best_model)

    return y_true, y_pred, best_models

# Run the leave-one-out evaluation; one best estimator is kept per held-out sample.
y_true, y_pred, best_models = train_and_evaluate(X, y, model, param_grid)

# Summary metrics over all held-out predictions: macro F1 and MCC
f1 = f1_score(y_true, y_pred, average='macro')
mcc = matthews_corrcoef(y_true, y_pred)
print(f'F1 Score: {f1}')
print(f'Matthews Correlation Coefficient: {mcc}')

# Model-based feature importance, averaged across the per-sample best estimators
per_model_importances = [fitted.feature_importances_ for fitted in best_models]
feature_importances = np.mean(per_model_importances, axis=0)
# Names of the columns that survived the variance filter
kept_mask = selector.get_support()
feature_names = X_terms.columns[kept_mask]
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the 20 highest-ranked features
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importance_df.head(20), x='Importance', y='Feature', ax=ax)
ax.set_title('Top 20 Features by Importance')
plt.show()

# TODO: KEGG pathway mapping of the selected features is not implemented yet.
# It requires a project-specific KEGG database or mapping function
# (placeholder idea: a `map_to_kegg(features)` helper).

# F1 Score vs. MCC comparison chart
fig, ax = plt.subplots(figsize=(8, 5))
metric_labels = ['F1 Score', 'MCC']
metric_values = [f1, mcc]
ax.bar(metric_labels, metric_values, color=['blue', 'green'])
ax.set_ylabel('Score')
ax.set_title('F1 Score vs. MCC')
plt.show()
