In [22]:
import pandas as pd
# from graph_part import train_test_validation_split, stratified_k_fold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, f1_score, matthews_corrcoef, log_loss, mean_squared_error, balanced_accuracy_score
from math import sqrt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.utils.class_weight import compute_class_weight

def matthews(y_true, y_pred):
    """
    Calculate the Matthews Correlation Coefficient and other metrics.
    """
    if type(y_true) == pd.Series:
        y_true = y_true.values

    P = len([x for x in y_true if x == 1])
    N = len([x for x in y_true if x == 0])

    Tp, Fp = 0, 0
    for i in range(len(y_true)):
        if y_true[i] == 1 and y_pred[i] == 1: Tp += 1
        elif y_true[i] == 0 and y_pred[i] == 1: Fp += 1

    Tn = N - Fp
    Fn = P - Tp

    try:
        mcc = (Tp * Tn - Fp * Fn) / sqrt(
            (Tn + Fn) * (Tn + Fp) * (Tp + Fn) * (Tp + Fp))
    except ZeroDivisionError:
        mcc = 0

    return (mcc, f" \n \
    P: {P:_} \n \
    Tp: {Tp:_} \n \
    Fp: {Fp:_} \n \
    N: {N:_} \n \
    Tn: {Tn:_} \n \
    Fn: {Fn:_}")


In [26]:
df = pd.read_csv("../data/processed/model_data.csv")

print(df.shape)
df.head()

(11156, 14)


Unnamed: 0,Accession,staining,Genome Length (bp),Jumbophage,molGC (%),Number CDS,Positive Strand (%),Negative Strand (%),Coding Capacity (%),tRNAs,Molecule_DNA,Molecule_RNA,Molecule_ss-DNA,Molecule_ss-RNA
0,MN335248,negative,7045,False,60.298,13,84.615385,15.384615,88.828957,0,False,False,True,False
1,MK250029,negative,540217,True,25.796,830,47.108434,52.891566,68.324951,30,True,False,False,False
2,MK250028,negative,550053,True,26.012,859,52.270081,47.729919,69.188424,29,True,False,False,False
3,MK250027,negative,551627,True,26.022,860,53.023256,46.976744,69.318761,33,True,False,False,False
4,MK250026,negative,550702,True,26.02,859,53.201397,46.798603,69.363285,33,True,False,False,False


In [None]:
df['staining'].value_counts()

staining
negative    7932
positive    3224
Name: count, dtype: int64

Checking why df and df2 are different

In [23]:
df2 = pd.read_csv("../data/processed/model_data2.csv", index_col=0)

print(df2.shape)
df2.head()
print(df2.columns)

(11154, 15)
Index(['staining', 'genome_length', 'jumbophage', 'gc_%', 'trna_count',
       'cds_number', 'coding_capacity', 'positive_strand_%',
       'negative_strand_%', 'molecule_type_ss-DNA', 'molecule_type_DNA',
       'molecule_type_RNA', 'molecule_type_ss-RNA', 'topology_circular',
       'topology_linear'],
      dtype='object')


In [30]:
not_in_df = df[~df['Accession'].isin(df2['id'])]
not_in_df


Unnamed: 0,Accession,staining,Genome Length (bp),Jumbophage,molGC (%),Number CDS,Positive Strand (%),Negative Strand (%),Coding Capacity (%),tRNAs,Molecule_DNA,Molecule_RNA,Molecule_ss-DNA,Molecule_ss-RNA
11080,MPU46938,positive,15664,False,69.344,25,100.0,0.0,91.183606,0,True,False,False,False
11122,L67CG,positive,22195,False,35.99,39,53.846154,46.153846,92.507321,0,True,False,False,False
11126,BL5GENM,positive,2435,False,33.265,4,100.0,0.0,51.252567,0,True,False,False,False


# Homology partitioning (Graph-Part)

In [None]:
sequences = pd.read_csv('../data/interim/feature_engineering/feature_engineering_genomes.csv')
sequences = sequences[['id', 'sequence']]
df.reset_index()
df = df.merge(sequences, left_on='Accession', right_on = 'id', how='left')
df.head()

Unnamed: 0,staining,Genome Length (bp),Jumbophage,molGC (%),Number CDS,Positive Strand (%),Negative Strand (%),Coding Capacity (%),tRNAs,Molecule_DNA,Molecule_RNA,Molecule_ss-DNA,Molecule_ss-RNA,id,sequence
0,negative,7045,False,60.298,13,84.615385,15.384615,88.828957,0,False,False,True,False,MN335248,AGTACCGCCCGAATTTCGCAGCAACCCAACCGACGCAAGCCCAACC...
1,negative,540217,True,25.796,830,47.108434,52.891566,68.324951,30,True,False,False,False,MK250029,GGACAAAGTTTAAAATCAAGAATTGATAGAAAAACATTTAATAGCA...
2,negative,550053,True,26.012,859,52.270081,47.729919,69.188424,29,True,False,False,False,MK250028,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...
3,negative,551627,True,26.022,860,53.023256,46.976744,69.318761,33,True,False,False,False,MK250027,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...
4,negative,550702,True,26.02,859,53.201397,46.798603,69.363285,33,True,False,False,False,MK250026,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...


In [25]:
sequences = df['sequence'].to_numpy()
labels = df['staining'].to_numpy()
# priority = df['experimental'].to_numpy()
train_idx, test_idx, valid_idx = train_test_validation_split(sequences, 
                                                             labels=labels,  
                                                             alignment_mode='mmseqs2', 
                                                             threads = 2,
                                                             threshold = 0.3,
                                                             test_size = 0.15,
                                                             valid_size = 0.05
                                                            )

temp/seq_db exists and will be overwritten
createdb --dbtype 1 graphpart_api.fasta.tmp temp/seq_db 

MMseqs Version:       	13.45111
Database type         	1
Shuffle input database	true
Createdb mode         	0
Write lookup file     	1
Offset of numeric ids 	0
Compressed            	0
Verbosity             	3

Converting sequences
[=
Time for merging to seq_db_h: 0h 0m 0s 235ms
Time for merging to seq_db: 0h 0m 1s 715ms
Database type: Aminoacid
Time for processing: 0h 0m 7s 151ms


ln: temp/pref: File exists


KeyError: 'full'

# Homology partitioning (mmseq)

In [10]:
column_names = ["Cluster", "Sequences"]

df = pd.read_csv("../data/interim/clustering/clusterRes_cluster.tsv", sep="\t", header=None, names=column_names)
df.head()

Unnamed: 0,Cluster,Sequences
0,MN335248.1,MN335248.1
1,MK250029.1,MK250029.1
2,MK250028.1,MK250028.1
3,MK250028.1,MK250024.1
4,MK250028.1,MK250020.1


In [15]:
grouped = df.groupby('Cluster')['Sequences'].agg(list).reset_index()
grouped

Unnamed: 0,Cluster,Sequences
0,AB002632.1,"[AB002632.1, NC_001956.1, KC357596.1, NC_02156..."
1,AB009866.2,"[AB009866.2, NC_002321.1]"
2,AB012574.1,"[AB012574.1, NC_005949.1]"
3,AB043679.1,"[AB043679.1, NC_002363.1]"
4,AB044554.1,"[AB044554.1, NC_002486.1]"
...,...,...
5933,OR413342.1,[OR413342.1]
5934,OR413344.1,[OR413344.1]
5935,OR413347.1,[OR413347.1]
5936,OR437326.1,[OR437326.1]


# Traditional trainining and testing split

In [16]:
# Features (independent variables)
features = ['Genome Length (bp)', 'Jumbophage', 'molGC (%)', 'Number CDS',
            'Positive Strand (%)', 'Negative Strand (%)', 'Coding Capacity (%)',
            'tRNAs', 'Molecule_DNA', 'Molecule_RNA', 'Molecule_ss-DNA', 'Molecule_ss-RNA']

features = ['genome_length', 'jumbophage', 'gc_%',
       'trna_count', 'cds_number', 'coding_capacity', 'positive_strand_%',
       'negative_strand_%', 'molecule_type_ss-DNA', 'molecule_type_DNA',
       'molecule_type_RNA', 'molecule_type_ss-RNA', 'topology_circular','topology_linear']
# Target variable (dependent variable)
target = 'staining'

In [17]:
# Extract features and target
X = df[features]
y = df[target]

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic regression model

In [18]:
# Train a logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [22]:
# Predict probabilities on the test set
y_pred_proba = logreg.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Predict class labels for F1 score calculation
y_pred = logreg.predict(X_test)

# Encode labels for mean squared error calculation
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)
y_pred_encoded = label_encoder.transform(y_pred)


In [23]:
# Calculate F1 score
positive_label = 'positive'
f1score = f1_score(y_test, y_pred, pos_label=positive_label)

# Calculate log loss using predicted probabilities
logloss = log_loss(y_test, y_pred_proba)

# Calculate mean squared error using encoded labels
mse = mean_squared_error(y_test_encoded, y_pred_encoded)

# Calculate accuracy
accuracy = accuracy_score(y_test_encoded, y_pred_encoded)

# Calculate balanced accuracy
balanced_accuracy = balanced_accuracy_score(y_test_encoded, y_pred_encoded)

# Calculate Matthews Correlation Coefficient (MCC)
mcc, mcc_details = matthews(y_test_encoded, y_pred_encoded)

# Generate the classification report
classification_report_str = classification_report(y_test_encoded, y_pred_encoded)

# Print the evaluation metrics
print('Accuracy:', accuracy)
print('Balanced Accuracy:', balanced_accuracy)
print('Log Loss:', logloss)  # Use the calculated log loss
print('Mean Squared Error:', mse)
print()
print('F1 Score:', f1score)
print('Matthews Correlation Coefficient:', mcc)
print('Matthews Correlation Coefficient Details:', mcc_details)
print()
print('Classification Report:\n', classification_report_str)


Accuracy: 0.7450716845878136
Balanced Accuracy: 0.5717191836799154
Log Loss: 0.5591185876984811
Mean Squared Error: 0.25492831541218636

F1 Score: 0.2751592356687898
Matthews Correlation Coefficient: 0.2575572836055283
Matthews Correlation Coefficient Details:  
     P: 634 
     Tp: 108 
     Fp: 43 
     N: 1_598 
     Tn: 1_555 
     Fn: 526

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.97      0.85      1598
           1       0.72      0.17      0.28       634

    accuracy                           0.75      2232
   macro avg       0.73      0.57      0.56      2232
weighted avg       0.74      0.75      0.68      2232



# Random Forest

In [19]:
# Train a logistic regression model
logreg = RandomForestClassifier()
logreg.fit(X_train, y_train)

In [20]:
# Predict probabilities on the test set
y_pred_proba = logreg.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Predict class labels for F1 score calculation
y_pred = logreg.predict(X_test)

# Encode labels for mean squared error calculation
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)
y_pred_encoded = label_encoder.transform(y_pred)

In [21]:
# Calculate F1 score
positive_label = 'positive'
f1score = f1_score(y_test, y_pred, pos_label=positive_label)

# Calculate log loss using predicted probabilities
logloss = log_loss(y_test, y_pred_proba)

# Calculate mean squared error using encoded labels
mse = mean_squared_error(y_test_encoded, y_pred_encoded)

# Calculate accuracy
accuracy = accuracy_score(y_test_encoded, y_pred_encoded)

# Calculate balanced accuracy
balanced_accuracy = balanced_accuracy_score(y_test_encoded, y_pred_encoded)

# Calculate Matthews Correlation Coefficient (MCC)
mcc, mcc_details = matthews(y_test_encoded, y_pred_encoded)

# Generate the classification report
classification_report_str = classification_report(y_test_encoded, y_pred_encoded)

# Print the evaluation metrics
print('Accuracy:', accuracy)
print('Balanced Accuracy:', balanced_accuracy)
print('Log Loss:', logloss)  # Use the calculated log loss
print('Mean Squared Error:', mse)
print()
print('F1 Score:', f1score)
print('Matthews Correlation Coefficient:', mcc)
print('Matthews Correlation Coefficient Details:', mcc_details)
print()
print('Classification Report:\n', classification_report_str)

Accuracy: 0.950694755714926
Balanced Accuracy: 0.9219307992222207
Log Loss: 0.15221563976229685
Mean Squared Error: 0.049305244285073956

F1 Score: 0.9064625850340136
Matthews Correlation Coefficient: 0.8758364686130411
Matthews Correlation Coefficient Details:  
     P: 622 
     Tp: 533 
     Fp: 21 
     N: 1_609 
     Tn: 1_588 
     Fn: 89

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97      1609
           1       0.96      0.86      0.91       622

    accuracy                           0.95      2231
   macro avg       0.95      0.92      0.94      2231
weighted avg       0.95      0.95      0.95      2231



# Random forest with weights

In [35]:
# Features (independent variables)
features = ['Genome Length (bp)', 'Jumbophage', 'molGC (%)', 'Number CDS',
            'Positive Strand (%)', 'Negative Strand (%)', 'Coding Capacity (%)',
            'tRNAs', 'Molecule_DNA', 'Molecule_RNA', 'Molecule_ss-DNA', 'Molecule_ss-RNA']

# Target variable (dependent variable)
target = 'staining'
X = df[features]
y = df[target]

# Ensure y only contains valid labels ('negative' and 'positive')
y = y[(y == 'negative') | (y == 'positive')]

# Encode the target variable 'staining' to numerical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Calculate class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

# Train a random forest model with class weighting
logreg = RandomForestClassifier(class_weight={0: class_weights[0], 1: class_weights[1]})
logreg.fit(X_train, y_train)

# Predict probabilities on the test set
y_pred_proba = logreg.predict_proba(X_test)[:, 1]

# Predict class labels for F1 score calculation
y_pred = logreg.predict(X_test)

# Calculate evaluation metrics
f1score = f1_score(y_test, y_pred, pos_label=1)  # Positive class is 1 after encoding
logloss = log_loss(y_test, y_pred_proba)
mse = mean_squared_error(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

# Generate the classification report
classification_report_str = classification_report(y_test, y_pred)

# Print the evaluation metrics
print('Accuracy:', accuracy)
print('Balanced Accuracy:', balanced_accuracy)
print('Log Loss:', logloss)  # Use the calculated log loss
print('Mean Squared Error:', mse)
print()
print('F1 Score:', f1score)
print('Matthews Correlation Coefficient:', mcc)
print('Matthews Correlation Coefficient Details:', mcc_details)
print()
print('Classification Report:\n', classification_report_str)

Accuracy: 0.9551971326164874
Balanced Accuracy: 0.9296991902338492
Log Loss: 0.1677458489520838
Mean Squared Error: 0.044802867383512544

F1 Score: 0.9169435215946844
Matthews Correlation Coefficient: 0.8864261377564321
Matthews Correlation Coefficient Details:  
     P: 634 
     Tp: 552 
     Fp: 20 
     N: 1_598 
     Tn: 1_578 
     Fn: 82

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97      1598
           1       0.97      0.87      0.92       634

    accuracy                           0.96      2232
   macro avg       0.96      0.93      0.94      2232
weighted avg       0.96      0.96      0.95      2232



# Feature selection

In [96]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

model = RandomForestClassifier()
sfm = SelectFromModel(model, threshold='mean')
fit = sfm.fit(X_train, y_train)
selected_features_sfm = X_train.columns[fit.get_support()]


In [97]:
selected_features_sfm

Index(['molGC (%)', 'Positive Strand (%)', 'Negative Strand (%)',
       'Coding Capacity (%)', 'tRNAs'],
      dtype='object')