In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, f1_score, matthews_corrcoef, log_loss, mean_squared_error, balanced_accuracy_score
from math import sqrt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.utils.class_weight import compute_class_weight

def matthews(y_true, y_pred):
    """
    Calculate the Matthews Correlation Coefficient and other metrics.
    """
    if type(y_true) == pd.Series:
        y_true = y_true.values

    P = len([x for x in y_true if x == 1])
    N = len([x for x in y_true if x == 0])

    Tp, Fp = 0, 0
    for i in range(len(y_true)):
        if y_true[i] == 1 and y_pred[i] == 1: Tp += 1
        elif y_true[i] == 0 and y_pred[i] == 1: Fp += 1

    Tn = N - Fp
    Fn = P - Tp

    try:
        mcc = (Tp * Tn - Fp * Fn) / sqrt(
            (Tn + Fn) * (Tn + Fp) * (Tp + Fn) * (Tp + Fp))
    except ZeroDivisionError:
        mcc = 0

    return (mcc, f" \n \
    P: {P:_} \n \
    Tp: {Tp:_} \n \
    Fp: {Fp:_} \n \
    N: {N:_} \n \
    Tn: {Tn:_} \n \
    Fn: {Fn:_}")


In [17]:
df = pd.read_csv("../data/processed/model_data.csv", index_col=0)
df.head()

Unnamed: 0_level_0,staining,Genome Length (bp),Jumbophage,molGC (%),Number CDS,Positive Strand (%),Negative Strand (%),Coding Capacity (%),tRNAs,Molecule_DNA,Molecule_RNA,Molecule_ss-DNA,Molecule_ss-RNA
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
MN335248,negative,7045,0,60.298,13,84.615385,15.384615,88.828957,0,0,0,1,0
MK250029,negative,540217,1,25.796,830,47.108434,52.891566,68.324951,30,1,0,0,0
MK250028,negative,550053,1,26.012,859,52.270081,47.729919,69.188424,29,1,0,0,0
MK250027,negative,551627,1,26.022,860,53.023256,46.976744,69.318761,33,1,0,0,0
MK250026,negative,550702,1,26.02,859,53.201397,46.798603,69.363285,33,1,0,0,0


In [18]:
df['staining'].value_counts()

staining
negative    7932
positive    3224
Name: count, dtype: int64

In [19]:
# Features (independent variables)
features = ['Genome Length (bp)', 'Jumbophage', 'molGC (%)', 'Number CDS',
            'Positive Strand (%)', 'Negative Strand (%)', 'Coding Capacity (%)',
            'tRNAs', 'Molecule_DNA', 'Molecule_RNA', 'Molecule_ss-DNA', 'Molecule_ss-RNA']

# Target variable (dependent variable)
target = 'staining'

In [20]:
# Extract features and target
X = df[features]
y = df[target]

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic regression model

In [21]:
# Train a logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [22]:
# Predict probabilities on the test set
y_pred_proba = logreg.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Predict class labels for F1 score calculation
y_pred = logreg.predict(X_test)

# Encode labels for mean squared error calculation
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)
y_pred_encoded = label_encoder.transform(y_pred)


In [23]:
# Calculate F1 score
positive_label = 'positive'
f1score = f1_score(y_test, y_pred, pos_label=positive_label)

# Calculate log loss using predicted probabilities
logloss = log_loss(y_test, y_pred_proba)

# Calculate mean squared error using encoded labels
mse = mean_squared_error(y_test_encoded, y_pred_encoded)

# Calculate accuracy
accuracy = accuracy_score(y_test_encoded, y_pred_encoded)

# Calculate balanced accuracy
balanced_accuracy = balanced_accuracy_score(y_test_encoded, y_pred_encoded)

# Calculate Matthews Correlation Coefficient (MCC)
mcc, mcc_details = matthews(y_test_encoded, y_pred_encoded)

# Generate the classification report
classification_report_str = classification_report(y_test_encoded, y_pred_encoded)

# Print the evaluation metrics
print('Accuracy:', accuracy)
print('Balanced Accuracy:', balanced_accuracy)
print('Log Loss:', logloss)  # Use the calculated log loss
print('Mean Squared Error:', mse)
print()
print('F1 Score:', f1score)
print('Matthews Correlation Coefficient:', mcc)
print('Matthews Correlation Coefficient Details:', mcc_details)
print()
print('Classification Report:\n', classification_report_str)


Accuracy: 0.7450716845878136
Balanced Accuracy: 0.5717191836799154
Log Loss: 0.5591185876984811
Mean Squared Error: 0.25492831541218636

F1 Score: 0.2751592356687898
Matthews Correlation Coefficient: 0.2575572836055283
Matthews Correlation Coefficient Details:  
     P: 634 
     Tp: 108 
     Fp: 43 
     N: 1_598 
     Tn: 1_555 
     Fn: 526

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.97      0.85      1598
           1       0.72      0.17      0.28       634

    accuracy                           0.75      2232
   macro avg       0.73      0.57      0.56      2232
weighted avg       0.74      0.75      0.68      2232



# Random Forest

In [24]:
# Train a logistic regression model
logreg = RandomForestClassifier()
logreg.fit(X_train, y_train)

In [25]:
# Predict probabilities on the test set
y_pred_proba = logreg.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Predict class labels for F1 score calculation
y_pred = logreg.predict(X_test)

# Encode labels for mean squared error calculation
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)
y_pred_encoded = label_encoder.transform(y_pred)

In [26]:
# Calculate F1 score
positive_label = 'positive'
f1score = f1_score(y_test, y_pred, pos_label=positive_label)

# Calculate log loss using predicted probabilities
logloss = log_loss(y_test, y_pred_proba)

# Calculate mean squared error using encoded labels
mse = mean_squared_error(y_test_encoded, y_pred_encoded)

# Calculate accuracy
accuracy = accuracy_score(y_test_encoded, y_pred_encoded)

# Calculate balanced accuracy
balanced_accuracy = balanced_accuracy_score(y_test_encoded, y_pred_encoded)

# Calculate Matthews Correlation Coefficient (MCC)
mcc, mcc_details = matthews(y_test_encoded, y_pred_encoded)

# Generate the classification report
classification_report_str = classification_report(y_test_encoded, y_pred_encoded)

# Print the evaluation metrics
print('Accuracy:', accuracy)
print('Balanced Accuracy:', balanced_accuracy)
print('Log Loss:', logloss)  # Use the calculated log loss
print('Mean Squared Error:', mse)
print()
print('F1 Score:', f1score)
print('Matthews Correlation Coefficient:', mcc)
print('Matthews Correlation Coefficient Details:', mcc_details)
print()
print('Classification Report:\n', classification_report_str)

Accuracy: 0.9543010752688172
Balanced Accuracy: 0.9290734080060643
Log Loss: 0.14200359557742584
Mean Squared Error: 0.0456989247311828

F1 Score: 0.9154228855721394
Matthews Correlation Coefficient: 0.8864261377564321
Matthews Correlation Coefficient Details:  
     P: 634 
     Tp: 552 
     Fp: 20 
     N: 1_598 
     Tn: 1_578 
     Fn: 82

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97      1598
           1       0.97      0.87      0.92       634

    accuracy                           0.95      2232
   macro avg       0.96      0.93      0.94      2232
weighted avg       0.95      0.95      0.95      2232



# Random forest with weights

In [35]:
# Features (independent variables)
features = ['Genome Length (bp)', 'Jumbophage', 'molGC (%)', 'Number CDS',
            'Positive Strand (%)', 'Negative Strand (%)', 'Coding Capacity (%)',
            'tRNAs', 'Molecule_DNA', 'Molecule_RNA', 'Molecule_ss-DNA', 'Molecule_ss-RNA']

# Target variable (dependent variable)
target = 'staining'
X = df[features]
y = df[target]

# Ensure y only contains valid labels ('negative' and 'positive')
y = y[(y == 'negative') | (y == 'positive')]

# Encode the target variable 'staining' to numerical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Calculate class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

# Train a random forest model with class weighting
logreg = RandomForestClassifier(class_weight={0: class_weights[0], 1: class_weights[1]})
logreg.fit(X_train, y_train)

# Predict probabilities on the test set
y_pred_proba = logreg.predict_proba(X_test)[:, 1]

# Predict class labels for F1 score calculation
y_pred = logreg.predict(X_test)

# Calculate evaluation metrics
f1score = f1_score(y_test, y_pred, pos_label=1)  # Positive class is 1 after encoding
logloss = log_loss(y_test, y_pred_proba)
mse = mean_squared_error(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

# Generate the classification report
classification_report_str = classification_report(y_test, y_pred)

# Print the evaluation metrics
print('Accuracy:', accuracy)
print('Balanced Accuracy:', balanced_accuracy)
print('Log Loss:', logloss)  # Use the calculated log loss
print('Mean Squared Error:', mse)
print()
print('F1 Score:', f1score)
print('Matthews Correlation Coefficient:', mcc)
print('Matthews Correlation Coefficient Details:', mcc_details)
print()
print('Classification Report:\n', classification_report_str)

Accuracy: 0.9551971326164874
Balanced Accuracy: 0.9296991902338492
Log Loss: 0.1677458489520838
Mean Squared Error: 0.044802867383512544

F1 Score: 0.9169435215946844
Matthews Correlation Coefficient: 0.8864261377564321
Matthews Correlation Coefficient Details:  
     P: 634 
     Tp: 552 
     Fp: 20 
     N: 1_598 
     Tn: 1_578 
     Fn: 82

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97      1598
           1       0.97      0.87      0.92       634

    accuracy                           0.96      2232
   macro avg       0.96      0.93      0.94      2232
weighted avg       0.96      0.96      0.95      2232



# Feature selection

In [96]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

model = RandomForestClassifier()
sfm = SelectFromModel(model, threshold='mean')
fit = sfm.fit(X_train, y_train)
selected_features_sfm = X_train.columns[fit.get_support()]


In [97]:
selected_features_sfm

Index(['molGC (%)', 'Positive Strand (%)', 'Negative Strand (%)',
       'Coding Capacity (%)', 'tRNAs'],
      dtype='object')