In [36]:
import pandas as pd
# from graph_part import train_test_validation_split, stratified_k_fold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, f1_score, matthews_corrcoef, log_loss, mean_squared_error, balanced_accuracy_score
from math import sqrt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.utils.class_weight import compute_class_weight
import networkx as nx
from networkx.algorithms import community
import random

def matthews(y_true, y_pred):
    """
    Calculate the Matthews Correlation Coefficient and other metrics.
    """
    if type(y_true) == pd.Series:
        y_true = y_true.values

    P = len([x for x in y_true if x == 1])
    N = len([x for x in y_true if x == 0])

    Tp, Fp = 0, 0
    for i in range(len(y_true)):
        if y_true[i] == 1 and y_pred[i] == 1: Tp += 1
        elif y_true[i] == 0 and y_pred[i] == 1: Fp += 1

    Tn = N - Fp
    Fn = P - Tp

    try:
        mcc = (Tp * Tn - Fp * Fn) / sqrt(
            (Tn + Fn) * (Tn + Fp) * (Tp + Fn) * (Tp + Fp))
    except ZeroDivisionError:
        mcc = 0

    return (mcc, f" \n \
    P: {P:_} \n \
    Tp: {Tp:_} \n \
    Fp: {Fp:_} \n \
    N: {N:_} \n \
    Tn: {Tn:_} \n \
    Fn: {Fn:_}")


In [37]:
df = pd.read_csv("../data/processed/model_data.csv")

print(df.shape)
df.head()

(11156, 14)


Unnamed: 0,Accession,staining,Genome Length (bp),Jumbophage,molGC (%),Number CDS,Positive Strand (%),Negative Strand (%),Coding Capacity (%),tRNAs,Molecule_DNA,Molecule_RNA,Molecule_ss-DNA,Molecule_ss-RNA
0,MN335248,negative,7045,False,60.298,13,84.615385,15.384615,88.828957,0,False,False,True,False
1,MK250029,negative,540217,True,25.796,830,47.108434,52.891566,68.324951,30,True,False,False,False
2,MK250028,negative,550053,True,26.012,859,52.270081,47.729919,69.188424,29,True,False,False,False
3,MK250027,negative,551627,True,26.022,860,53.023256,46.976744,69.318761,33,True,False,False,False
4,MK250026,negative,550702,True,26.02,859,53.201397,46.798603,69.363285,33,True,False,False,False


In [3]:
df['staining'].value_counts()

staining
negative    7932
positive    3224
Name: count, dtype: int64

Checking why df and df2 are different

In [58]:
df2 = pd.read_csv("../data/processed/model_data2.csv", index_col=0)

print(df2.shape)
df2.head()
print(df2.columns)

(11154, 17)
Index(['id', 'genome_length', 'gc_%', 'sequence', 'reverse_complement',
       'cds_number', 'positive_strand_%', 'negative_strand_%',
       'coding_capacity', 'trna_count', 'molecule_type_DNA',
       'molecule_type_RNA', 'molecule_type_ss-DNA', 'molecule_type_ss-RNA',
       'jumbophage', 'topology_circular', 'topology_linear'],
      dtype='object')


In [5]:
not_in_df = df[~df['Accession'].isin(df2['id'])]
not_in_df

Unnamed: 0,Accession,staining,Genome Length (bp),Jumbophage,molGC (%),Number CDS,Positive Strand (%),Negative Strand (%),Coding Capacity (%),tRNAs,Molecule_DNA,Molecule_RNA,Molecule_ss-DNA,Molecule_ss-RNA
11080,MPU46938,positive,15664,False,69.344,25,100.0,0.0,91.183606,0,True,False,False,False
11122,L67CG,positive,22195,False,35.99,39,53.846154,46.153846,92.507321,0,True,False,False,False
11126,BL5GENM,positive,2435,False,33.265,4,100.0,0.0,51.252567,0,True,False,False,False


### Homology partitioning (Graph-Part)

In [6]:
'''sequences = pd.read_csv('../data/interim/feature_engineering/feature_engineering_genomes.csv')
sequences = sequences[['id', 'sequence']]
df.reset_index()
df = df.merge(sequences, left_on='Accession', right_on = 'id', how='left')
df.head()

sequences = df['sequence'].to_numpy()
labels = df['staining'].to_numpy()
# priority = df['experimental'].to_numpy()
train_idx, test_idx, valid_idx = train_test_validation_split(sequences, 
                                                             labels=labels,  
                                                             alignment_mode='mmseqs2', 
                                                             threads = 2,
                                                             threshold = 0.3,
                                                             test_size = 0.15,
                                                             valid_size = 0.05
                                                            )'''

"sequences = pd.read_csv('../data/interim/feature_engineering/feature_engineering_genomes.csv')\nsequences = sequences[['id', 'sequence']]\ndf.reset_index()\ndf = df.merge(sequences, left_on='Accession', right_on = 'id', how='left')\ndf.head()\n\nsequences = df['sequence'].to_numpy()\nlabels = df['staining'].to_numpy()\n# priority = df['experimental'].to_numpy()\ntrain_idx, test_idx, valid_idx = train_test_validation_split(sequences, \n                                                             labels=labels,  \n                                                             alignment_mode='mmseqs2', \n                                                             threads = 2,\n                                                             threshold = 0.3,\n                                                             test_size = 0.15,\n                                                             valid_size = 0.05\n                                                            )"

### Homology partitioning (mmseq)

In [7]:
'''column_names = ["Cluster", "Sequences"]

df = pd.read_csv("../data/interim/clustering/clusterRes_cluster.tsv", sep="\t", header=None, names=column_names)
grouped = df.groupby('Cluster')['Sequences'].agg(list).reset_index()
grouped'''

'column_names = ["Cluster", "Sequences"]\n\ndf = pd.read_csv("../data/interim/clustering/clusterRes_cluster.tsv", sep="\t", header=None, names=column_names)\ngrouped = df.groupby(\'Cluster\')[\'Sequences\'].agg(list).reset_index()\ngrouped'

### Traditional trainining and testing split

In [8]:
'''# Features (independent variables)
features = ['Genome Length (bp)', 'Jumbophage', 'molGC (%)', 'Number CDS',
            'Positive Strand (%)', 'Negative Strand (%)', 'Coding Capacity (%)',
            'tRNAs', 'Molecule_DNA', 'Molecule_RNA', 'Molecule_ss-DNA', 'Molecule_ss-RNA']

features = ['genome_length', 'jumbophage', 'gc_%',
       'trna_count', 'cds_number', 'coding_capacity', 'positive_strand_%',
       'negative_strand_%', 'molecule_type_ss-DNA', 'molecule_type_DNA',
       'molecule_type_RNA', 'molecule_type_ss-RNA', 'topology_circular','topology_linear']
# Target variable (dependent variable)
target = 'staining'

# Extract features and target
X = df[features]
y = df[target]

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)'''

"# Features (independent variables)\nfeatures = ['Genome Length (bp)', 'Jumbophage', 'molGC (%)', 'Number CDS',\n            'Positive Strand (%)', 'Negative Strand (%)', 'Coding Capacity (%)',\n            'tRNAs', 'Molecule_DNA', 'Molecule_RNA', 'Molecule_ss-DNA', 'Molecule_ss-RNA']\n\nfeatures = ['genome_length', 'jumbophage', 'gc_%',\n       'trna_count', 'cds_number', 'coding_capacity', 'positive_strand_%',\n       'negative_strand_%', 'molecule_type_ss-DNA', 'molecule_type_DNA',\n       'molecule_type_RNA', 'molecule_type_ss-RNA', 'topology_circular','topology_linear']\n# Target variable (dependent variable)\ntarget = 'staining'\n\n# Extract features and target\nX = df[features]\ny = df[target]\n\n# Split the data into training and testing sets (80% train, 20% test)\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"

# Homology Partitioning with from MASH and NetworkX output

I will load the mash distances and use them to separate the DNA into communities. Two nodes will be used for validation: 934 (negative) and 469 (positive). These two nodes are taken from the NetworkX graph. The rest of the data will be used to train the model

In [38]:
# Assuming 'distances_df' is your DataFrame containing the distance and identity information
distances_df = pd.read_csv('../data/interim/clustering/mash_distances.csv')  # Load your CSV file

# Create a new graph
G = nx.Graph()

# Add edges to the graph
for index, row in distances_df.iterrows():
    # Inverse the identity value to use it as 'distance' if needed
    # Or you can directly use the identity value depending on how you want to cluster
    distance = 1 - row['Identity'] / 100  # Convert percentage to a float value between 0 and 1
    G.add_edge(row['Genome1'], row['Genome2'], weight=distance)

# Now G contains your graph with weighted edges based on sequence identity
# You can now use various clustering or community detection algorithms provided by NetworkX

# For example, to get clusters using the greedy modularity community detection algorithm:

communities = community.greedy_modularity_communities(G, weight='weight')
# This will return a list of sets, where each set is a community of nodes (sequences)

print(len(communities))
# To see the communities:
counter = 0
for i, community_set in enumerate(communities, start=1):
    print(f"Community {i}: {community_set}")
    counter += 1
    if counter == 5:
        break

1116
Community 1: frozenset({'MK250021', 'MK250025', 'MK250023', 'MK250027', 'MK250026', 'MK250028', 'MK250024', 'MK250022', 'MK250029', 'MN335248'})
Community 2: frozenset({'MK250020', 'MG592402', 'MK250018', 'MK250019', 'MK250017', 'MG592455', 'MG592615', 'MG592427', 'MK250016', 'MK250015'})
Community 3: frozenset({'AF334111', 'OR204651', 'HM208303', 'OR413347', 'OR204650', 'OR204649', 'MG592394', 'OR413345', 'OR204652', 'HM066936'})
Community 4: frozenset({'OQ451773', 'OR290970', 'OR413344', 'MN923124', 'MZ892903', 'AY129330', 'OQ745667', 'ON212266', 'OR413342', 'ON470617'})
Community 5: frozenset({'AY129332', 'JX262223', 'AY129336', 'KY385384', 'JX262224', 'JX262221', 'JX262225', 'JX262222', 'KY385381', 'EU826469'})


In [39]:
# Let's assume 'communities' is the list of frozensets you received from the output of the community detection algorithm
# Initialize the separate variables for storing removed communities
removed_communities = []

# Community numbers to remove (1-indexed as per your enumeration, so subtract 1 for 0-indexed Python lists)
communities_to_remove = [934 - 1, 469 - 1]  # -1 because lists are 0-indexed in Python

# Sort the list in reverse so removing by index doesn't affect the order of unvisited items
for community_index in sorted(communities_to_remove, reverse=True):
    # Remove the community and store it in the 'removed_communities' list if it exists
    try:
        removed_communities.append(communities.pop(community_index)) # pop method removes by index
    except IndexError as e:
        print(f"No community at index: {community_index + 1}")  # Add 1 to match your enumeration

# At this point, 'communities' variable has communities 934 and 469 removed
# 'removed_communities' contains the removed communities

# To verify the result, you can print the remaining and removed communities
#  print("Remaining communities:")
# for i, community_set in enumerate(communities, start=1):
#     print(f"Community {i}: {community_set}")

print("\nRemoved communities:")
for i, community_set in enumerate(removed_communities, start=1):
    print(f"Removed Community {i}: {community_set}")


Removed communities:
Removed Community 1: frozenset({'KX349298', 'KX349299', 'KX349301', 'KX349302', 'KX349306', 'KX349300', 'KX349297', 'KX349305', 'KX349304', 'KX349303'})
Removed Community 2: frozenset({'MN871496', 'MN871497', 'MN871495', 'MN871494', 'MN871498', 'MN871501', 'MN871499', 'MN871500', 'MN871502', 'MN871503'})


Making sure the communities were removed

In [40]:
# Function to find to which community (index) a given entry belongs
def find_community(entry, communities_list):
    for index, community in enumerate(communities_list):
        if entry in community:
            return f"Community {index + 1}"  # +1 because index in Python starts at 0
    return "Entry not found in any community"


# Example usage:
entry = 'KX349302' # Replace this with any entry you're searching for
community2 = find_community(entry, communities)
print(f"The entry {entry} belongs to {community2}")


The entry KX349302 belongs to Entry not found in any community


In [41]:
# Community to find the index of
community_to_find = frozenset({'MN871494', 'MN871501', 'MN871498', 'MN871495', 'MN871497', 'MN871503', 'MN871496', 'MN871500', 'MN871499', 'MN871502'})

# Find the index of the community
community_index = next((index for index, community in enumerate(communities) if community == community_to_find), -1)

community_index

-1

# Obtain training and testing data based on communities

In [13]:
df['Accession'].head()

0    MN335248
1    MK250029
2    MK250028
3    MK250027
4    MK250026
Name: Accession, dtype: object

In [42]:
# Create a mapping dictionary from accession code to community index
accession_to_community = {}
for i, community in enumerate(communities):
    for accession in community:
        accession_to_community[accession] = i

# Map the 'Accession' column to a new 'Community' column
df['Community'] = df['Accession'].map(accession_to_community)
# df[['Accession','Community']].head()

# Now, decide how to split the communities into train and test sets
# For simplicity, let's say you want 80% of communities in the training set
# First, let's decide which community indices will go into training

# Set the random seed for reproducibility
random.seed(42)

# Shuffle the community indices to randomly select 80% for training
community_indices = list(range(len(communities)))
random.shuffle(community_indices)
train_community_count = int(0.8 * len(community_indices))
train_community_indices = community_indices[:train_community_count]
test_community_indices = community_indices[train_community_count:]

# Now, we can split the dataframe based on these indices
train_df = df[df['Community'].isin(train_community_indices)]
test_df = df[df['Community'].isin(test_community_indices)]

# The 'train_df' dataframe contains the training set, and 'test_df' contains the test set


### Checking the split
# Collect community indices in the training and test set
train_communities = set(train_df['Community'].unique())
test_communities = set(test_df['Community'].unique())

# Check for intersection
common_communities = train_communities.intersection(test_communities)

# If the intersection is empty, then the split is correct; otherwise, there's an issue
if common_communities:
    print(f"Error: Communities with indices {common_communities} appear in both training and test sets.")
else:
    print("Success: All communities are exclusively in either the training or the test set.")

Success: All communities are exclusively in either the training or the test set.


In [43]:
# Set 'Accession' as the index for both train_df and test_df
train_df = train_df.set_index('Accession')
test_df = test_df.set_index('Accession')

# Drop the 'Community' column from both dataframes
train_df = train_df.drop(columns='Community')
test_df = test_df.drop(columns='Community')


In [44]:
print(train_df['staining'].value_counts())
print(test_df['staining'].value_counts())

staining
negative    6320
positive    2583
Name: count, dtype: int64
staining
negative    1602
positive     628
Name: count, dtype: int64


In [45]:
train_df.columns

Index(['staining', 'Genome Length (bp)', 'Jumbophage', 'molGC (%)',
       'Number CDS', 'Positive Strand (%)', 'Negative Strand (%)',
       'Coding Capacity (%)', 'tRNAs', 'Molecule_DNA', 'Molecule_RNA',
       'Molecule_ss-DNA', 'Molecule_ss-RNA'],
      dtype='object')

In [46]:
features = ['Genome Length (bp)', 'Jumbophage', 'molGC (%)', 'Number CDS',
            'Positive Strand (%)', 'Negative Strand (%)', 'Coding Capacity (%)',
            'tRNAs', 'Molecule_DNA', 'Molecule_RNA', 'Molecule_ss-DNA', 'Molecule_ss-RNA']

# features = ['genome_length', 'jumbophage', 'gc_%',
#        'trna_count', 'cds_number', 'coding_capacity', 'positive_strand_%',
#        'negative_strand_%', 'molecule_type_ss-DNA', 'molecule_type_DNA',
#        'molecule_type_RNA', 'molecule_type_ss-RNA', 'topology_circular','topology_linear']
# Target variable (dependent variable)
target = 'staining'

# Extract features and target
X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]

# Logistic regression model

In [47]:
# Train a logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [48]:
# Predict probabilities on the test set
y_pred_proba = logreg.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Predict class labels for F1 score calculation
y_pred = logreg.predict(X_test)

# Encode labels for mean squared error calculation
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)
y_pred_encoded = label_encoder.transform(y_pred)


In [49]:
# Calculate F1 score
positive_label = 'positive'
f1score = f1_score(y_test, y_pred, pos_label=positive_label)

# Calculate log loss using predicted probabilities
logloss = log_loss(y_test, y_pred_proba)

# Calculate mean squared error using encoded labels
mse = mean_squared_error(y_test_encoded, y_pred_encoded)

# Calculate accuracy
accuracy = accuracy_score(y_test_encoded, y_pred_encoded)

# Calculate balanced accuracy
balanced_accuracy = balanced_accuracy_score(y_test_encoded, y_pred_encoded)

# Calculate Matthews Correlation Coefficient (MCC)
mcc, mcc_details = matthews(y_test_encoded, y_pred_encoded)

# Generate the classification report
classification_report_str = classification_report(y_test_encoded, y_pred_encoded)

# Print the evaluation metrics
print('Accuracy:', accuracy)
print('Balanced Accuracy:', balanced_accuracy)
print('Log Loss:', logloss)  # Use the calculated log loss
print('Mean Squared Error:', mse)
print()
print('F1 Score:', f1score)
print('Matthews Correlation Coefficient:', mcc)
print('Matthews Correlation Coefficient Details:', mcc_details)
print()
print('Classification Report:\n', classification_report_str)


Accuracy: 0.742152466367713
Balanced Accuracy: 0.5572035751488983
Log Loss: 0.5412084184515018
Mean Squared Error: 0.257847533632287

F1 Score: 0.22611036339165547
Matthews Correlation Coefficient: 0.23268058116078824
Matthews Correlation Coefficient Details:  
     P: 628 
     Tp: 84 
     Fp: 31 
     N: 1_602 
     Tn: 1_571 
     Fn: 544

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.98      0.85      1602
           1       0.73      0.13      0.23       628

    accuracy                           0.74      2230
   macro avg       0.74      0.56      0.54      2230
weighted avg       0.74      0.74      0.67      2230



# Random Forest

In [50]:
# Train a logistic regression model
logreg = RandomForestClassifier()
logreg.fit(X_train, y_train)

In [51]:
# Predict probabilities on the test set
y_pred_proba = logreg.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Predict class labels for F1 score calculation
y_pred = logreg.predict(X_test)

# Encode labels for mean squared error calculation
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)
y_pred_encoded = label_encoder.transform(y_pred)


In [52]:
# Calculate F1 score
positive_label = 'positive'
f1score = f1_score(y_test, y_pred, pos_label=positive_label)

# Calculate log loss using predicted probabilities
logloss = log_loss(y_test, y_pred_proba)

# Calculate mean squared error using encoded labels
mse = mean_squared_error(y_test_encoded, y_pred_encoded)

# Calculate accuracy
accuracy = accuracy_score(y_test_encoded, y_pred_encoded)

# Calculate balanced accuracy
balanced_accuracy = balanced_accuracy_score(y_test_encoded, y_pred_encoded)

# Calculate Matthews Correlation Coefficient (MCC)
mcc, mcc_details = matthews(y_test_encoded, y_pred_encoded)

# Generate the classification report
classification_report_str = classification_report(y_test_encoded, y_pred_encoded)

# Print the evaluation metrics
print('Accuracy:', accuracy)
print('Balanced Accuracy:', balanced_accuracy)
print('Log Loss:', logloss)  # Use the calculated log loss
print('Mean Squared Error:', mse)
print()
print('F1 Score:', f1score)
print('Matthews Correlation Coefficient:', mcc)
print('Matthews Correlation Coefficient Details:', mcc_details)
print()
print('Classification Report:\n', classification_report_str)

Accuracy: 0.9515695067264573
Balanced Accuracy: 0.9285347932918246
Log Loss: 0.20378862141388368
Mean Squared Error: 0.0484304932735426

F1 Score: 0.9105960264900663
Matthews Correlation Coefficient: 0.8787618472887772
Matthews Correlation Coefficient Details:  
     P: 628 
     Tp: 550 
     Fp: 30 
     N: 1_602 
     Tn: 1_572 
     Fn: 78

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.97      1602
           1       0.95      0.88      0.91       628

    accuracy                           0.95      2230
   macro avg       0.95      0.93      0.94      2230
weighted avg       0.95      0.95      0.95      2230



# Random forest with weights

In [54]:
# Features (independent variables)
features = ['Genome Length (bp)', 'Jumbophage', 'molGC (%)', 'Number CDS',
            'Positive Strand (%)', 'Negative Strand (%)', 'Coding Capacity (%)',
            'tRNAs', 'Molecule_DNA', 'Molecule_RNA', 'Molecule_ss-DNA', 'Molecule_ss-RNA']

# Target variable (dependent variable)
target = 'staining'
X = df[features]
y = df[target]

# Ensure y only contains valid labels ('negative' and 'positive')
y = y[(y == 'negative') | (y == 'positive')]

# Encode the target variable 'staining' to numerical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Calculate class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

# Train a random forest model with class weighting
logreg = RandomForestClassifier(class_weight={0: class_weights[0], 1: class_weights[1]})
logreg.fit(X_train, y_train)

# Predict probabilities on the test set
y_pred_proba = logreg.predict_proba(X_test)[:, 1]

# Predict class labels for F1 score calculation
y_pred = logreg.predict(X_test)

# Calculate evaluation metrics
f1score = f1_score(y_test, y_pred, pos_label=1)  # Positive class is 1 after encoding
logloss = log_loss(y_test, y_pred_proba)
mse = mean_squared_error(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

# Generate the classification report
classification_report_str = classification_report(y_test, y_pred)

# Print the evaluation metrics
print('Accuracy:', accuracy)
print('Balanced Accuracy:', balanced_accuracy)
print('Log Loss:', logloss)  # Use the calculated log loss
print('Mean Squared Error:', mse)
print()
print('F1 Score:', f1score)
print('Matthews Correlation Coefficient:', mcc)
print('Matthews Correlation Coefficient Details:', mcc_details)
print()
print('Classification Report:\n', classification_report_str)

Accuracy: 0.9560931899641577
Balanced Accuracy: 0.9298492200424031
Log Loss: 0.16953601294771706
Mean Squared Error: 0.04390681003584229

F1 Score: 0.9183333333333333
Matthews Correlation Coefficient: 0.8908453121244858
Matthews Correlation Coefficient Details:  
     P: 606 
     Tp: 534 
     Fp: 23 
     N: 1_614 
     Tn: 1_591 
     Fn: 72

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97      1598
           1       0.97      0.87      0.92       634

    accuracy                           0.96      2232
   macro avg       0.96      0.93      0.94      2232
weighted avg       0.96      0.96      0.96      2232



# Validation on removed communities

In [67]:
df[["Accession", "staining"]].head()

Unnamed: 0,Accession,staining
0,MN335248,negative
1,MK250029,negative
2,MK250028,negative
3,MK250027,negative
4,MK250026,negative


In [62]:
# Flatten the removed_communities frozensets into a list
removed_accessions = [item for community in removed_communities for item in community]

# Filter the original dataframe to get the validation set based on Accession being in removed_accessions
validation_df = df[df['Accession'].isin(removed_accessions)].copy().set_index('Accession')

# Prepare features and true labels from the validation set
X_validation = validation_df[features]
y_validation_true = validation_df[target]

# Encode 'staining' labels to numerical values using the same encoder used during training
y_validation_encoded = label_encoder.transform(y_validation_true)

# Predict the staining for the validation set and get probabilities for both classes
y_validation_pred_proba = logreg.predict_proba(X_validation)

# Obtain the predicted class labels (as strings, which you already have)
y_validation_pred = logreg.predict(X_validation)

# Since y_validation_pred contains strings, we don't need to inverse_transform it again
validation_df['Predicted Staining'] = y_validation_pred  # Direct assignment

# Now validation_df will have the predicted staining, add probability columns based on predict_proba output
validation_df['Probability Negative'] = y_validation_pred_proba[:, 0]  # Probability of 'negative'
validation_df['Probability Positive'] = y_validation_pred_proba[:, 1]  # Probability of 'positive'

# Find the entries that the model predicted incorrectly
incorrect_predictions = validation_df[validation_df[target] != validation_df['Predicted Staining']]

# Evaluate the model on the validation set using the same metrics as before
f1score_validation = f1_score(y_validation_encoded, label_encoder.transform(y_validation_pred), pos_label=label_encoder.transform([positive_label])[0])
logloss_validation = log_loss(y_validation_encoded, y_validation_pred_proba)
mse_validation = mean_squared_error(y_validation_encoded, label_encoder.transform(y_validation_pred))
accuracy_validation = accuracy_score(y_validation_encoded, label_encoder.transform(y_validation_pred))
balanced_accuracy_validation = balanced_accuracy_score(y_validation_encoded, label_encoder.transform(y_validation_pred))
classification_report_validation = classification_report(y_validation_encoded, label_encoder.transform(y_validation_pred))

# Print the evaluation metrics for the validation set
print('Validation Accuracy:', accuracy_validation)
print('Validation Balanced Accuracy:', balanced_accuracy_validation)
print('Validation Log Loss:', logloss_validation)
print('Validation Mean Squared Error:', mse_validation)
print()
print('Validation F1 Score:', f1score_validation)
print()
print('Validation Classification Report:\n', classification_report_validation)

# Print the incorrect predictions for review
if incorrect_predictions.empty:
    incorrect_pred_str = "There were no incorrect predictions\n"
else:
    # If it's not empty, convert the DataFrame to a string for the report
    incorrect_pred_str = f"Incorrect Predictions:\n{incorrect_predictions}\n"

# Print the incorrect predictions message
print(incorrect_pred_str)


Validation Accuracy: 1.0
Validation Balanced Accuracy: 1.0
Validation Log Loss: 0.12918411483878942
Validation Mean Squared Error: 0.0

Validation F1 Score: 1.0

Validation Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

There were no incorrect predictions



In [31]:
y_validation_pred

array(['positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative'],
      dtype=object)

In [77]:
validation_df

Unnamed: 0_level_0,staining,Genome Length (bp),Jumbophage,molGC (%),Number CDS,Positive Strand (%),Negative Strand (%),Coding Capacity (%),tRNAs,Molecule_DNA,Molecule_RNA,Molecule_ss-DNA,Molecule_ss-RNA,Community,Probability Negative,Probability Positive,Predicted Staining
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
MN871444,negative,269351,True,45.892,333,89.189189,10.810811,94.31114,0,True,False,False,False,,0.96,0.04,negative
MN871443,positive,58036,False,39.849,96,86.458333,13.541667,87.967813,1,True,False,False,False,,0.01,0.99,positive
MN871442,negative,223245,True,38.527,393,14.503817,85.496183,93.604784,8,True,False,False,False,,0.97,0.03,negative
MN871441,negative,228419,True,37.81,430,14.186047,85.813953,90.58616,10,True,False,False,False,,0.99,0.01,negative
MN871440,negative,221312,True,41.954,326,85.889571,14.110429,93.305379,22,True,False,False,False,,0.99,0.01,negative
OM913599,negative,45276,False,47.4,66,100.0,0.0,91.459051,0,True,False,False,False,,0.86,0.14,negative
OM913598,negative,43136,False,52.587,55,98.181818,1.818182,92.477281,0,True,False,False,False,,0.98,0.02,negative
OM913597,negative,42433,False,52.688,54,100.0,0.0,92.277237,0,True,False,False,False,,1.0,0.0,negative
OM810291,negative,57429,False,52.503,69,100.0,0.0,88.885406,2,True,False,False,False,,0.97,0.03,negative
ON000910,negative,249513,True,41.215,386,14.766839,85.233161,92.904578,26,True,False,False,False,,1.0,0.0,negative


# Feature selection

In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

model = RandomForestClassifier()
sfm = SelectFromModel(model, threshold='mean')
fit = sfm.fit(X_train, y_train)
selected_features_sfm = X_train.columns[fit.get_support()]


In [57]:
selected_features_sfm

Index(['Genome Length (bp)', 'molGC (%)', 'Number CDS', 'Positive Strand (%)',
       'Negative Strand (%)', 'Coding Capacity (%)'],
      dtype='object')