# ███████████████████
# Setting Current Directory
## The Location of Model Files Being Used
# ███████████████████

In [None]:
# Placeholder path: replace it with the folder that contains the model files
# loaded later by relative name (H4S2_model.h5 and H4S2_cNon_model.h5).
cd C:\Users\File\Path\To\DatasetFiles\StorageFolder

In [None]:
# List the contents of the current directory (e.g. to check that the model files are there).
ls

# ███████████████████
# Importing Required Packages
# ███████████████████

In [9]:
import pandas as pd #pandas Version 1.5.3 stated here, but the version check output below reports 2.2.2
import numpy as np #numpy Version 1.24.3
import tensorflow as tf #tensorflow Version 2.15.0
import tensorflow_addons as tfa #tensorflow_addons Version 0.22.0

In [10]:
import sys
# Print the full version string of the Python interpreter executing this notebook
print(f"Python Version: {sys.version}")

Python Version: 3.11.4 | packaged by Anaconda, Inc. | (main, Jul  5 2023, 13:47:18) [MSC v.1916 64 bit (AMD64)]


In [11]:
import importlib.metadata

# Distribution names whose installed versions should be reported
packages = ['pandas', 'numpy', 'tensorflow', 'tensorflow_addons']

# Emit one line per package: its installed version, or a notice when it is absent
for package in packages:
    try:
        version = importlib.metadata.version(package)
    except importlib.metadata.PackageNotFoundError:
        print(f"{package} is not installed.")
    else:
        print(f"{package}: {version}")

pandas: 2.2.2
numpy: 1.24.3
tensorflow: 2.15.0
tensorflow_addons: 0.22.0


# ██████████████████████████████████
# Load the H4S2 Model and H4S2_cNon Model
# ██████████████████████████████████

In [12]:
# Load the two saved models and bind them to descriptive names.
# The file names are relative, so they are resolved against the current working directory.
H4S2_model = tf.keras.models.load_model('H4S2_model.h5')
H4S2_cNon_model = tf.keras.models.load_model('H4S2_cNon_model.h5')




# ██████████████████████████████████
# Establish Encoding Function
# ██████████████████████████████████

In [13]:
def encode_sequences_to_numpy_array(working_df, sequence_column_name='sequence'):
    """
    One-hot encode the nucleotide sequences stored in a DataFrame column.

    Every base is mapped to a 4-wide one-hot vector in the channel order
    A, T, G, C (upper- and lower-case letters are treated the same), and the
    per-position vectors of a sequence are concatenated into one flat row.

    The input DataFrame is left untouched: no helper column is added to it,
    and its index labels (unique or not) play no role in the encoding.

    Args:
        working_df (pandas.DataFrame): Input DataFrame containing the sequences.
        sequence_column_name (str): Name of the column in `working_df` that
            holds the sequences. Defaults to 'sequence'.

    Returns:
        tuple: `(one_hot_seq, feature_names)` where

            one_hot_seq (numpy.ndarray): float32 array of shape
                (number of sequences, 4 * sequence length).
            feature_names (list of str): One name per column of `one_hot_seq`,
                formatted as 'Position_<1-based position>_<base>'.

    Raises:
        ValueError: If the column does not exist, if it holds no sequences,
            or if the sequences do not all have the same length.
        KeyError: If a sequence contains a character other than A, T, G or C
            (in either case).
    """
    # Define a mapping from nucleotides to integers (this fixes the one-hot channel order)
    nt_to_int = {'A': 0, 'T': 1, 'G': 2, 'C': 3, 'a': 0, 't': 1, 'g': 2, 'c': 3}
    bases = ['A', 'T', 'G', 'C']

    # Check if the sequence_column_name exists in the DataFrame
    if sequence_column_name not in working_df.columns:
        raise ValueError(f"The specified sequence_column_name '{sequence_column_name}' does not exist in the file.\nThe file contains: '{list(working_df.columns)}'")

    # Work on a plain list so the caller's DataFrame is never modified
    sequences = list(working_df[sequence_column_name])
    if not sequences:
        raise ValueError(f"Column '{sequence_column_name}' contains no sequences to encode.")

    # All sequences must share the length of the first one, otherwise the
    # result cannot be a rectangular array and the feature names would be wrong
    sequence_length = len(sequences[0])

    int_rows = []
    for row_number, sequence in enumerate(sequences):
        if len(sequence) != sequence_length:
            raise ValueError(
                f"Sequence at row {row_number} ('{sequence}') has length {len(sequence)}, "
                f"expected {sequence_length}; all sequences must have the same length."
            )
        unknown = sorted(set(sequence).difference(nt_to_int))
        if unknown:
            # KeyError is kept as the exception type a failed nucleotide lookup has always raised
            raise KeyError(
                f"Sequence at row {row_number} ('{sequence}') contains unsupported "
                f"character(s) {unknown}; only A, T, G, C (upper or lower case) can be encoded."
            )
        int_rows.append([nt_to_int[nt] for nt in sequence])

    # Explicit reshape keeps the matrix 2-D even when the sequences are empty strings
    int_matrix = np.array(int_rows, dtype=np.int64).reshape(len(sequences), sequence_length)

    # Row i of the 4x4 identity matrix is the one-hot vector for class i;
    # indexing with the integer matrix yields shape (n_sequences, sequence_length, 4)
    one_hot_seq = np.eye(4, dtype=np.float32)[int_matrix]

    # Flatten each sequence's (sequence_length, 4) block into a single feature row
    one_hot_seq = one_hot_seq.reshape(len(sequences), 4 * sequence_length)

    # Generate feature names in the same position-major, base-minor order as the columns
    feature_names = [f"Position_{i+1}_{base}" for i in range(sequence_length) for base in bases]

    return one_hot_seq, feature_names

# ██████████████████████████████████
# Establish Prediction Function
# ██████████████████████████████████

In [14]:
def make_predictions(model, encoded_sequences):
    """
    Run a model on already-encoded sequences and hand back its output.

    Args:
        model: Object exposing a ``predict`` method (this notebook passes the
            loaded neural network models).
        encoded_sequences: Encoded input, forwarded to ``model.predict`` unchanged.

    Returns:
        Whatever ``model.predict(encoded_sequences)`` returns.
    """
    return model.predict(encoded_sequences)

# ██████████████████████████████████
# Make H4S2 Model Prediction
# ██████████████████████████████████

In [15]:
# H4-S2 example sequences to score
sequences = ['AGTGAT', 'ATTGAT', 'TGTAAC', 'ACTTCT', 'TGCTTA']

# The encoder takes a DataFrame, so wrap the list in a single-column frame
sequences_df = pd.DataFrame({'H4S2': sequences})

# One-hot encode; `labels` receives the feature names returned alongside the array
encoded_sequences, labels = encode_sequences_to_numpy_array(
    sequences_df,
    sequence_column_name='H4S2',
)

# Score the encoded sequences with the H4S2 model
predictions = make_predictions(H4S2_model, encoded_sequences)

# Report the raw output shape, then collapse any multi-dimensional output to 1-D
print("Predictions shape:", predictions.shape)  # Debugging line
predictions = predictions.flatten() if predictions.ndim > 1 else predictions

# Pair every input sequence with its prediction
predictions_df = pd.DataFrame({
    'Sequence': sequences,
    'Predicted_Readcount': predictions,
})

print(predictions_df)

Predictions shape: (5, 1)
  Sequence  Predicted_Readcount
0   AGTGAT             1.100153
1   ATTGAT             0.484277
2   TGTAAC             0.083330
3   ACTTCT             0.005739
4   TGCTTA             0.002375


# ██████████████████████████████████
# Make H4S2_cNon Model Prediction
# ██████████████████████████████████

In [16]:
# H4-S2,N1-N3,N8,N9 example sequences to score
sequences = ['AGTGATACACC', 'AGTGATCTACC', 'AGTGATTGGTA', 'AGTGATTCACC', 'AGTGATTTTCC', 'AGTGATATGCC', 'AGTGATGAACC']

# The encoder takes a DataFrame, so wrap the list in a single-column frame
sequences_df = pd.DataFrame({'H4S2_N1N3_N8N9': sequences})

# One-hot encode; `labels` receives the feature names returned alongside the array
encoded_sequences, labels = encode_sequences_to_numpy_array(
    sequences_df,
    sequence_column_name='H4S2_N1N3_N8N9',
)

# Score the encoded sequences with the H4S2_cNon model
predictions = make_predictions(H4S2_cNon_model, encoded_sequences)

# Report the raw output shape, then collapse any multi-dimensional output to 1-D
print("Predictions shape:", predictions.shape)  # Debugging line
predictions = predictions.flatten() if predictions.ndim > 1 else predictions

# Pair every input sequence with its prediction
predictions_df = pd.DataFrame({
    'Sequence': sequences,
    'Predicted_Readcount': predictions,
})

print(predictions_df)

Predictions shape: (7, 1)
      Sequence  Predicted_Readcount
0  AGTGATACACC             0.791120
1  AGTGATCTACC             0.336031
2  AGTGATTGGTA             0.020244
3  AGTGATTCACC             0.385307
4  AGTGATTTTCC             0.160189
5  AGTGATATGCC             0.220867
6  AGTGATGAACC             0.572104
