## last vectorizer
- error of ValueError: could not convert string to float:

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from xgboost import XGBClassifier
import joblib

# Load the data
df = pd.read_pickle("../data/processed/final_processed_data.pkl")
# Convert DNA sequence columns to string type
df['Parent_full_DNA_Seq'] = df['Parent_full_DNA_Seq'].astype(str)
df['Child_full_DNA_Seq'] = df['Child_full_DNA_Seq'].astype(str)
# Load vectorizer as kmer
vectorizer = joblib.load('../data/interim/kmer_model.sav')
# Select relevant columns
df_pre = df[['Parent_full_DNA_Seq','Child_full_DNA_Seq','target']]
# Split data into train and test sets
x = df_pre.drop('target', axis=1)
y = df_pre['target']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)



# Define transformer for k-mer embedding
class KmerEmbeddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vectorizer):
        self.vectorizer = vectorizer

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        X_ch = self.vectorizer.fit_transform(X_copy['Child_full_DNA_Seq'])
        X_p = self.vectorizer.fit_transform(X_copy['Parent_full_DNA_Seq'])
        kmer_embeddings_c = X_ch.toarray()
        kmer_embeddings_p = X_p.toarray()
        kmer_embeddings_p = np.pad(kmer_embeddings_p, ((0, 0), (0, 1)), 'constant')
        for i in range(len(kmer_embeddings_c[0])):
            column_name_child = 'child_gene_k_' + str(i)
            column_name_parent = 'parent_gene_k_' + str(i)
            X_copy[column_name_child] = [kmer_embeddings_c[j][i] for j in range(len(X_copy))]
            X_copy[column_name_parent] = [kmer_embeddings_p[j][i] for j in range(len(X_copy))]
        return X_copy

# Define custom transformer to drop specific columns
class DropSpecificColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        for column in self.columns_to_drop:
            if column in X_copy.columns:
                X_copy.drop(column, axis=1, inplace=True)
        return X_copy

# Define the pipeline
pipeline = Pipeline([
    ('kmer_embedding', KmerEmbeddingTransformer(vectorizer)),
    ('drop_specific_columns', DropSpecificColumns(columns_to_drop=['child_gene_k_64', 'parent_gene_k_64'])),
    ('scaler', MinMaxScaler()),
    ('model', XGBClassifier(
        learning_rate=0.3,
        n_estimators=15000,
        max_depth=15
    ))
])

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Save the pipeline
joblib.dump(pipeline, 'kmer_embedding_pipeline.pkl')

# Load the pipeline
pipeline_loaded = joblib.load('kmer_embedding_pipeline.pkl')

# Predictions
y_pred_test = pipeline.predict(X_test)
# Evaluate the model
accuracy_test = accuracy_score(y_test, y_pred_test)

print("Test Accuracy:", accuracy_test)

# 25 min to run test/ 1


## npy

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import joblib
# Import necessary libraries
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer

# Load the data
df = pd.read_pickle("../data/processed/final_processed_data.pkl")

# Convert DNA sequence columns to string type
df['Parent_full_DNA_Seq'] = df['Parent_full_DNA_Seq'].astype(str)
df['Child_full_DNA_Seq'] = df['Child_full_DNA_Seq'].astype(str)

# Load vectorizer as kmer
vectorizer = joblib.load('../data/interim/kmer_model.sav')

# Load precomputed k-mer embeddings for training data
kmer_embeddings_c = np.load('../data/interim/train_vectorizer_np/kmer_embeddings_c1.npy')
kmer_embeddings_p = np.load('../data/interim/train_vectorizer_np/kmer_embeddings_p1.npy')

# Select relevant columns
df_pre = df[['Parent_full_DNA_Seq', 'Child_full_DNA_Seq', 'target']]

# Split data into train and test sets
x = df_pre.drop('target', axis=1)
y = df_pre['target']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

X_train = X_train[:len(kmer_embeddings_c)]
y_train = y_train[:len(kmer_embeddings_c)]


# Truncate k-mer embeddings arrays
kmer_embeddings_c_truncated = kmer_embeddings_c[:len(X_train)]
kmer_embeddings_p_truncated = kmer_embeddings_p[:len(X_train)]


# Define transformer for k-mer embedding
class KmerEmbeddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, kmer_embeddings_c, kmer_embeddings_p):
        self.kmer_embeddings_c = kmer_embeddings_c
        self.kmer_embeddings_p = kmer_embeddings_p

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        max_length = min(len(X_copy), len(self.kmer_embeddings_c))
        X_copy = X_copy.iloc[:max_length]
# Check the length of X_copy
        print("Length of X_copy:", len(X_copy))

        # Check the length of kmer_embeddings_c and kmer_embeddings_p
        print("Length of kmer_embeddings_c:", len(kmer_embeddings_c))
        print("Length of kmer_embeddings_p:", len(kmer_embeddings_p))

        # Check the consistency of train-test split
        print("Length of X_train:", len(X_train))
        print("Length of X_test:", len(X_test))
        print("Length of y_train:", len(y_train))
        print("Length of y_test:", len(y_test))

        
        for i in range(len(self.kmer_embeddings_c[0])):
            X_copy[f'child_gene_k_{i}'] = [self.kmer_embeddings_c[j][i] for j in range(len(X_copy))]
            X_copy[f'parent_gene_k_{i}'] = [self.kmer_embeddings_p[j][i] for j in range(len(X_copy))]
        return X_copy
# Define custom transformer to drop specific columns
class DropSpecificColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        for column in self.columns_to_drop:
            if column in X_copy.columns:
                X_copy.drop(column, axis=1, inplace=True)
        return X_copy
    
    

# Transform data using KmerEmbeddingTransformer
kmer_transformer = KmerEmbeddingTransformer(kmer_embeddings_c_truncated, kmer_embeddings_p_truncated)
X_train_transformed = kmer_transformer.fit_transform(X_train)
X_test_transformed = kmer_transformer.transform(X_test)

# Drop 'Parent_full_DNA_Seq' and 'Child_full_DNA_Seq' columns
X_train_transformed.drop(['Parent_full_DNA_Seq', 'Child_full_DNA_Seq'], axis=1, inplace=True)
X_test_transformed.drop(['Parent_full_DNA_Seq', 'Child_full_DNA_Seq'], axis=1, inplace=True)

# Drop specific columns
column_dropper = DropSpecificColumns(columns_to_drop=['child_gene_k_64', 'parent_gene_k_64'])
X_train_transformed = column_dropper.fit_transform(X_train_transformed)
X_test_transformed = column_dropper.transform(X_test_transformed)

# # Define the main pipeline with preprocessing steps
# pipeline = Pipeline([
#     ('scaler', MinMaxScaler()),
#     ('model', SVC(kernel='rbf', C=0.5, gamma='scale'))
# ])                                          
                                              
# Define the main pipeline with preprocessing steps
pipeline = Pipeline([
    # ('preprocessor', preprocessor),  # Preprocess DNA sequences
    #('kmer_embedding', KmerEmbeddingTransformer(kmer_embeddings_c, kmer_embeddings_p)),
    #('drop_specific_columns', DropSpecificColumns(columns_to_drop=['child_gene_k_64', 'parent_gene_k_64'])),
    ('scaler', MinMaxScaler()),
    ('model', XGBClassifier(
        learning_rate=0.3,
        n_estimators=15000,
        max_depth=15
    ))
])


Length of X_copy: 40938
Length of kmer_embeddings_c: 40938
Length of kmer_embeddings_p: 40938
Length of X_train: 40938
Length of X_test: 11757
Length of y_train: 40938
Length of y_test: 11757


  X_copy[f'parent_gene_k_{i}'] = [self.kmer_embeddings_p[j][i] for j in range(len(X_copy))]
  X_copy[f'child_gene_k_{i}'] = [self.kmer_embeddings_c[j][i] for j in range(len(X_copy))]


Length of X_copy: 11757
Length of kmer_embeddings_c: 40938
Length of kmer_embeddings_p: 40938
Length of X_train: 40938
Length of X_test: 11757
Length of y_train: 40938
Length of y_test: 11757


In [3]:
# Fit the pipeline using transformed data
pipeline.fit(X_train_transformed, y_train)
# Save the pipeline
joblib.dump(pipeline, 'kmer_embedding_pipeline.pkl')

# # Load the pipeline
pipeline_loaded = joblib.load('kmer_embedding_pipeline.pkl')

# Evaluate the model
y_pred_test = pipeline.predict(X_test_transformed)
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", accuracy_test)

#time 4 min 10 sec
# Test Accuracy: 0.6273709279578124 / test 1 / SVM
# ------------------------------------
#time 31 min 34 sec 
# Test Accuracy: 0.5374670409117972 / test 2  / XGBClassifier

KeyboardInterrupt: 

# Test 2:

In [4]:
import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.base import BaseEstimator, TransformerMixin

# Load the data
df = pd.read_pickle("../data/processed/final_processed_data.pkl")
df.drop(df.tail(100).index,inplace=True)

# Convert DNA sequence columns to string type
df['Parent_full_DNA_Seq'] = df['Parent_full_DNA_Seq'].astype(str)
df['Child_full_DNA_Seq'] = df['Child_full_DNA_Seq'].astype(str)

# Load precomputed k-mer embeddings for training data
kmer_embeddings_c = np.load('../data/interim/train_vectorizer_np/kmer_embeddings_c1.npy')
kmer_embeddings_p = np.load('../data/interim/train_vectorizer_np/kmer_embeddings_p1.npy')

# Select relevant columns
df_pre = df[['Parent_full_DNA_Seq', 'Child_full_DNA_Seq', 'target']]

# Split data into train and test sets
x = df_pre.drop('target', axis=1)
y = df_pre['target']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

X_train = X_train[:len(kmer_embeddings_c)]
y_train = y_train[:len(kmer_embeddings_c)]

# Define transformer for k-mer embedding
class KmerEmbeddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, kmer_embeddings_c, kmer_embeddings_p):
        self.kmer_embeddings_c = kmer_embeddings_c
        self.kmer_embeddings_p = kmer_embeddings_p

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        max_length = min(len(X_copy), len(self.kmer_embeddings_c))
        X_copy = X_copy.iloc[:max_length]

        for i in range(len(self.kmer_embeddings_c[0])):
            X_copy[f'child_gene_k_{i}'] = [self.kmer_embeddings_c[j][i] for j in range(len(X_copy))]
            X_copy[f'parent_gene_k_{i}'] = [self.kmer_embeddings_p[j][i] for j in range(len(X_copy))]
        return X_copy

# Define custom transformer to drop specific columns
class DropSpecificColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(self.columns_to_drop, axis=1, errors='ignore').copy()

# Define the main pipeline with preprocessing and model training steps
#----------------------------------------------------------------------------------
# PIPELINE for XGB Classifier
#----------------------------------------------------------------------------------
# pipeline = Pipeline([
#     ('kmer_transformer', KmerEmbeddingTransformer(kmer_embeddings_c, kmer_embeddings_p)),
#     ('drop_columns', DropSpecificColumns(columns_to_drop=['Parent_full_DNA_Seq', 'Child_full_DNA_Seq'])),
#     ('scaler', MinMaxScaler()),
#     ('model', XGBClassifier(
#         learning_rate=0.3,
#         n_estimators=15000,
#         max_depth=15
#     ))
# ])

# Test Accuracy: 0.5411244365059114 / test 1 / XGBClassifier
# Time 34 min 10 sec
#----------------------------------------------------------------------------------
# PIPELINE for SVM Classifier
#----------------------------------------------------------------------------------
pipeline = Pipeline([
    ('kmer_transformer', KmerEmbeddingTransformer(kmer_embeddings_c, kmer_embeddings_p)),
    ('drop_columns', DropSpecificColumns(columns_to_drop=['Parent_full_DNA_Seq', 'Child_full_DNA_Seq'])),
    ('scaler', MinMaxScaler()),
    ('model', SVC(kernel='rbf', C=0.5, gamma='scale'))
    
])

# Test Accuracy: 0.6268273916388818 / test 2 / SVM
# Time 4 min 10 sec
#----------------------------------------------------------------------------------

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred_test = pipeline.predict(X_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", accuracy_test)

# # Save the pipeline
# joblib.dump(pipeline, 'kmer_embedding_pipeline.pkl')


  X_copy[f'parent_gene_k_{i}'] = [self.kmer_embeddings_p[j][i] for j in range(len(X_copy))]
  X_copy[f'child_gene_k_{i}'] = [self.kmer_embeddings_c[j][i] for j in range(len(X_copy))]
  X_copy[f'parent_gene_k_{i}'] = [self.kmer_embeddings_p[j][i] for j in range(len(X_copy))]
  X_copy[f'child_gene_k_{i}'] = [self.kmer_embeddings_c[j][i] for j in range(len(X_copy))]


Test Accuracy: 0.6268273916388818


In [5]:
print(f"Test Accuracy:: {accuracy_test * 100:.2f}%")

Test Accuracy:: 62.68%


In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

# Step 1: Data Loading
df = pd.read_pickle("../data/processed/final_processed_data.pkl")
df.drop(df.tail(100).index, inplace=True)

# Step 2: Data Preprocessing
# Convert DNA sequence columns to string type
df['Parent_full_DNA_Seq'] = df['Parent_full_DNA_Seq'].astype(str)
df['Child_full_DNA_Seq'] = df['Child_full_DNA_Seq'].astype(str)

# Load precomputed k-mer embeddings for training data
kmer_embeddings_c = np.load('../data/interim/train_vectorizer_np/kmer_embeddings_c1.npy')
kmer_embeddings_p = np.load('../data/interim/train_vectorizer_np/kmer_embeddings_p1.npy')

# Select relevant columns
df_pre = df[['Parent_full_DNA_Seq', 'Child_full_DNA_Seq', 'target']]

# Split data into train and test sets
x = df_pre.drop('target', axis=1)
y = df_pre['target']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

X_train = X_train[:len(kmer_embeddings_c)]
y_train = y_train[:len(kmer_embeddings_c)]

# Define a custom transformer to reshape data for CNN input
class ReshapeTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.reshape(X.shape[0], X.shape[1], 1)

# Step 3: Model Definition
def create_cnn_model(input_shape):
    model = tf.keras.Sequential([
        tf.keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=input_shape),
        tf.keras.layers.MaxPooling1D(pool_size=2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Step 4: Pipeline Integration
cnn_pipeline = Pipeline([
    ('reshape', ReshapeTransformer()),
    ('scaler', MinMaxScaler()),  # Add any other preprocessing steps as needed
    ('cnn', tf.keras.wrappers.scikit_learn.KerasClassifier(build_fn=create_cnn_model, epochs=10, batch_size=32))
])

# # Step 5: Training
# cnn_pipeline.fit(X_train, y_train)

# # Step 6: Evaluation
# y_pred_test = cnn_pipeline.predict(X_test)
# accuracy_test = accuracy_score(y_test, y_pred_test)
# print("Test Accuracy:", accuracy_test)


: 

In [None]:
# Step 5: Training
cnn_pipeline.fit(X_train, y_train)

# Step 6: Evaluation
y_pred_test = cnn_pipeline.predict(X_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", accuracy_test)