In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [13]:
def load_data(file_path, species_id):
    """Read a tab-separated ``sequence<TAB>class`` text file into a DataFrame.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.
    species_id : int
        Offset added to every class number read from the file, so that
        different files can be given distinct label ranges.

    Returns
    -------
    pandas.DataFrame
        Columns ``sequence`` and ``label``, one row per line that could be
        parsed. Header lines, blank lines, lines without a label column and
        lines whose label is not an integer are skipped.
    """
    sequences = []
    labels = []
    with open(file_path, 'r') as f:
        for line in f:
            # Skip the header line (assumed to start with 'sequence')
            if line.startswith('sequence'):
                continue
            parts = line.strip().split("\t")
            try:
                label = int(parts[1]) + species_id  # Create unique class labels
            except (IndexError, ValueError):
                # IndexError: blank line or a line with no tab-separated label.
                # ValueError: the label column is not a valid integer.
                continue
            sequences.append(parts[0])
            labels.append(label)
    return pd.DataFrame({"sequence": sequences, "label": labels})

In [14]:
# Function to generate k-mers from a DNA sequence
def generate_kmers(sequence, k=3):
    """Return every overlapping length-``k`` substring of ``sequence``, left to right.

    A sequence shorter than ``k`` produces an empty list.
    """
    kmers = []
    last_start = len(sequence) - k
    for start in range(last_start + 1):
        kmers.append(sequence[start:start + k])
    return kmers

In [5]:
# Load data for human, chimpanzee, and dog.
# The second argument is the offset load_data adds to each file's class number;
# offsets 0 / 7 / 14 keep the species' label ranges apart, assuming every file
# uses class numbers 0-6 (TODO confirm against the data files).
human_df = load_data('Data/human.txt', 0)        # Class 0-6 for human
chimpanzee_df = load_data('Data/chimpanzee.txt', 7)  # Class 7-13 for chimpanzee
dog_df = load_data('Data/dog.txt', 14)            # Class 14-20 for dog (the frame displayed below contains labels 19 and 20)

# Combine all data into one dataframe; ignore_index=True renumbers the rows 0..n-1
df = pd.concat([human_df, chimpanzee_df, dog_df], ignore_index=True)


In [15]:
# Turn each sequence into a space-separated "sentence" of overlapping 3-mers
# so it can be fed to a text vectorizer.
df['kmers'] = df['sequence'].map(lambda s: ' '.join(generate_kmers(s, k=3)))


In [16]:
# Display the combined frame: one row per sequence with its label and k-mer sentence
df

Unnamed: 0,sequence,label,kmers
0,ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCA...,4,ATG TGC GCC CCC CCC CCA CAA AAC ACT CTA TAA AA...
1,ATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAG...,4,ATG TGA GAA AAC ACG CGA GAA AAA AAA AAT ATC TC...
2,ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...,3,ATG TGT GTG TGT GTG TGG GGC GCA CAT ATT TTT TT...
3,ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...,3,ATG TGT GTG TGT GTG TGG GGC GCA CAT ATT TTT TT...
4,ATGCAACAGCATTTTGAATTTGAATACCAGACCAAAGTGGATGGTG...,3,ATG TGC GCA CAA AAC ACA CAG AGC GCA CAT ATT TT...
...,...,...,...
6877,ATGGTCGGTCCGGAGAAGGAGCAGAGCTGGATCCCTAAGATCTTCA...,19,ATG TGG GGT GTC TCG CGG GGT GTC TCC CCG CGG GG...
6878,ATGGCGGCGACGGTGGCTGCGGCGGCCGCCGACGCGGGGCCGGGGG...,20,ATG TGG GGC GCG CGG GGC GCG CGA GAC ACG CGG GG...
6879,ATGAGCTCGGCCGACAAGGCCCGGGTGGGGCCCGCGGCCGACGGGC...,20,ATG TGA GAG AGC GCT CTC TCG CGG GGC GCC CCG CG...
6880,GCCCCGAGGATGGGCAGGGTCCCGCTGGCCTGGTGCTTGGCGCTGT...,15,GCC CCC CCC CCG CGA GAG AGG GGA GAT ATG TGG GG...


In [17]:
# Bag-of-words vectorizer with scikit-learn's default settings; it is fitted
# on the space-separated k-mer strings in df['kmers'] in the next cell.
vectorizer = CountVectorizer()


In [18]:
# Target vector: the combined species + class label of every row
y = df['label'].to_numpy()

# Feature matrix: per-row token counts learned from the k-mer sentences
X = vectorizer.fit_transform(df['kmers'])

In [19]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [20]:
# Decision tree baseline. scikit-learn permutes the candidate features at every
# split, so without a fixed seed the fitted tree (and the accuracy printed
# below) can change from run to run; pin the seed for reproducibility.
clf = DecisionTreeClassifier(random_state=42)

# Train the classifier on the training split
clf.fit(X_train, y_train)

# Make predictions on the held-out test split
y_pred = clf.predict(X_test)

# Evaluate the model: fraction of test rows whose predicted label is correct
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 42.77%


In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Initialize the Random Forest classifier: 100 trees, depth capped at 10.
# The forest is built from random bootstrap samples and random feature subsets,
# so the seed is fixed to make the cross-validation score reproducible.
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

# Perform 5-fold cross-validation on the full feature matrix
cv_scores = cross_val_score(rf_clf, X, y, cv=5, scoring='accuracy')

# Output the average accuracy score from cross-validation
print(f"Average Cross-Validation Accuracy: {cv_scores.mean() * 100:.2f}%")


Average Cross-Validation Accuracy: 23.41%


In [22]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Support-vector classifier using the radial-basis-function kernel
svm_clf = SVC(
    kernel='rbf',
    random_state=42,
)

# Score the classifier with 5-fold cross-validation (accuracy per fold)
cv_scores_svm = cross_val_score(
    svm_clf,
    X,
    y,
    cv=5,
    scoring='accuracy',
)

# Report the mean of the per-fold accuracies
print(f"Average Cross-Validation Accuracy (SVM): {cv_scores_svm.mean() * 100:.2f}%")


Average Cross-Validation Accuracy (SVM): 29.44%


In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

# Function to load data from text files
def load_data(file_path, species_id):
    """Read a tab-separated ``sequence<TAB>class`` text file into a DataFrame.

    Parameters
    ----------
    file_path : str
        Path of the text file to read. An empty file yields an empty frame.
    species_id : int
        Offset added to every class number read from the file, so that
        different files can be given distinct label ranges.

    Returns
    -------
    pandas.DataFrame
        Columns ``sequence`` and ``label``, one row per line that could be
        parsed.

    Notes
    -----
    The first line is treated as a header only when its label column is not
    an integer, so files without a header keep their first record. Any later
    line that cannot be parsed is reported on stdout and skipped.
    """
    sequences = []
    labels = []
    with open(file_path, 'r') as f:
        for line_number, line in enumerate(f):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at end of file) carry no data
                continue
            parts = stripped.split("\t")
            try:
                label = species_id + int(parts[1])  # Create unique class labels
            except (IndexError, ValueError):
                # IndexError: no tab-separated label column; ValueError: label is not an int.
                # A non-numeric first line is the header, so it is skipped quietly.
                if line_number > 0:
                    print(f"Skipping line with invalid label: {stripped}")
                continue
            sequences.append(parts[0])
            labels.append(label)
    return pd.DataFrame({'sequence': sequences, 'label': labels})

# Function to extract k-mers (default size 6)
def get_kmers(sequence, size=6):
    """Return the lower-cased overlapping substrings of length ``size`` in ``sequence``.

    Substrings are produced left to right; a sequence shorter than ``size``
    produces an empty list.
    """
    window_count = len(sequence) - size + 1
    kmers = []
    for offset in range(window_count):
        kmers.append(sequence[offset:offset + size].lower())
    return kmers

# Load the human, chimpanzee, and dog data files.
# The second argument is the offset load_data adds to each file's class number.
human_df = load_data('Data/human.txt', 0)  # Human with class 0-6
chimpanzee_df = load_data('Data/chimpanzee.txt', 7)  # Chimpanzee with class 7-13
dog_df = load_data('Data/dog.txt', 14)  # Dog with class 14-20 (earlier output in this notebook shows dog labels up to 20)

# For every species frame: add a 'words' column holding the list of k-mers of
# each sequence, then drop the raw 'sequence' column, which is no longer needed.
human_df, chimpanzee_df, dog_df = [
    frame.assign(
        words=frame.apply(lambda row: get_kmers(row['sequence']), axis=1)
    ).drop(columns='sequence')
    for frame in (human_df, chimpanzee_df, dog_df)
]

# Join each k-mer list into one space-separated string for vectorization
human_texts, chimp_texts, dog_texts = [
    [' '.join(kmers) for kmers in frame['words']]
    for frame in (human_df, chimpanzee_df, dog_df)
]

# Pull the label column out of each per-species frame
y_human = human_df['label'].values
y_chimp = chimpanzee_df['label'].values
y_dog = dog_df['label'].values

# Stack the three species in a fixed order: human, chimpanzee, dog
X = [*human_texts, *chimp_texts, *dog_texts]
y = [*y_human, *y_chimp, *y_dog]

# Build the count-feature matrix. CountVectorizer's default analyzer works on
# words, and the "words" in X are already 6-mers, so ngram_range=(6, 6) makes
# each feature a run of six consecutive 6-mers rather than a single 6-mer.
# NOTE(review): confirm this is intended; the default ngram_range=(1, 1) would count individual 6-mers.
vectorizer = CountVectorizer(ngram_range=(6, 6))  # word 6-grams over the 6-mer tokens
X_vectorized = vectorizer.fit_transform(X)

# Decision tree with a fixed seed so repeated runs build the same tree
clf = DecisionTreeClassifier(random_state=42)

# Mean accuracy over 5 cross-validation folds
cv_accuracy = cross_val_score(
    clf,
    X_vectorized,
    y,
    cv=5,
    scoring='accuracy',
).mean()
print(f"Average Cross-Validation Accuracy: {cv_accuracy * 100:.2f}%")

# Refit on every sample and score on those same samples. This is training-set
# accuracy, not an estimate of performance on unseen data.
accuracy = clf.fit(X_vectorized, y).score(X_vectorized, y)
print(f"Accuracy: {accuracy * 100:.2f}%")
