<a href="https://colab.research.google.com/github/BioGeMT/DLforGenomics/blob/dev/notebooks/Computing_k_mers_and_using_ML_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the training set (a .tsv file) from the BioGeMT/DLforGenomics GitHub repository;
# the -O flag saves it as miRNA_train_set.tsv in the current working directory.
!wget https://raw.githubusercontent.com/BioGeMT/DLforGenomics/main/data/Helwak_2013/one_miRNA/miRNA_train_set.tsv -O miRNA_train_set.tsv

--2024-05-30 14:55:45--  https://raw.githubusercontent.com/BioGeMT/DLforGenomics/main/data/Helwak_2013/one_miRNA/miRNA_train_set.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 101082 (99K) [text/plain]
Saving to: ‘miRNA_train_set.tsv’


2024-05-30 14:55:46 (6.39 MB/s) - ‘miRNA_train_set.tsv’ saved [101082/101082]



In [None]:
import pandas as pd

# Load the downloaded tab-separated training set into a DataFrame
# (read_table uses a tab as its default separator).
df = pd.read_table('miRNA_train_set.tsv')

# Leave the frame as the last expression so the notebook renders it
df

Unnamed: 0,gene,label
0,AGCACTGCCGCCGGGGACTGCTCAGCAACCACACCGGCAGCCCGCG...,1
1,CTTCTCGGAGACGGTGCGCATCATCAACCGCAAGGTGAAGCCGCGG...,1
2,GGGCTGGGCAAAGAATGTGCAAAAGTCTTCTATGCTGCGGGTGCTA...,1
3,TCCACTAGAAGGCTGGGACAGCACCGGTGATTACTGTCTTTCCTGC...,1
4,AGTTCACAGGCTTTGTGGACATGTGTGTGCAGCATATCCCTTCTCC...,1
...,...,...
1902,TCTAAGCCATCCAGTGCCATCCTCGTCGCTGCAGCGACACACGCTC...,0
1903,AACCACGTCTCCTACTTTCCAAACCCATGGCAGTGTCCCTGCTCCA...,0
1904,ATTGAAGCCTGCCCATCCTCCCATGAGAGACTCTTGTTAGTCAACA...,0
1905,GCCAGCCCTACACTCGCCCGCGCCATGGCCTCTGTCTCCGAGCTCG...,0


In [None]:
from collections import Counter

def generate_kmers(sequence, k):
    """Return every overlapping k-mer of ``sequence`` in left-to-right order.

    Args:
        sequence: String (or any other sliceable sequence) to cut into k-mers.
        k: Length of each k-mer; must be at least 1.

    Returns:
        A list with the ``len(sequence) - k + 1`` contiguous windows of length
        ``k``. The list is empty when ``sequence`` is shorter than ``k``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        # Without this guard k == 0 yields len(sequence) + 1 empty strings and a
        # negative k yields truncated slices; both would be silently counted as
        # "k-mers" by the callers.
        raise ValueError(f"k must be a positive integer, got {k!r}")
    return [sequence[i:i + k] for i in range(len(sequence) - k + 1)]

def count_kmers(sequence, k):
    """Tally how often each k-mer occurs in ``sequence``.

    Args:
        sequence: Sequence to scan for k-mers.
        k: k-mer length, forwarded unchanged to ``generate_kmers``.

    Returns:
        A ``collections.Counter`` mapping every k-mer produced by
        ``generate_kmers`` to its number of occurrences.
    """
    return Counter(generate_kmers(sequence, k))

# k-mer length used for the features below
k = 6

# Attach to every row a Counter of the overlapping k-mers found in its 'gene'
# sequence; `k` is handed to count_kmers as its second positional argument.
df['kmer_counts'] = df['gene'].apply(count_kmers, args=(k,))

# Preview the first rows, which now include the new 'kmer_counts' column
df.head()

Unnamed: 0,gene,label,kmer_counts
0,AGCACTGCCGCCGGGGACTGCTCAGCAACCACACCGGCAGCCCGCG...,1,"{'AGCACT': 1, 'GCACTG': 1, 'CACTGC': 1, 'ACTGC..."
1,CTTCTCGGAGACGGTGCGCATCATCAACCGCAAGGTGAAGCCGCGG...,1,"{'CTTCTC': 1, 'TTCTCG': 1, 'TCTCGG': 1, 'CTCGG..."
2,GGGCTGGGCAAAGAATGTGCAAAAGTCTTCTATGCTGCGGGTGCTA...,1,"{'GGGCTG': 1, 'GGCTGG': 1, 'GCTGGG': 1, 'CTGGG..."
3,TCCACTAGAAGGCTGGGACAGCACCGGTGATTACTGTCTTTCCTGC...,1,"{'TCCACT': 1, 'CCACTA': 1, 'CACTAG': 1, 'ACTAG..."
4,AGTTCACAGGCTTTGTGGACATGTGTGTGCAGCATATCCCTTCTCC...,1,"{'AGTTCA': 1, 'GTTCAC': 1, 'TTCACA': 1, 'TCACA..."


In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Target vector: the 'label' column of the DataFrame
y = df['label']

# Feature matrix: expand the per-sequence k-mer counters into a dense array
# (sparse=False) with one column per distinct k-mer key.
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform(df['kmer_counts'])

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)



In [None]:
from sklearn.svm import SVC

# Fit a linear-kernel support vector machine on the training split
# (fit() returns the estimator, so construction and training are chained).
svm_classifier = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predict labels for the held-out split
svm_y_pred = svm_classifier.predict(X_test)

# Score the predictions: overall accuracy plus the per-class text report
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_report = classification_report(y_test, svm_y_pred)

# One print call, one item per line
print(
    f'Accuracy (SVM): {svm_accuracy}',
    'Classification Report (SVM):',
    svm_report,
    sep='\n',
)

Accuracy (SVM): 0.7329842931937173
Classification Report (SVM):
              precision    recall  f1-score   support

           0       0.76      0.74      0.75       103
           1       0.70      0.73      0.72        88

    accuracy                           0.73       191
   macro avg       0.73      0.73      0.73       191
weighted avg       0.73      0.73      0.73       191



In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression model on the training split, allowing the solver
# up to 1000 iterations (fit() returns the estimator, so the calls are chained).
lr_classifier = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out split
lr_y_pred = lr_classifier.predict(X_test)

# Score the predictions: overall accuracy plus the per-class text report
lr_accuracy = accuracy_score(y_test, lr_y_pred)
lr_report = classification_report(y_test, lr_y_pred)

# One print call, one item per line
print(
    f'Accuracy (Logistic Regression): {lr_accuracy}',
    'Classification Report (Logistic Regression):',
    lr_report,
    sep='\n',
)

Accuracy (Logistic Regression): 0.7696335078534031
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.79      0.79      0.79       103
           1       0.75      0.75      0.75        88

    accuracy                           0.77       191
   macro avg       0.77      0.77      0.77       191
weighted avg       0.77      0.77      0.77       191



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the training split
# (fit() returns the estimator, so construction and training are chained).
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out split
rf_y_pred = rf_classifier.predict(X_test)

# Score the predictions: overall accuracy plus the per-class text report
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_report = classification_report(y_test, rf_y_pred)

# One print call, one item per line
print(
    f'Accuracy (Random Forest): {rf_accuracy}',
    'Classification Report (Random Forest):',
    rf_report,
    sep='\n',
)

Accuracy (Random Forest): 0.8115183246073299
Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.87      0.77      0.81       103
           1       0.76      0.86      0.81        88

    accuracy                           0.81       191
   macro avg       0.81      0.82      0.81       191
weighted avg       0.82      0.81      0.81       191



In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient boosting ensemble with 100 boosting stages on the training split
# (fit() returns the estimator, so construction and training are chained).
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out split
gb_y_pred = gb_classifier.predict(X_test)

# Score the predictions: overall accuracy plus the per-class text report
gb_accuracy = accuracy_score(y_test, gb_y_pred)
gb_report = classification_report(y_test, gb_y_pred)

# One print call, one item per line
print(
    f'Accuracy (Gradient Boosting): {gb_accuracy}',
    'Classification Report (Gradient Boosting):',
    gb_report,
    sep='\n',
)

Accuracy (Gradient Boosting): 0.774869109947644
Classification Report (Gradient Boosting):
              precision    recall  f1-score   support

           0       0.76      0.84      0.80       103
           1       0.79      0.69      0.74        88

    accuracy                           0.77       191
   macro avg       0.78      0.77      0.77       191
weighted avg       0.78      0.77      0.77       191



In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier that votes over the 5 closest training rows
# (fit() returns the estimator, so construction and training are chained).
knn_classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict labels for the held-out split
knn_y_pred = knn_classifier.predict(X_test)

# Score the predictions: overall accuracy plus the per-class text report
knn_accuracy = accuracy_score(y_test, knn_y_pred)
knn_report = classification_report(y_test, knn_y_pred)

# One print call, one item per line
print(
    f'Accuracy (K-Nearest Neighbors): {knn_accuracy}',
    'Classification Report (K-Nearest Neighbors):',
    knn_report,
    sep='\n',
)

Accuracy (K-Nearest Neighbors): 0.6649214659685864
Classification Report (K-Nearest Neighbors):
              precision    recall  f1-score   support

           0       0.71      0.65      0.68       103
           1       0.62      0.68      0.65        88

    accuracy                           0.66       191
   macro avg       0.67      0.67      0.66       191
weighted avg       0.67      0.66      0.67       191



In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes model (default settings) on the k-mer count features
# (fit() returns the estimator, so construction and training are chained).
nb_classifier = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out split
nb_y_pred = nb_classifier.predict(X_test)

# Score the predictions: overall accuracy plus the per-class text report
nb_accuracy = accuracy_score(y_test, nb_y_pred)
nb_report = classification_report(y_test, nb_y_pred)

# One print call, one item per line
print(
    f'Accuracy (Naive Bayes): {nb_accuracy}',
    'Classification Report (Naive Bayes):',
    nb_report,
    sep='\n',
)

Accuracy (Naive Bayes): 0.7120418848167539
Classification Report (Naive Bayes):
              precision    recall  f1-score   support

           0       0.74      0.72      0.73       103
           1       0.68      0.70      0.69        88

    accuracy                           0.71       191
   macro avg       0.71      0.71      0.71       191
weighted avg       0.71      0.71      0.71       191



In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a single decision tree on the training split, seeded for repeatability
# (fit() returns the estimator, so construction and training are chained).
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out split
dt_y_pred = dt_classifier.predict(X_test)

# Score the predictions: overall accuracy plus the per-class text report
dt_accuracy = accuracy_score(y_test, dt_y_pred)
dt_report = classification_report(y_test, dt_y_pred)

# One print call, one item per line
print(
    f'Accuracy (Decision Tree): {dt_accuracy}',
    'Classification Report (Decision Tree):',
    dt_report,
    sep='\n',
)

Accuracy (Decision Tree): 0.680628272251309
Classification Report (Decision Tree):
              precision    recall  f1-score   support

           0       0.74      0.62      0.68       103
           1       0.63      0.75      0.68        88

    accuracy                           0.68       191
   macro avg       0.69      0.69      0.68       191
weighted avg       0.69      0.68      0.68       191



In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Fit an AdaBoost ensemble with up to 100 estimators on the training split
# (fit() returns the estimator, so construction and training are chained).
ab_classifier = AdaBoostClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out split
ab_y_pred = ab_classifier.predict(X_test)

# Score the predictions: overall accuracy plus the per-class text report
ab_accuracy = accuracy_score(y_test, ab_y_pred)
ab_report = classification_report(y_test, ab_y_pred)

# One print call, one item per line
print(
    f'Accuracy (AdaBoost): {ab_accuracy}',
    'Classification Report (AdaBoost):',
    ab_report,
    sep='\n',
)

Accuracy (AdaBoost): 0.7277486910994765
Classification Report (AdaBoost):
              precision    recall  f1-score   support

           0       0.73      0.78      0.75       103
           1       0.72      0.67      0.69        88

    accuracy                           0.73       191
   macro avg       0.73      0.72      0.72       191
weighted avg       0.73      0.73      0.73       191



In [None]:
import xgboost as xgb

# Train an XGBOOST classifier, using log-loss as its evaluation metric.
# `use_label_encoder` is deliberately not passed: the flag is deprecated upstream,
# and recent XGBoost releases ignore it and emit an "unused parameter" warning.
# NOTE(review): confirm against the XGBoost version pinned for this course.
xgb_classifier = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_classifier.fit(X_train, y_train)

# Make predictions with XGBOOST on the held-out split
xgb_y_pred = xgb_classifier.predict(X_test)

# Evaluate the XGBOOST classifier: overall accuracy plus the per-class text report
xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
xgb_report = classification_report(y_test, xgb_y_pred)

print(f'Accuracy (XGBoost): {xgb_accuracy}')
print('Classification Report (XGBoost):')
print(xgb_report)

Accuracy (XGBoost): 0.8115183246073299
Classification Report (XGBoost):
              precision    recall  f1-score   support

           0       0.81      0.84      0.83       103
           1       0.81      0.77      0.79        88

    accuracy                           0.81       191
   macro avg       0.81      0.81      0.81       191
weighted avg       0.81      0.81      0.81       191



In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, average_precision_score

# For every fitted model collect (true labels, hard predictions, continuous scores).
# The score is the second predict_proba column, except for the SVM, where the
# decision_function output is used instead.
# All names referenced here must already exist from the cells above.
classifiers = {
    'Random Forest': (y_test, rf_y_pred, rf_classifier.predict_proba(X_test)[:, 1]),
    'Logistic Regression': (y_test, lr_y_pred, lr_classifier.predict_proba(X_test)[:, 1]),
    'SVM': (y_test, svm_y_pred, svm_classifier.decision_function(X_test)),
    'Gradient Boosting': (y_test, gb_y_pred, gb_classifier.predict_proba(X_test)[:, 1]),
    'K-Nearest Neighbors': (y_test, knn_y_pred, knn_classifier.predict_proba(X_test)[:, 1]),
    'Naive Bayes': (y_test, nb_y_pred, nb_classifier.predict_proba(X_test)[:, 1]),
    'Decision Tree': (y_test, dt_y_pred, dt_classifier.predict_proba(X_test)[:, 1]),
    'AdaBoost': (y_test, ab_y_pred, ab_classifier.predict_proba(X_test)[:, 1]),
    'XGBoost': (y_test, xgb_y_pred, xgb_classifier.predict_proba(X_test)[:, 1]),
}

# One summary row per model: accuracy from the hard predictions and
# average precision (reported as AUPRC) from the continuous scores.
results = [
    {
        'Algorithm': clf_name,
        'Accuracy': accuracy_score(true_labels, pred_labels),
        'AUPRC': average_precision_score(true_labels, pred_probs),
    }
    for clf_name, (true_labels, pred_labels, pred_probs) in classifiers.items()
]

# Assemble the rows into a table and print it as plain text
results_df = pd.DataFrame(results)

print(results_df)


             Algorithm  Accuracy     AUPRC
0        Random Forest  0.811518  0.856426
1  Logistic Regression  0.769634  0.817499
2                  SVM  0.732984  0.791691
3    Gradient Boosting  0.774869  0.833846
4  K-Nearest Neighbors  0.664921  0.721924
5          Naive Bayes  0.712042  0.787661
6        Decision Tree  0.680628  0.586612
7             AdaBoost  0.727749  0.771115
8              XGBoost  0.811518  0.849115
