<a href="https://colab.research.google.com/github/Angad-2002/Gene_ML/blob/main/Gene_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def _read_dna_table (source):
    """Load one tab-separated DNA dataset into a DataFrame.

    Parameters
    ----------
    source : str or object with a ``name`` attribute
        Either a filesystem path, or an upload object that exposes the path
        of the file through ``.name``.

    Returns
    -------
    pandas.DataFrame
        The table parsed by ``pd.read_table``.
    """
    path = getattr (source, "name", source)
    # The context manager releases the handle as soon as parsing is finished,
    # so repeated calls do not accumulate open file descriptors.
    with open (path) as handle:
        return pd.read_table (handle)


def _kmer_sentences (sequences, size = 6):
    """Convert every sequence into one "sentence" of overlapping k-mers.

    Each sequence is cut into all windows of ``size`` characters (hexamers by
    default), lower-cased and joined with single spaces so that a text
    vectorizer can treat the k-mers as words. A sequence shorter than ``size``
    yields an empty string.
    """
    return [
        ' '.join (seq[start:start + size].lower () for start in range (len (seq) - size + 1))
        for seq in sequences
    ]


def _score_report (y_true, y_predicted):
    """Build the confusion matrix and the formatted metric summary for one dataset.

    Returns
    -------
    tuple
        ``(confusion, summary)`` where ``confusion`` is the 'Actual' versus
        'Predicted' cross-tabulation and ``summary`` is a multi-line string
        holding accuracy and the weighted precision, recall and f1 scores.
    """
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

    confusion = pd.crosstab (pd.Series (y_true, name = 'Actual'), pd.Series (y_predicted, name = 'Predicted'))

    accuracy = accuracy_score (y_true, y_predicted)
    precision = precision_score (y_true, y_predicted, average = 'weighted')
    recall = recall_score (y_true, y_predicted, average = 'weighted')
    f1 = f1_score (y_true, y_predicted, average = 'weighted')

    summary = "accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f\n" % (accuracy, precision, recall, f1) + "\n"
    return confusion, summary


def MNBC (file1, file2, file3):
    """Train a multinomial naive Bayes classifier on human DNA and score it on three datasets.

    Every dataset is a tab-separated table with a ``sequence`` column (the DNA
    string) and a ``class`` column (the label). Sequences are converted to
    sentences of overlapping hexamers and vectorized as 4-grams of those
    hexamers. The classifier is fitted on 85% of the human data and evaluated
    on the 15% human hold-out set, on the full chimpanzee set and on the full
    dog set.

    Parameters
    ----------
    file1, file2, file3 : str or object with a ``name`` attribute
        Human, chimpanzee and dog datasets respectively, given either as a
        path or as an upload object whose ``.name`` is the path.

    Returns
    -------
    tuple
        ``(D1, S1, D2, S2, D3, S3)``: for human hold-out, chimpanzee and dog
        in that order, the confusion matrix (DataFrame) followed by the
        formatted metric summary (str).

    Notes
    -----
    As a side effect the class distribution of each dataset is drawn as a bar
    chart on a pyplot figure named "MNBC class distribution".
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import MultinomialNB

    human_dna = _read_dna_table (file1)
    chimp_dna = _read_dna_table (file2)
    dog_dna = _read_dna_table (file3)

    # K-mer sentences (K = 6, hexamers) feed the bag-of-words model.
    human_texts = _kmer_sentences (human_dna['sequence'])
    chimp_texts = _kmer_sentences (chimp_dna['sequence'])
    dog_texts = _kmer_sentences (dog_dna['sequence'])

    # The class labels are selected by column name rather than by position so
    # the result does not depend on the column order of the input files.
    y_human = human_dna['class'].values
    y_chim = chimp_dna['class'].values
    y_dog = dog_dna['class'].values

    # Bag of words over the k-mer sentences; this is equivalent to k-mer counting.
    # The n-gram size of 4 was determined empirically (trial and error).
    # The vocabulary is learned from the human data only and reused as-is
    # for the other two species.
    cv = CountVectorizer (ngram_range = (4, 4))
    X = cv.fit_transform (human_texts)
    X_chimp = cv.transform (chimp_texts)
    X_dog = cv.transform (dog_texts)

    # One axes per species: without an explicit ``ax`` pandas draws every
    # Series plot on the current axes, which overlays the three charts.
    # Reusing a named figure with clear = True keeps repeated calls from
    # stacking bars or piling up new figures.
    fig, axes = plt.subplots (1, 3, num = "MNBC class distribution", clear = True, figsize = (15, 4))
    distributions = (("Human", human_dna), ("Chimpanzee", chimp_dna), ("Dog", dog_dna))
    for ax, (species, frame) in zip (axes, distributions):
        frame['class'].value_counts ().sort_index ().plot.bar (ax = ax, title = species)
    fig.tight_layout ()

    # 85% of the human dataset trains the model, the remaining 15% is held out.
    X_train, X_test, y_train, y_test = train_test_split (X, y_human, test_size = 0.15, random_state = 42)

    # alpha = 0.1 was selected by trying several values.
    classifier = MultinomialNB (alpha = 0.1)
    classifier.fit (X_train, y_train)

    # Performance on the unseen human hold-out set.
    D1, S1 = _score_report (y_test, classifier.predict (X_test))

    # Performance on the chimpanzee genes.
    D2, S2 = _score_report (y_chim, classifier.predict (X_chimp))

    # Performance on the dog genes.
    D3, S3 = _score_report (y_dog, classifier.predict (X_dog))

    return (D1, S1, D2, S2, D3, S3)

In [None]:
import gradio as gr

# Upload widgets, in the positional order MNBC takes its arguments:
# human, chimpanzee, dog.
# Named dataset_inputs rather than `input` so the builtin input() is not
# shadowed for the rest of the kernel session.
dataset_inputs = [gr.File (label = "Human Dataset"), gr.File (label = "Chimpanzee Dataset"), gr.File (label = "Dog Dataset")]

# Output widgets, in the order of MNBC's return tuple: a confusion matrix
# followed by its metric summary for each of the three datasets.
result_outputs = [gr.Dataframe (label = "Confusion Matrix for Humans"), gr.Text (label = "Precision Metrics for Humans"),
                  gr.Dataframe (label = "Confusion Matrix for Chimpanzees"), gr.Text (label = "Precision Metrics for Chimpanzees"),
                  gr.Dataframe (label = "Confusion Matrix for Dogs"), gr.Text (label = "Precision Metrics for Dogs")]

interface = gr.Interface (fn = MNBC, inputs = dataset_inputs, outputs = result_outputs)
# share = True requests a public URL for the app in addition to the local one.
interface.launch (share = True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://d1fc174cdc3783134b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


