In [16]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
import pandas as pd
import numpy as np
from scipy.fft import fft
from sklearn.manifold import MDS
from sklearn.model_selection import train_test_split
import plotly.express as px

from sklearn.model_selection import KFold

from sklearn.model_selection import cross_validate

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import BaggingClassifier

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

import sklearn
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import classification_report
import os

def kmers(k):
    '''
    Enumerate every string of length k over the alphabet A, C, T, G.

    The result is built by repeatedly appending one base to every string of
    the previous length; the appended base varies slowest, so for k=2 the
    list starts "AA", "CA", "TA", "GA", "AC", ...
    For k <= 1 the four single bases are returned.
    '''
    alphabet = ["A", "C", "T", "G"]

    words = alphabet
    for _ in range(k - 1):
        # Outer loop over the new base, inner loop over the existing prefixes.
        words = [prefix + base for base in alphabet for prefix in words]
    return words
class DNA2Neumerical:
    """
    Turns DNA sequences read from text files into numerical feature
    matrices and cross-validates a fixed set of classifiers on them.

    Typical use: read_data() -> xandy() -> one of to_integer() /
    to_proba() / kmer_proba() -> train().
    """

    def read_data(self, dnadir):
        """
        Reads every ``*.txt`` file in ``dnadir`` and returns a list of DNA
        sequences and a list of labels.

        Each non-empty line is one sequence; a leading ">" is dropped. The
        label of a line is the part of its file name before the first "_".
        Files are visited in sorted name order so the output is reproducible.

        Returns:
        X: list of sequence strings.
        Y: list of label strings, same length as X.
        """
        X = []
        Y = []
        for filename in sorted(os.listdir(dnadir)):
            if not filename.endswith(".txt"):
                continue
            label = filename.split("_")[0]
            # os.path.join works whether or not dnadir ends with a separator.
            with open(os.path.join(dnadir, filename)) as f:
                for line in f:
                    # Strip only the newline: the last line of a file may not
                    # have one, and slicing [:-1] would drop a real base.
                    text = line.rstrip("\n")
                    seq = text[1:] if text.startswith(">") else text
                    if not seq:
                        # Blank lines carry no sequence.
                        continue
                    X.append(seq)
                    Y.append(label)
        return X, Y

    def xandy(self, X, Y, M=None):
        """
        Stores the sequences and labels on the instance.

        X is expected to be an array of strings, containing nucleotides.
        Y is expected to be an array of labels, label from the filename.
        M is the row width used by to_integer(); defaults to the length of
        the longest sequence in X (0 when X is empty).

        Sets self.X, self.Y_raw, self.le (fitted LabelEncoder), self.Y
        (integer-encoded labels) and self.M.
        """
        self.X = X
        self.Y_raw = Y
        self.le = preprocessing.LabelEncoder()
        self.Y = self.le.fit_transform(self.Y_raw)
        if M is not None:
            self.M = M
        else:
            self.M = max(map(len, self.X), default=0)

    def to_integer(self):
        """
        Encodes every sequence as integers: A=1, C=2, G=3, T=4, anything
        else (and padding) = 0.

        Returns:
        X: ND-Array of size NxM, N-> Number of sequences in X. M defaults to
        the length of the longest sequence encountered. Can be specified in
        xandy() otherwise.
        If a nucleotide falls beyond position M, a message is printed and
        None is returned.
        """
        codes = {"A": 1, "C": 2, "G": 3, "T": 4}
        rows = []
        for seq in self.X:
            row = [0] * self.M
            for pos, ch in enumerate(seq):
                code = codes.get(ch)
                if code is None:
                    continue
                if pos >= self.M:
                    print("M not sufficiently Large.")
                    return None
                row[pos] = code
            rows.append(row)
        return np.asarray(rows)

    def train(self, X, Y, feature_name, k):
        """
        Fits six classifiers on (X, Y) and cross-validates each of them.

        X: feature matrix (N x features).
        Y: integer labels of length N.
        feature_name: currently unused; kept for interface compatibility.
        k: number of folds; repeated 10 times via RepeatedKFold.

        Returns:
        models: fitted estimators, in the order Linear Discriminant,
                Linear SVM, Quadratic SVM, Subspace discriminant, KNN,
                Subspace KNN.
        scores: dict mapping each of those names to the dict returned by
                cross_validate (accuracy scoring).
        """
        Xtrain = X
        Ytrain = np.asarray(Y)

        scaler = StandardScaler()
        # Scale once and reuse; refitting the scaler on the same data gives
        # the same result every time.
        # NOTE(review): the scaler is fit on all rows before cross-validation,
        # so held-out folds influence the scaling - consider a Pipeline.
        Xscaled = scaler.fit_transform(Xtrain)

        kfold = RepeatedKFold(n_splits=k, n_repeats=10)

        c = 1
        lda = LinearDiscriminantAnalysis().fit(Xtrain, Ytrain)
        svm_linear = svm.SVC(kernel='linear', C=c, probability=True).fit(Xscaled, Ytrain)
        svm_quad = svm.SVC(kernel='poly', degree=2, C=c, probability=True).fit(Xscaled, Ytrain)
        knn = KNeighborsClassifier(n_neighbors=4).fit(Xscaled, Ytrain)

        # The wrapped estimator is passed positionally because its keyword
        # was renamed from `base_estimator` to `estimator` in newer
        # scikit-learn releases; the position is the same in both.
        subspace_lda = BaggingClassifier(
            LinearDiscriminantAnalysis(),
            n_estimators=30, max_features=(1/30), bootstrap=False,
        ).fit(Xtrain, Ytrain)
        subspace_knn = BaggingClassifier(
            KNeighborsClassifier(n_neighbors=11),
            n_estimators=30, max_features=(1/30), bootstrap=False,
        ).fit(Xscaled, Ytrain)

        # Names and models are kept as pairs so they cannot drift apart
        # (previously `knn` was listed twice, shifting every later name
        # onto the wrong model and leaving subspace_knn unevaluated).
        named_models = [
            ('Linear Discriminant', lda),
            ('Linear SVM', svm_linear),
            ('Quadratic SVM', svm_quad),
            ('Subspace discriminant', subspace_lda),
            ('KNN', knn),
            ('Subspace KNN', subspace_knn),
        ]
        models = [model for _, model in named_models]

        scores = {}
        for model_name, model in named_models:
            score = cross_validate(model, Xscaled, Ytrain.ravel(), cv=kfold, scoring='accuracy')
            print( model_name , ' ' , score )
            scores[model_name] = score
        return models, scores

    def to_proba(self):
        """
        Returns:
        List of N rows with 4 values each. The ith row contains the relative
        frequency of the nucleotides A, C, G, T (in that order) in the ith
        sequence. An empty sequence yields a row of zeros.
        """
        X_ = []
        for seq in self.X:
            n = len(seq)
            if n == 0:
                # Avoid dividing by zero; no nucleotide was observed.
                X_.append([0.0, 0.0, 0.0, 0.0])
                continue
            X_.append([seq.count(base) / n for base in "ACGT"])
        return X_

    def kmer_proba(self, k):
        """
        k: length of kmer

        Returns:
        ND array of size N x 4^k holding, for every sequence, the number of
        occurrences of each k-mer in the order produced by kmers(k).
        The counts are raw (not normalized) and come from str.count, which
        counts non-overlapping occurrences.
        """
        Kmers = kmers(k)
        X_ = [[seq.count(km) for km in Kmers] for seq in self.X]
        return np.array(X_).reshape((-1, 4**k))

    def main(self, dnadir="/home/naylak/Documents/test1_maxcliq_rep/", k=3):
        """
        Runs the k-mer experiment: read the data, build k-mer count
        features and cross-validate all models with 10 folds.

        dnadir: directory holding the ``*.txt`` sequence files.
        k: k-mer length.
        """
        X, Y = self.read_data(dnadir)
        self.xandy(X, Y)
        X = self.kmer_proba(k)
        self.train(X, self.Y, "k", 10)


In [17]:
# Run the experiment: main() reads the data, builds the features and prints
# the cross-validation scores of every model.
a = DNA2Neumerical()
a.main()


Linear Discriminant   {'fit_time': array([0.02476621, 0.05531526, 0.055022  , 0.05206895, 0.05050421,
       0.07253766, 0.05395889, 0.05649805, 0.053653  , 0.05160165,
       0.05164623, 0.05473375, 0.07341862, 0.05210996, 0.06198907,
       0.11860442, 0.06599236, 0.05780125, 0.05282879, 0.07955313,
       0.05325532, 0.05631137, 0.05568504, 0.05635285, 0.07968187,
       0.05569553, 0.05341506, 0.06021333, 0.05406547, 0.07033062,
       0.08790612, 0.06613445, 0.08116102, 0.06690502, 0.08456087,
       0.05720282, 0.05561805, 0.07184935, 0.05830336, 0.0576365 ,
       0.05612469, 0.05635715, 0.06026673, 0.07551885, 0.09724951,
       0.06207061, 0.07608819, 0.05684352, 0.05855322, 0.09273601,
       0.0599463 , 0.0539434 , 0.05549741, 0.05796123, 0.07240152,
       0.05735779, 0.0535388 , 0.05849528, 0.05506206, 0.08169913,
       0.05972695, 0.08218694, 0.05278492, 0.05931497, 0.07358456,
       0.05845404, 0.0570538 , 0.09060311, 0.06332088, 0.06262231,
       0.05690002, 0.052908

In [7]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
import pandas as pd
import numpy as np
from scipy.fft import fft
from sklearn.manifold import MDS
from sklearn.model_selection import train_test_split
import plotly.express as px

from sklearn.model_selection import KFold

from sklearn.model_selection import cross_validate

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import BaggingClassifier

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

import sklearn
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import classification_report
import os

class DNA2Neumerical:
    """
    Turns DNA sequences read from text files into numerical feature
    matrices and cross-validates a fixed set of classifiers on them.

    Typical use: read_data() -> xandy() -> to_integer() or to_proba()
    -> train().
    """

    def read_data(self, dnadir):
        """
        Reads every ``*.txt`` file in ``dnadir`` and returns a list of DNA
        sequences and a list of labels.

        Each non-empty line is one sequence; a leading ">" is dropped. The
        label of a line is the part of its file name before the first "_".
        Files are visited in sorted name order so the output is reproducible.

        Returns:
        X: list of sequence strings.
        Y: list of label strings, same length as X.
        """
        X = []
        Y = []
        for filename in sorted(os.listdir(dnadir)):
            if not filename.endswith(".txt"):
                continue
            label = filename.split("_")[0]
            # os.path.join works whether or not dnadir ends with a separator.
            with open(os.path.join(dnadir, filename)) as f:
                for line in f:
                    # Strip only the newline: the last line of a file may not
                    # have one, and slicing [:-1] would drop a real base.
                    text = line.rstrip("\n")
                    seq = text[1:] if text.startswith(">") else text
                    if not seq:
                        # Blank lines carry no sequence.
                        continue
                    X.append(seq)
                    Y.append(label)
        return X, Y

    def xandy(self, X, Y, M=None):
        """
        Stores the sequences and labels on the instance.

        X is expected to be an array of strings, containing nucleotides.
        Y is expected to be an array of labels, label from the filename.
        M is the row width used by to_integer(); defaults to the length of
        the longest sequence in X (0 when X is empty).

        Sets self.X, self.Y_raw, self.le (fitted LabelEncoder), self.Y
        (integer-encoded labels) and self.M.
        """
        self.X = X
        self.Y_raw = Y
        self.le = preprocessing.LabelEncoder()
        self.Y = self.le.fit_transform(self.Y_raw)
        if M is not None:
            self.M = M
        else:
            self.M = max(map(len, self.X), default=0)

    def to_integer(self):
        """
        Encodes every sequence as integers: A=1, C=2, G=3, T=4, anything
        else (and padding) = 0.

        Returns:
        X: ND-Array of size NxM, N-> Number of sequences in X. M defaults to
        the length of the longest sequence encountered. Can be specified in
        xandy() otherwise.
        If a nucleotide falls beyond position M, a message is printed and
        None is returned.
        """
        codes = {"A": 1, "C": 2, "G": 3, "T": 4}
        rows = []
        for seq in self.X:
            row = [0] * self.M
            for pos, ch in enumerate(seq):
                code = codes.get(ch)
                if code is None:
                    continue
                if pos >= self.M:
                    print("M not sufficiently Large.")
                    return None
                row[pos] = code
            rows.append(row)
        return np.asarray(rows)

    def train(self, X, Y, feature_name, k):
        """
        Fits six classifiers on (X, Y) and cross-validates each of them.

        X: feature matrix (N x features).
        Y: integer labels of length N.
        feature_name: currently unused; kept for interface compatibility.
        k: number of folds; repeated 10 times via RepeatedKFold.

        Returns:
        models: fitted estimators, in the order Linear Discriminant,
                Linear SVM, Quadratic SVM, Subspace discriminant, KNN,
                Subspace KNN.
        scores: dict mapping each of those names to the dict returned by
                cross_validate (accuracy scoring).
        """
        Xtrain = X
        Ytrain = np.asarray(Y)

        scaler = StandardScaler()
        # Scale once and reuse; refitting the scaler on the same data gives
        # the same result every time.
        # NOTE(review): the scaler is fit on all rows before cross-validation,
        # so held-out folds influence the scaling - consider a Pipeline.
        Xscaled = scaler.fit_transform(Xtrain)

        kfold = RepeatedKFold(n_splits=k, n_repeats=10)

        c = 1
        lda = LinearDiscriminantAnalysis().fit(Xtrain, Ytrain)
        svm_linear = svm.SVC(kernel='linear', C=c, probability=True).fit(Xscaled, Ytrain)
        svm_quad = svm.SVC(kernel='poly', degree=2, C=c, probability=True).fit(Xscaled, Ytrain)
        knn = KNeighborsClassifier(n_neighbors=4).fit(Xscaled, Ytrain)

        # The wrapped estimator is passed positionally because its keyword
        # was renamed from `base_estimator` to `estimator` in newer
        # scikit-learn releases; the position is the same in both.
        subspace_lda = BaggingClassifier(
            LinearDiscriminantAnalysis(),
            n_estimators=30, max_features=(1/30), bootstrap=False,
        ).fit(Xtrain, Ytrain)
        subspace_knn = BaggingClassifier(
            KNeighborsClassifier(n_neighbors=11),
            n_estimators=30, max_features=(1/30), bootstrap=False,
        ).fit(Xscaled, Ytrain)

        # Names and models are kept as pairs so they cannot drift apart
        # (previously `knn` was listed twice, shifting every later name
        # onto the wrong model and leaving subspace_knn unevaluated).
        named_models = [
            ('Linear Discriminant', lda),
            ('Linear SVM', svm_linear),
            ('Quadratic SVM', svm_quad),
            ('Subspace discriminant', subspace_lda),
            ('KNN', knn),
            ('Subspace KNN', subspace_knn),
        ]
        models = [model for _, model in named_models]

        scores = {}
        for model_name, model in named_models:
            score = cross_validate(model, Xscaled, Ytrain.ravel(), cv=kfold, scoring='accuracy')
            print( model_name , ' ' , score )
            scores[model_name] = score
        return models, scores

    def to_proba(self):
        """
        Returns:
        List of N rows with 4 values each. The ith row contains the relative
        frequency of the nucleotides A, C, G, T (in that order) in the ith
        sequence. An empty sequence yields a row of zeros.
        """
        X_ = []
        for seq in self.X:
            n = len(seq)
            if n == 0:
                # Avoid dividing by zero; no nucleotide was observed.
                X_.append([0.0, 0.0, 0.0, 0.0])
                continue
            X_.append([seq.count(base) / n for base in "ACGT"])
        return X_

    def main(self, dnadir="/home/naylak/Documents/test1_maxcliq_rep/"):
        """
        Runs the experiments: cross-validate all models on the integer
        encoding and on the nucleotide frequencies, then print a hold-out
        classification report for a quadratic SVM on the frequencies.

        dnadir: directory holding the ``*.txt`` sequence files.
        """
        X, Y = self.read_data(dnadir)
        self.xandy(X, Y)
        X = self.to_integer()
        self.train(X, self.Y, "DNA", 10)
        X = self.to_proba()
        self.train(X, self.Y, "DNA", 10)

        # Hold out 30% of the frequency features and evaluate a quadratic SVM.
        Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, self.Y, test_size=0.3, random_state=42)
        print("________________________________________________SVM_Quadratic Model_____________________________________________________________________________")
        svm_quad = svm.SVC(kernel='poly',degree=2, C=1,probability=True)
        svm_quad.fit(Xtrain,Ytrain)
        Ypred = svm_quad.predict(Xtest)
        print(classification_report(Ytest, Ypred))