<a href="https://colab.research.google.com/github/DockingBlade/Gene-Sequencing-/blob/main/Deep_Gene_Sequencing_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import preprocessing as sklpp
from sklearn import decomposition as skldecomp 
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
import torch
from numpy import array, newaxis, expand_dims

In [2]:
# Load the dataset from 'splice.csv' in the working directory. Later cells
# expect a 'Label' column plus nucleotide columns named '1' through '60'.
df = pd.read_csv('splice.csv')

In [3]:
# A DataFrame's shape is (rows, columns): one row per sample, one column per attribute.
numDataSamples, numAttributes = df.shape

print('The number of Data Samples is ', numDataSamples)
print(
    'The number of Attributes is ',
    numAttributes,
    ' but the Donor variable is not relevant to classification, since it is an identifier for the sample and does not have information relevant to the sample so there are really ',
    numAttributes - 1,
)
print(
    'attributes with 1 attribute as the Label or Dependent variable and ',
    numAttributes - 2,
    ' independent variables, in this case nucleotides',
)

# (The nucleotide values themselves are validated in the pre-processing cell below.)

The number of Data Samples is  3190
The number of Attributes is  62  but the Donor variable is not relevant to classification, since it is an identifier for the sample and does not have information relevant to the sample so there are really  61
attributes with 1 attribute as the Label or Dependent variable and  60  independent variables, in this case nucleotides


In [4]:
#Pre-Processing
# NOTE: this cell rewrites `df` in place. Re-running it without reloading the
# CSV would treat the already-encoded integers as invalid and drop every row.

# Column names and the integer code assigned to each valid symbol.
nucleotide_columns = [str(position) for position in range(1, 61)]
nucleotide_codes = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
label_codes = {'IE': 0, 'EI': 1, 'N': 2}

# Checking that all the nucleotide values are valid: anything other than
# A/C/G/T is blanked out so the affected rows can be dropped below.
for column in nucleotide_columns:
    df.loc[~df[column].isin(list(nucleotide_codes)), column] = pd.NA

if df.isnull().values.any():
    print('Dataframe contains invalid values in the nucleotide sequences')
else:
    print('All entries in the nucleotide sequences are valid.')

df = df.dropna()

print('So in the dataset set description it mentions that nucleotides with abigious values were given different character values than A,G,C, and T. Thus I decided to remove these rows from the dataset, and now there are ', df.shape[0], ' samples remaining')

# Same validation for the class labels.
df.loc[~df['Label'].isin(list(label_codes)), 'Label'] = pd.NA

if df.isnull().values.any():
    print('Dataframe contains invalid values in the Labels')
    # Rows with an unknown label cannot be encoded; drop them so that no
    # missing value reaches the integer encoding below.
    df = df.dropna()
else:
    print('All entries in the Labels are now valid.')

# Label encoding the nucleotides. An explicit map + astype yields int64
# columns without relying on Series.replace silently downcasting object data.
for column in nucleotide_columns:
    df[column] = df[column].map(nucleotide_codes).astype('int64')

df['Label'] = df['Label'].map(label_codes).astype('int64')

Dataframe contains invalid values in the nucleotide sequences
So in the dataset set description it mentions that nucleotides with abigious values were given different character values than A,G,C, and T. Thus I decided to remove these rows from the dataset, and now there are  3175  samples remaining
All entries in the Labels are now valid.


In [5]:
#Feature Learning - PCA

# Positional selection: columns 2-61 are used as the features and column 0 as
# the target (assumes the CSV column order is Label, Donor, '1'..'60' - TODO
# confirm against the file). Indexing with the list [0] keeps `labels` 2-D,
# which the later cells rely on when they transpose and flatten it.
nucleotides = df.iloc[:, 2:62].to_numpy()
labels = df.iloc[:, [0]].to_numpy()

# Centre the features without variance scaling, then keep as many principal
# components as are needed to explain 95% of the variance.
mean_datascaler = sklpp.StandardScaler(with_mean=True, with_std=False)
data_pca = skldecomp.PCA(n_components=0.95, svd_solver='full')

# NOTE(review): the split is shuffled without a random_state, so the split and
# the number of retained components can differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(nucleotides, labels, test_size=0.20, shuffle=True)

# Fit the centring and the projection on the training split only.
X_train = data_pca.fit_transform(mean_datascaler.fit_transform(X_train))

# Apply the already-fitted centring and projection to the held-out split,
# instead of re-implementing them with a per-row loop and manual transposes.
X_test = data_pca.transform(mean_datascaler.transform(X_test))

print('Shape of X_train,\n', X_train.shape)




Shape of X_train,
 (2540, 56)


In [15]:
class Net(torch.nn.Module):
  """Classifier made of five 1x1 Conv1d layers followed by five Linear layers.

  Input is expected as (batch, n_features, length). The flattened
  convolution output must have 1024 values per sample, which holds for the
  length-1 inputs used in this notebook. Output is (batch, 3), one score per
  class.

  Args:
    n_features: number of input channels, i.e. features per sample.
  """

  def __init__(self, n_features):
    super().__init__()
    # The first layer is sized from n_features rather than a hard-coded 56,
    # so the network still fits when PCA keeps a different number of components.
    self.lin_1 = torch.nn.Conv1d(n_features, 64, kernel_size=1, stride=2)
    self.lin_2 = torch.nn.Conv1d(64, 128, kernel_size=1, stride=2)
    self.lin_3 = torch.nn.Conv1d(128, 256, kernel_size=1, stride=2)
    self.lin_4 = torch.nn.Conv1d(256, 512, kernel_size=1, stride=2)
    self.lin_5 = torch.nn.Conv1d(512, 1024, kernel_size=1, stride=2)

    self.lin_6 = torch.nn.Linear(1024, 512)
    self.lin_7 = torch.nn.Linear(512, 256)
    self.lin_8 = torch.nn.Linear(256, 128)
    self.lin_9 = torch.nn.Linear(128, 64)
    self.lin_10 = torch.nn.Linear(64, 3)

    self.dropout = torch.nn.Dropout(0.2)

  def forward(self, x):
    """Return tanh-squashed class scores of shape (batch, 3) for input x."""
    # Convolutional stack, tanh after every layer.
    for conv in (self.lin_1, self.lin_2, self.lin_3, self.lin_4, self.lin_5):
      x = torch.tanh(conv(x))

    # Flatten (batch, 1024, length) to (batch, 1024 * length) for the dense layers.
    x = x.view(x.size()[0], -1)

    # Dense stack: dropout is applied to each layer's output before tanh.
    # NOTE(review): this includes the final 3-way layer, so the scores handed
    # to the loss are dropped-out and bounded to [-1, 1]; kept as in the
    # original to preserve the reported results.
    for dense in (self.lin_6, self.lin_7, self.lin_8, self.lin_9, self.lin_10):
      x = torch.tanh(self.dropout(dense(x)))
    return x

In [7]:
# Wrap the NumPy feature splits as tensors.
X_train, X_test = torch.from_numpy(X_train), torch.from_numpy(X_test)
# Transpose the label arrays before wrapping them; with the (N, 1) label
# columns produced upstream this gives (1, N) tensors, as the printout shows.
y_train = torch.from_numpy(y_train.T)
y_test = torch.from_numpy(y_test.T)
print(y_train)
print(y_test)

tensor([[2, 1, 0,  ..., 2, 2, 2]])
tensor([[2, 2, 2, 2, 1, 0, 0, 2, 2, 2, 2, 1, 2, 2, 0, 2, 2, 0, 2, 0, 2, 0, 1, 0,
         2, 2, 1, 2, 2, 0, 0, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 0,
         2, 0, 2, 2, 1, 0, 1, 1, 1, 2, 2, 0, 2, 2, 2, 2, 0, 1, 2, 0, 2, 0, 2, 2,
         2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 0, 2, 1, 1, 2, 1, 1, 0, 2, 2, 0, 2, 2,
         0, 0, 0, 0, 2, 2, 1, 2, 2, 0, 0, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2,
         2, 2, 1, 2, 0, 2, 0, 2, 0, 1, 2, 0, 1, 2, 2, 2, 0, 2, 1, 0, 0, 2, 0, 1,
         0, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 1, 0, 0, 0, 1, 2, 2, 2, 1, 2, 1, 1, 2,
         0, 1, 0, 0, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 1, 0, 2, 0, 2, 1, 1,
         1, 2, 0, 2, 2, 0, 2, 1, 2, 2, 1, 1, 0, 2, 0, 0, 0, 2, 1, 1, 2, 2, 0, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 0, 2, 1, 2, 0, 2, 0, 1,
         1, 2, 2, 2, 2, 2, 0, 0, 2, 1, 0, 1, 0, 1, 0, 1, 2, 2, 0, 1, 1, 1, 2, 2,
         2, 1, 2, 2, 1, 0, 2, 1, 2, 2, 1, 0, 0, 0, 2, 2, 1, 2, 1, 0, 2, 0,

In [8]:
# Collapse the label tensors to 1-D vectors of class indices.
y_train, y_test = y_train.flatten(), y_test.flatten()


In [16]:
# Size the network from the feature dimension of the training data.
# NOTE(review): no torch seed is set in the visible cells, so the initial
# weights differ between runs.
net = Net(n_features=X_train.shape[1])
# Multi-class cross-entropy objective, optimised with Adam.
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=net.parameters(), lr=0.001)


In [10]:
# Prefer the first CUDA device when one is available, otherwise use the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# NOTE(review): Tensor.to does not modify a tensor in place; it returns the
# (possibly new) tensor, and the results are discarded here, so these four
# tensors are left where they were. The model is not moved either. The
# training and evaluation cells call np.array(X_train) / np.array(X_test),
# which needs CPU tensors, so rebinding the results here would also require
# changing those cells.
for _tensor in (X_train, X_test, y_train, y_test):
  _tensor.to(device)

In [17]:
# Cast the parameters to double precision so they match the input tensors
# (presumably float64 coming out of scikit-learn - TODO confirm), then switch
# to training mode so that dropout is active.
net.double()
net.train()

# Conv1d expects (batch, channels, length). Add the trailing length-1 axis
# once, rather than converting tensor -> NumPy -> tensor on every epoch.
train_inputs = X_train.unsqueeze(-1)

# Full-batch training: one optimiser step per epoch over the whole training set.
for epoch in range(60):
  print(epoch)
  pred = net(train_inputs)
  train_loss = loss(pred, y_train)
  optimizer.zero_grad()
  train_loss.backward()
  optimizer.step()

0




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59


In [18]:
# Evaluation mode disables dropout so the predictions are deterministic.
net.eval()

# No gradients are needed for inference. The trailing length-1 axis gives the
# (batch, channels, length) layout that the Conv1d layers expect.
with torch.no_grad():
  test_scores = net(X_test.unsqueeze(-1))
# Predicted class = index of the largest score in each row.
pred = torch.max(test_scores, 1)[1]

# Compute each metric once and reuse it for both the F score and the report.
# NOTE: with micro averaging on a single-label multi-class problem, precision,
# recall and F score all coincide with accuracy.
Recall = recall_score(y_test, pred, average='micro')
Precision = precision_score(y_test, pred, average='micro')
# Guard the harmonic mean against 0/0 when nothing is predicted correctly.
if (Recall + Precision) > 0:
  F_score = (2 * Recall * Precision) / (Recall + Precision)
else:
  F_score = 0.0

print('The accuracy for deep learning is ' , accuracy_score(y_test, pred))
print('The confusion matrix for deep learning is \n', confusion_matrix(y_test, pred, normalize='true'))
print('The precision for deep learning is ', Precision)
print('The recall for deep learning is ', Recall)
print('The F score for deep learning is ', F_score)



The accuracy for deep learning is  0.8346456692913385
The confusion matrix for deep learning is 
 [[0.82352941 0.06535948 0.11111111]
 [0.06451613 0.86451613 0.07096774]
 [0.11009174 0.06422018 0.82568807]]
The precision for deep learning is  0.8346456692913385
The recall for deep learning is  0.8346456692913385
The F score for deep learning is  0.8346456692913384


Deep learning ended up performing just as well as the other methods, but no better. I tried to expand the size of the network, but this caused the network to label all the gene sequences as Neither (the largest class). I had the same issue before I changed the activation function from ReLU to tanh. My theory is that, since the outputs and the inputs consist entirely of positive and zero elements, the network was learning mostly positive values for all the parameters, and thus ReLU was minimally effective at creating non-linearities.