<a href="https://colab.research.google.com/github/AmrithaPraveen/ASDScreener_ScienceREACH/blob/main/GutMicrobiome_Classifiers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Import Libraries**

**Import the required Libraries**

In [25]:
#Pandas and visualization libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#Classification models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LogisticRegression
#Neural Networks
from keras.layers import Input, Dense
from keras.models import Model

#Feature reduction models
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

#Cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle

## **Pre-Processing**

**Read the downloaded OTU abundance file**

> The CSV file is expected in the notebook's working directory.



In [17]:
# Load the OTU table: one row per OTU, with its taxonomy string and one
# abundance column per sample (see the preview below).
otu_table_path = 'GSE113690_Autism_16S_rRNA_OTU_assignment_and_abundance.csv'
abundance_data = pd.read_csv(otu_table_path)
abundance_data.head()

Unnamed: 0,OTU,taxonomy,A1,A10,A100,A101,A102,A104,A105,A106,...,B52,B54,B55,B56,B57,B58,B59,B6,B60,B61
0,OTU1,d__Bacteria;_k__norank;_p__Firmicutes;_c__Clos...,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,OTU2,d__Bacteria;_k__norank;_p__Proteobacteria;_c__...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,OTU3,d__Bacteria;_k__norank;_p__Firmicutes;_c__Erys...,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
3,OTU4,d__Bacteria;_k__norank;_p__Firmicutes;_c__Baci...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,OTU5,d__Bacteria;_k__norank;_p__Tenericutes;_c__Mol...,0,0,1,0,0,1,0,3,...,0,0,0,0,0,0,0,0,0,0


**Transpose the data so that OTU is the feature**

In [18]:
# Keep the OTU -> taxonomy lookup on its own so the abundance matrix is purely numeric.
taxa = abundance_data.loc[:, ['OTU', 'taxonomy']].set_index('OTU')

# Re-orient the table so each row is a sample and each column is an OTU feature.
otu_by_sample = abundance_data.drop(columns='taxonomy').set_index('OTU')
abundance_taxa = otu_by_sample.T

abundance_taxa.head()

OTU,OTU1,OTU2,OTU3,OTU4,OTU5,OTU6,OTU7,OTU8,OTU9,OTU10,...,OTU1329,OTU1330,OTU1331,OTU1332,OTU1333,OTU1334,OTU1335,OTU1336,OTU1337,OTU1338
A1,0,0,0,0,0,1,0,0,50,0,...,1,0,1,0,0,0,0,0,0,0
A10,0,0,0,0,0,0,0,0,10,2,...,5,0,0,0,0,0,0,0,0,0
A100,0,0,0,0,1,0,0,485,13,0,...,0,0,0,0,0,0,0,0,0,0
A101,0,0,0,0,0,0,1,142,17,0,...,0,1,0,0,0,0,0,0,0,0
A102,0,0,0,0,0,0,0,1,9,0,...,1,0,0,0,0,0,0,0,0,0


**Encode the labels from the sample ID prefix: IDs starting with "A" are ASD (1), all others are non-ASD (0)**

In [19]:
# Sample IDs are the row labels of the transposed table.
abundance_list = list(abundance_taxa.index)

# Target vector: 1 when the sample ID begins with 'A', otherwise 0.
y = np.array([int(sample_id.startswith('A')) for sample_id in abundance_list])

# Feature matrix: samples x OTUs.
x = abundance_taxa

## **Dimensionality Reduction**

**Principal Component Analysis: Feature Extraction**

In [21]:
# Project the OTU features onto their first 8 principal components.
# random_state is fixed because PCA's default solver selection may use a
# randomized SVD, which would otherwise make the components vary between runs.
pca = PCA(n_components=8, random_state=7)
# fit_transform both fits and projects, so a separate pca.fit(x) is not needed.
x_pca = pca.fit_transform(x)

**t-Distributed Stochastic Neighbor Embedding (t-SNE): Feature Extraction**

In [22]:
# Embed the samples with t-SNE using the Jaccard distance between samples.
# t-SNE is stochastic, so the seed is fixed to keep the embedding (and every
# score computed from it) reproducible across runs.
# NOTE(review): Jaccard is a boolean dissimilarity; confirm that treating the
# counts as presence/absence is the intended behaviour.
tsne = TSNE(metric='jaccard', perplexity=30.0, random_state=7)
# t-SNE is unsupervised, so the labels are not passed.
x_tsne = tsne.fit_transform(x)



**Autoencoder (Neural Network): Feature Extraction**

In [23]:
# Binary cross-entropy expects targets in [0, 1], so each OTU's counts are
# scaled by that OTU's maximum before training (raw counts are unbounded).
x_counts = x.to_numpy(dtype='float32')
otu_max = x_counts.max(axis=0)
otu_max[otu_max == 0] = 1.0  # OTUs that are zero in every sample stay zero
x_scaled = x_counts / otu_max

# Single-hidden-layer autoencoder: n_OTUs -> 23 -> n_OTUs.
input_layer = Input(shape=(x_scaled.shape[1],))
encoded = Dense(23, activation='relu')(input_layer)
# Sigmoid reconstructs every OTU independently in [0, 1]; softmax would force
# the reconstructed features to sum to 1, which does not match the targets.
decoded = Dense(x_scaled.shape[1], activation='sigmoid')(encoded)
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

# The autoencoder reconstructs its own input, so inputs and targets are the
# same matrix; 20% of the samples are held out to monitor validation loss.
X1, X2, Y1, Y2 = train_test_split(x_scaled, x_scaled, test_size=0.2, random_state=101)

autoencoder.fit(X1, Y1,
                epochs=100,
                batch_size=300,
                shuffle=True,
                verbose = 0,
                validation_data=(X2, Y2))

# The encoder half shares the trained weights; its 23-unit output is the
# reduced feature set for every sample.
encoder = Model(input_layer, encoded)
x_ae = encoder.predict(x_scaled)




## **Classification Models**

**Create the Classification Models for evaluation**

In [43]:
# All three estimators are stochastic (bootstrap sampling in the forest, data
# shuffling in the saga solver), so each gets a fixed seed to make the
# cross-validation scores reproducible.
elastic_net_classifier = LogisticRegression(penalty='elasticnet', solver='saga',l1_ratio=0.1, max_iter=1000, random_state=7)
lasso_classifier = LogisticRegression(penalty='l1', solver='saga', max_iter=1000, random_state=7)

# (display name, estimator) pairs evaluated on every feature set.
models=[]
models.append(('Random Forest: ', RandomForestClassifier(random_state=7)))
models.append(('Elastic Net Regression',elastic_net_classifier))
models.append(('Lasso Regression',lasso_classifier))

**Collect the feature sets to evaluate (original features and the three reduced representations)**

In [41]:
# (display name, feature matrix) pairs: the raw OTU table followed by each
# reduced representation computed above.
featureselection = [
    ('Original', x),
    ('PCA', x_pca),
    ('t-SNE', x_tsne),
    ('Auto Encoders', x_ae),
]


## **Evaluate Models**

**Run the KFold cross validation**

In [42]:
# Metric label written to the results table -> scikit-learn scorer name.
scorers = {
  'F1-score': 'f1_weighted',
  'accuracy': 'accuracy',
  'precision': 'precision',
  'recall': 'recall',
}

# The same seeded 5-fold split is reused for every feature set and model so
# that all scores are computed on identical folds.
kfold = KFold(n_splits=5, random_state=7, shuffle=True)

# results: one [feature set, model, metric, mean %, SD %] row per metric.
# trials:  one [feature set, model, fold number, accuracy] row per fold.
results = []
trials = []
# NOTE(review): the reduced feature sets were fitted on all samples before
# this loop, so held-out folds are not fully independent of the reduction step.
for fname, x_values in featureselection:
  for name, model in models:
    # One cross-validation pass scores every metric on the same fitted models,
    # so the per-fold accuracies in `trials` match the accuracy summary row.
    scores = cross_validate(model, x_values, y, cv=kfold, scoring=list(scorers.values()))
    for label, scorer in scorers.items():
      fold_scores = scores['test_' + scorer]
      results.append([fname, name, label, fold_scores.mean()*100, fold_scores.std()*100])
    for trial, fold_accuracy in enumerate(scores['test_accuracy'], start=1):
      trials.append([fname, name, trial, fold_accuracy])




**Export the results to a csv file**

In [None]:
# Per-fold accuracies, one row per (feature set, model, fold).
trial_columns = ['Test Name','Name','Trial','Accuracy']
trials_df = pd.DataFrame(trials, columns=trial_columns)
trials_df.to_csv('trials.csv')

# Aggregated scores per (feature set, model, metric).
# NOTE(review): despite the 'F1 Score' headers, the last two columns hold the
# mean/SD of whichever metric is named in 'Accuracy Metric'.
summary_columns = ['Test Name','Name','Accuracy Metric','F1 Score Mean','F1 Score SD']
results_df = pd.DataFrame(results, columns=summary_columns)
results_df.to_csv('results_df.csv')