In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import LeavePOut
from sklearn.model_selection import StratifiedShuffleSplit

# Importing Data

Below, Dataset 7 and Dataset 8 are loaded. The information for the chemicals that were used to treat the 3D neural constructs is also loaded. Dataset 7 contains normalized gene expression (TPM) for day 16 neural constructs after dosing with toxic or non-toxic chemicals (two days of exposure). Dataset 8 contains normalized gene expression (TPM) for day 21 neural constructs after dosing with toxic or non-toxic chemicals (seven days of exposure). The 3D neural constructs were grown for 14 days before chemical exposure.

In [19]:
# Read the day-21 gene-expression CSV into a DataFrame and show it as the cell output.
# NOTE(review): the path is absolute and tied to one machine; it must be edited to run elsewhere.
day_21_csv_path = '/Users/danielaquijano/Documents/GitHub/Machine-Learning-Course-Projects/Final_Project_Neurotoxicity_Prediction/Final_Project_Files/Day_21_RNA_seq.csv'
day_21_df = pd.read_csv(day_21_csv_path)
day_21_df

Unnamed: 0,Gene.ID,Transcript_ID,day 21 b1a,day 21 b1b,day 21 b2a,day 21 b2b,day 21 b3a,day 21 b3b,day 21 b4a,day 21 b4b,...,day 21 t30a,day 21 t30b,day 21 t31a,day 21 t31b,day 21 t32a,day 21 t32b,day 21 t33a,day 21 t33b,day 21 t34a,day 21 t34b
0,A1BG,NM_130786.1,8.21,7.16,19.99,20.77,14.20,9.60,12.10,12.71,...,13.54,11.81,7.19,11.88,9.75,7.73,17.22,15.17,20.44,10.83
1,A1CF,"NM_001198818.1,NM_001198819.1,NM_001198820.1,N...",0.00,0.00,0.00,0.00,0.16,0.14,0.00,0.00,...,0.04,0.09,0.32,0.07,0.00,0.03,0.05,0.09,0.00,0.05
2,A2LD1,"NM_001195087.1,NM_033110.1",1.65,3.56,3.17,1.49,3.30,6.32,5.98,4.55,...,3.72,2.39,3.06,4.17,2.59,3.65,3.53,5.18,3.04,3.00
3,A2M,NM_000014.1,314.44,321.66,77.28,80.94,140.52,146.63,169.19,93.49,...,147.16,152.41,191.36,213.36,127.01,169.79,160.04,153.30,262.29,288.09
4,A2ML1,NM_144670.1,0.55,0.24,0.25,0.00,0.15,0.15,0.10,0.43,...,0.00,0.18,0.14,0.00,0.14,0.12,0.38,0.38,0.33,0.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19079,ZYG11A,NM_001004339.1,1.12,0.48,0.15,0.31,0.22,1.49,1.60,1.71,...,0.93,0.31,1.22,0.64,0.74,0.65,0.76,0.64,1.76,1.46
19080,ZYG11B,NM_024646.1,42.05,42.98,42.33,42.06,38.06,41.21,34.66,38.96,...,44.10,44.67,49.90,45.38,25.03,50.70,26.39,37.47,25.52,34.48
19081,ZYX,"NM_001010972.1,NM_003461.1",79.69,55.43,87.20,74.56,81.82,74.56,78.07,72.44,...,89.29,80.73,87.67,65.17,99.46,72.56,107.69,76.56,96.67,82.75
19082,ZZEF1,NM_015113.1,17.13,19.52,20.60,17.02,16.28,17.34,21.52,18.34,...,19.85,20.14,23.31,19.60,11.73,17.27,12.38,15.62,17.18,19.80


In [21]:
# Read the day-16 gene-expression CSV into a DataFrame and show it as the cell output.
# NOTE(review): the path is absolute and tied to one machine; it must be edited to run elsewhere.
day_16_csv_path = '/Users/danielaquijano/Documents/GitHub/Machine-Learning-Course-Projects/Final_Project_Neurotoxicity_Prediction/Final_Project_Files/Day_16_RNA_seq.csv'
day_16_df = pd.read_csv(day_16_csv_path)
day_16_df

Unnamed: 0,Gene.ID,Transcript_ID,day 16 b1a,day 16 b1b,day 16 b2a,day 16 b2b,day 16 b3a,day 16 b3b,day 16 b4a,day 16 b4b,...,day 16 t30a,day 16 t30b,day 16 t31a,day 16 t31b,day 16 t32a,day 16 t32b,day 16 t33a,day 16 t33b,day 16 t34a,day 16 t34b
0,A1BG,NM_130786.1,9.57,9.83,15.34,14.42,10.43,8.29,11.39,13.67,...,13.81,12.15,14.24,11.22,14.95,10.58,9.75,12.41,16.38,10.78
1,A1CF,"NM_001198818.1,NM_001198819.1,NM_001198820.1,N...",0.00,0.04,0.13,0.00,0.27,0.00,0.08,0.00,...,0.27,0.00,0.00,0.22,0.00,0.19,0.00,0.19,0.00,0.19
2,A2LD1,"NM_001195087.1,NM_033110.1",4.68,4.46,1.61,3.24,2.70,2.98,3.06,1.39,...,3.37,2.12,2.20,2.96,3.54,1.85,1.95,3.50,3.85,3.55
3,A2M,NM_000014.1,368.93,350.46,171.77,119.16,235.31,292.92,184.61,172.11,...,277.85,212.12,242.60,271.34,221.18,227.61,204.09,173.85,230.21,323.13
4,A2ML1,NM_144670.1,0.00,0.00,0.17,0.00,0.22,0.00,0.25,0.13,...,0.00,0.00,0.00,0.25,0.15,0.31,0.00,0.00,0.39,0.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19079,ZYG11A,NM_001004339.1,1.55,0.79,0.92,0.00,0.61,0.02,1.99,0.94,...,0.60,1.75,1.88,0.33,2.18,1.04,0.52,1.82,0.63,1.12
19080,ZYG11B,NM_024646.1,42.04,45.03,48.64,46.11,40.49,40.76,42.74,54.46,...,37.64,45.79,41.93,42.36,49.19,49.05,49.76,47.96,48.74,39.63
19081,ZYX,"NM_001010972.1,NM_003461.1",73.61,80.59,86.60,82.82,81.22,73.89,67.85,77.99,...,57.65,73.23,83.54,81.58,77.96,88.30,72.33,85.84,76.73,106.11
19082,ZZEF1,NM_015113.1,17.65,18.70,23.36,21.27,19.93,20.91,16.70,21.25,...,17.11,22.49,20.04,20.57,20.57,19.30,19.94,19.31,28.66,21.23


### Data Clean-up

Next, combine day_16_df and day_21_df into a single dataframe that reflects the average values of gene expression between the two dataframes

In [None]:
# Per-cell rescaling: each value of raw_data (truncated to an integer) is multiplied by
# 1,000,000 and divided by the integer total stored at the same column position in
# raw_cols_sums; the result is written to the same position of raw_data_normalized.
# NOTE(review): looks like a counts-per-million normalization - confirm.
# NOTE(review): raw_data, raw_data_normalized and raw_cols_sums are not defined in the
# cells visible here; they must already exist in the kernel before this cell runs.
PER_MILLION = 1000000
for row_pos in range(len(raw_data)):
    # Column 0 is skipped (presumably an identifier column - TODO confirm).
    for col_pos in range(1, len(raw_data.columns)):
        scaled_count = int(raw_data.iat[row_pos, col_pos]) * PER_MILLION
        column_total = int(raw_cols_sums[col_pos])
        raw_data_normalized.iat[row_pos, col_pos] = scaled_count / column_total


Log-2 fold change 

Visualize Normalization with boxplot

Visualize Log-2 fold change

Add labels of toxic/nontoxic

### Exploratory Data Analysis

Heat map of genes with log2 fold change above a specific threshold

### PCA

### Unsupervised Clustering 

In [None]:
Multidimensional Scaling


### GO term enrichment

### Differentially Expressed Genes

### Obtain Top 100 Differentially Expressed Genes

In [None]:
Visualizing Top 100 Differentially Expressed Genes

# Implementation of Classification Algorithms

In [None]:
Support Vector Machines