# Identifying Stress Factors in Rainbow Trout - ML

In [2]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_theme()
from tqdm import tqdm

In [1]:
from sklearn.decomposition import KernelPCA, PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE

from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb


# Seed constant; presumably meant to be passed as `random_state` to the
# stochastic steps (shuffling, splitting, model fitting) — TODO confirm usage.
SEED = 32

In [3]:
import warnings
# Silences every Python warning for the rest of the session, including
# pandas/scikit-learn deprecation and convergence warnings.
warnings.filterwarnings("ignore")

## Common Features

In [6]:
# Read the feature names listed in primaryColumnList.txt, one name per line.
# Blank lines are skipped so that no empty-string feature name enters the list.
with open("./primaryColumnList.txt", "r") as f:
  commonFeatures = [line.strip() for line in f if line.strip()]

# Human-readable names of the six tank conditions
# (presumably used as class labels in reports/plots — TODO confirm).
targetNames = ['control','crowded', 'high salinity', 'high temp', 'low temp', 'reused water']

## USDA Data

In [4]:
%%time
dataPath = './data/TPM_stress_USDA.xlsx'
columns = ['GeneExp']+[f"con{i}" for i in range(0,10)]+[f"cro{i}" for i in range(0, 6)]+[f"hs{i}" for i in range(0, 6)]+[f"ht{i}" for i in range(0, 6)]+[f"lt{i}" for i in range(0, 6)]+[f"rw{i}" for i in range(0, 6)]

tankColums = [f"con" for i in range(0,10)]+[f"cro" for i in range(0, 6)]+[f"hs" for i in range(0, 6)]+[f"ht" for i in range(0, 6)]+[f"lt" for i in range(0, 6)]+[f"rw" for i in range(0, 6)]

colDefMap = {'con':"Control", 'cro':"Crowded", "hs": "High Salinity", "ht": "High Temp", "lt": "Low Temp", 'rw': "Reu"}

# data = pd.read_excel(dataPath, nrows = 1000)
dataUSDA = pd.read_excel(dataPath)

dataUSDA.columns = columns
dataUSDA = dataUSDA.set_index('GeneExp').T

tankColums = [f"con" for i in range(0,10)]+[f"cro" for i in range(0, 6)]+[f"hs" for i in range(0, 6)]+[f"ht" for i in range(0, 6)]+[f"lt" for i in range(0, 6)]+[f"rw" for i in range(0, 6)]
dataUSDA = dataUSDA.assign(tank = tankColums)

print(dataUSDA.shape)
dataUSDA.head()

(40, 125909)
CPU times: user 21 s, sys: 275 ms, total: 21.3 s
Wall time: 21.4 s


GeneExp,4ebp2_1,5ntc_1,aacs_1,aadac_1,aadacl4_1,aagab_1,aakb1_1,aakb1_2,aakb1_3,aakb1_4,...,zwilch_2,zwilch_3,zwilch_4,zyg11_1,zyx_1,zyx_2,zzef1_1,zzef1_2,zzef1_3,tank
con0,25.225544,28.08045,2.732412,2.577981,1.763941,2.941886,57.036954,0.0,0.0,0.0,...,0.315159,0.506264,0.648684,20.87503,6.120552,16.849759,0.939789,0.999776,1.047765,con
con1,26.183206,27.814776,2.627333,2.488354,1.151104,3.088248,50.187026,0.0,0.0,0.0,...,0.537108,0.450512,0.495588,20.891255,6.146838,17.194558,0.9376,0.88066,1.04009,con
con2,24.462669,30.805397,2.260763,2.257356,1.38263,3.376831,51.169195,0.0,0.0,0.0,...,0.335363,0.551145,0.79369,19.433228,10.574061,13.420285,0.886052,0.829211,0.852616,con
con3,25.182863,29.473753,2.413444,2.288551,1.275375,3.316769,52.491676,0.0,0.0,0.0,...,0.550434,0.514201,0.492439,19.485679,6.131069,17.38473,0.945092,0.839348,0.902134,con
con4,26.305005,18.204322,2.807522,2.990708,0.949599,3.221152,47.15435,0.0,0.0,0.0,...,0.477111,0.442123,0.387635,19.494654,6.341428,17.291068,0.748626,0.799147,0.698106,con


## UMD Data

In [7]:
%%time
dataPath = './data/TPM_stress_RedTrout.xlsx'

# data = pd.read_excel(dataPath, nrows = 100)
dataUMD = pd.read_excel(dataPath)

colsTemp = ['idx']+['CTRL']*(dataUMD.shape[1]//2)+['HEAT']*(dataUMD.shape[1]//2)

dataUMD.columns = colsTemp
dataUMD.iloc[0,0] = 'DAY'

dataUMD = dataUMD.set_index("idx").T.reset_index(drop=False, names = 'GeneExp')#.set_index("DAY", append=True)
dataUMD.rename(columns={'GeneExp':'Group'}, inplace=True)
print(dataUMD.shape)

(72, 125910)
CPU times: user 38.9 s, sys: 434 ms, total: 39.4 s
Wall time: 39.5 s


## Primary Features

In [8]:
%%time
umdDataFeatures = dataUMD.columns.tolist()

commonFeaturesUMD = list(set(umdDataFeatures) & set(commonFeatures))

print(f"{len(commonFeatures)*100/len(umdDataFeatures):.2f}% of UMD data feature set")
print(f"{len(commonFeaturesUMD)*100/len(commonFeatures):.2f}% of the training data feature set (USDA)")

54.86% of UMD data feature set
71.48% of the training data feature set (USDA)
CPU times: user 9.75 ms, sys: 913 µs, total: 10.7 ms
Wall time: 10.3 ms


## Merging UMD & USDA data

In [35]:
# USDA samples restricted to the features shared with UMD, plus the tank label.
dataModelUSDA = dataUSDA.loc[:, commonFeaturesUMD + ['tank']].reset_index(drop=True)

# Integer codes per tank label. Only control ('con') and high temperature ('ht')
# are kept below, matching the two groups available in the UMD data.
tankCodes = {'con':0, 'cro':3, "hs": 2, "ht": 1, "lt": 4, 'rw': 5}
dataModelUSDA = dataModelUSDA[dataModelUSDA['tank'].isin(['con', 'ht'])]

# Recode only the 'tank' column (yielding an integer column) and tag the rows
# with their source, instead of rewriting the whole frame through `.loc[:, :]`.
dataModelUSDA = dataModelUSDA.assign(
    tank=dataModelUSDA['tank'].map(tankCodes),
    src='usda',
)
dataModelUSDA.head()

GeneExp,LOC110531840_3,LOC110492579_3,wwox_2,LOC110493244_4,LOC110515989_3,LOC110528812_2,sec16a_4,LOC110489558_6,LOC118938868_3,cry5_2,...,LOC110493284_3,eogt_1,LOC110491446_2,LOC110528038_2,LOC110501500_2,cep135_4,kif14_1,LOC110520020_1,tank,src
0,0.009107,0.26377,0.52635,0.006604,0.352133,2.592352,1.623573,1.357169,0.720504,1.476578,...,0.0,3.918445,0.428181,0.587794,0.060196,1.05814,0.649533,17.932904,0,usda
1,0.0109,0.20536,0.258329,0.005015,0.335977,3.058832,1.300134,1.695033,0.779129,2.040519,...,0.004263,6.256455,0.42782,0.587299,0.109082,1.252895,0.627884,18.246459,0,usda
2,0.009932,0.250269,0.265661,0.00773,0.365347,2.634441,1.770276,1.493038,0.785411,1.888128,...,0.011266,8.21208,0.354227,0.605685,0.082357,1.240397,0.609699,17.887609,0,usda
3,0.008507,0.286528,0.535958,0.009822,0.357309,2.544482,2.023899,1.517751,0.761161,1.812235,...,0.020413,6.754496,0.33519,0.566653,0.058785,1.198861,0.609159,18.161927,0,usda
4,0.006594,0.207625,0.295105,0.009101,0.304879,3.520011,1.858197,1.359425,0.638931,1.546159,...,0.015475,4.932302,0.4003,0.515175,0.075418,0.651463,0.41188,18.94793,0,usda


In [36]:
# Dimensions of the USDA modelling frame as (rows, columns).
dataModelUSDA.shape

(16, 49380)

In [38]:
# UMD samples restricted to the shared features; the CTRL/HEAT 'Group' label is
# renamed to 'tank' so both sources use the same label column name.
dataModelUMD = (
    dataUMD
    .loc[:, commonFeaturesUMD + ['Group']]
    .rename(columns={'Group': 'tank'})
)

# Encode the label with the same codes as the USDA frame (0 = control, 1 = heat)
# and tag the rows with their source. A label outside the mapping becomes NaN,
# so the later integer cast fails loudly instead of passing a string through.
dataModelUMD = dataModelUMD.assign(
    tank=dataModelUMD['tank'].map({'CTRL': 0, 'HEAT': 1}),
    src='umd',
)
dataModelUMD.head()

idx,LOC110531840_3,LOC110492579_3,wwox_2,LOC110493244_4,LOC110515989_3,LOC110528812_2,sec16a_4,LOC110489558_6,LOC118938868_3,cry5_2,...,LOC110493284_3,eogt_1,LOC110491446_2,LOC110528038_2,LOC110501500_2,cep135_4,kif14_1,LOC110520020_1,tank,src
0,0.13719,0.164517,0.200974,0.035393,0.232246,0.877287,0.379126,0.820367,0.244407,1.371634,...,0.012897,0.0,0.26745,0.266478,0.031425,0.918757,1.06787,10.96663,0,umd
1,0.089971,0.226141,0.814022,0.029113,0.450137,1.165705,1.371762,1.072768,0.487138,1.620064,...,0.045697,0.0,0.392315,0.362828,0.064954,0.758381,0.256761,4.578807,0,umd
2,0.173338,0.206546,0.222644,0.051775,0.355634,0.343114,0.455659,1.111175,0.208559,1.560601,...,0.068475,0.0,0.739471,0.235814,0.023836,1.35418,1.119524,14.519248,0,umd
3,0.099633,0.296542,0.352688,0.025264,0.479214,0.674383,1.412045,1.42642,0.355628,1.182199,...,0.026437,1.985553,0.32939,0.390723,0.032209,0.979135,0.820886,8.147229,0,umd
4,0.237129,0.14911,0.246166,0.033225,0.369731,0.720586,0.155947,0.827139,0.331399,1.365615,...,0.043941,0.0,0.296053,0.198162,0.07648,0.579326,0.573613,2.799618,0,umd


In [40]:
# Stack both sources row-wise and shuffle the rows. The shuffle is seeded with
# SEED so the row order is identical on every run.
dataModel = pd.concat([dataModelUMD, dataModelUSDA], axis=0).sample(frac=1, random_state=SEED)
print(dataModel.shape)
dataModel.head()

(88, 49380)


Unnamed: 0,LOC110531840_3,LOC110492579_3,wwox_2,LOC110493244_4,LOC110515989_3,LOC110528812_2,sec16a_4,LOC110489558_6,LOC118938868_3,cry5_2,...,LOC110493284_3,eogt_1,LOC110491446_2,LOC110528038_2,LOC110501500_2,cep135_4,kif14_1,LOC110520020_1,tank,src
58,0.11989,0.093016,0.207864,0.04389,0.232419,1.374965,0.983683,0.923106,0.252565,1.202657,...,0.074631,0.0,0.343935,0.247495,0.050515,0.986826,0.77246,5.710194,1,umd
60,0.27209,0.213831,0.690602,0.138399,0.209995,0.639697,0.920101,0.718707,0.344681,1.641291,...,0.050154,0.089156,0.611668,0.287865,0.037603,2.604784,0.930975,3.620193,1,umd
26,0.105121,0.130328,0.245252,0.021511,0.375904,0.863086,0.976842,0.914076,0.304509,0.919513,...,0.051208,0.0,0.22267,0.256128,0.008913,0.763971,0.68146,6.681864,0,umd
71,1.256947,1.664544,3.709979,0.576943,2.512485,0.119171,1.172185,3.863222,1.783335,4.787947,...,1.803262,3.724618,1.518848,0.761508,0.0,2.768389,3.08356,4.467054,1,umd
36,0.179249,0.230814,0.319411,0.035358,0.330813,1.150286,0.970871,0.894049,0.266359,1.585424,...,0.016397,0.15789,0.332736,0.219601,0.0,1.008862,0.872836,5.460409,1,umd


In [41]:
# Number of rows contributed by each source (values of the 'src' column).
dataModel['src'].value_counts()

src
umd     72
usda    16
Name: count, dtype: int64

# Models

In [49]:
# Features are every column except the label ('tank') and the source tag ('src').
# Selecting by name keeps this correct even if the column order changes.
xdata = dataModel.drop(columns=['tank', 'src'])
ydata = dataModel['tank'].astype(int).to_numpy()

# Standardise every feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows here; if these scaled features
# are later split into train/test sets, the test rows have influenced the
# scaling statistics. Consider fitting the scaler on the training split only.
sca = StandardScaler()
xdata_sca = sca.fit_transform(xdata)

## Logistic Regression

## Recursive Feature Elimination