# Identifying Stress Factors in Rainbow Trout - ML

In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_theme()
from tqdm import tqdm

In [2]:
from sklearn.decomposition import KernelPCA, PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Shared random seed; passed as random_state to train_test_split further down.
SEED = 32

## Data - USDA
Basic cleanup and organization of the raw data.
* Each fish belongs to one of the following categories: Control (con), Crowded (cro), High Salinity (hs), High Temperature (ht), Low Temperature (lt) and Reused Water (rw).
* The dataset was transposed so that each row is a fish sample, and the samples were renamed by category and index (e.g. `con0`, `cro3`).
* The columns are the transcripts obtained from the fish. In each column name the prefix is the gene, whereas the suffix is the transcript number.

In [3]:
%%time
dataPath = './data/TPM_stress_USDA.xlsx'
columns = ['GeneExp']+[f"con{i}" for i in range(0,10)]+[f"cro{i}" for i in range(0, 6)]+[f"hs{i}" for i in range(0, 6)]+[f"ht{i}" for i in range(0, 6)]+[f"lt{i}" for i in range(0, 6)]+[f"rw{i}" for i in range(0, 6)]

tankColums = [f"con" for i in range(0,10)]+[f"cro" for i in range(0, 6)]+[f"hs" for i in range(0, 6)]+[f"ht" for i in range(0, 6)]+[f"lt" for i in range(0, 6)]+[f"rw" for i in range(0, 6)]

colDefMap = {'con':"Control", 'cro':"Crowded", "hs": "High Salinity", "ht": "High Temp", "lt": "Low Temp", 'rw': "Reu"}

# data = pd.read_excel(dataPath, nrows = 1000)
data = pd.read_excel(dataPath)

data.columns = columns
data = data.set_index('GeneExp').T

tankColums = [f"con" for i in range(0,10)]+[f"cro" for i in range(0, 6)]+[f"hs" for i in range(0, 6)]+[f"ht" for i in range(0, 6)]+[f"lt" for i in range(0, 6)]+[f"rw" for i in range(0, 6)]
data = data.assign(tank = tankColums)

print(data.shape)
data.head()

(40, 125909)
CPU times: user 21.9 s, sys: 198 ms, total: 22.1 s
Wall time: 22.2 s


GeneExp,4ebp2_1,5ntc_1,aacs_1,aadac_1,aadacl4_1,aagab_1,aakb1_1,aakb1_2,aakb1_3,aakb1_4,...,zwilch_2,zwilch_3,zwilch_4,zyg11_1,zyx_1,zyx_2,zzef1_1,zzef1_2,zzef1_3,tank
con0,25.225544,28.08045,2.732412,2.577981,1.763941,2.941886,57.036954,0.0,0.0,0.0,...,0.315159,0.506264,0.648684,20.87503,6.120552,16.849759,0.939789,0.999776,1.047765,con
con1,26.183206,27.814776,2.627333,2.488354,1.151104,3.088248,50.187026,0.0,0.0,0.0,...,0.537108,0.450512,0.495588,20.891255,6.146838,17.194558,0.9376,0.88066,1.04009,con
con2,24.462669,30.805397,2.260763,2.257356,1.38263,3.376831,51.169195,0.0,0.0,0.0,...,0.335363,0.551145,0.79369,19.433228,10.574061,13.420285,0.886052,0.829211,0.852616,con
con3,25.182863,29.473753,2.413444,2.288551,1.275375,3.316769,52.491676,0.0,0.0,0.0,...,0.550434,0.514201,0.492439,19.485679,6.131069,17.38473,0.945092,0.839348,0.902134,con
con4,26.305005,18.204322,2.807522,2.990708,0.949599,3.221152,47.15435,0.0,0.0,0.0,...,0.477111,0.442123,0.387635,19.494654,6.341428,17.291068,0.748626,0.799147,0.698106,con


In [4]:
# Summarise index, column count, dtypes and the deep (object-aware) memory footprint.
data.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 40 entries, con0 to rw5
Columns: 125909 entries, 4ebp2_1 to tank
dtypes: float64(125908), object(1)
memory usage: 38.4 MB


In [5]:
# Find transcripts whose mean expression over all samples is exactly 0.
# The trailing 'tank' label column is excluded from the mean.
zeroMeanMask = data.iloc[:, :-1].mean().eq(0)
meanZeroCols = zeroMeanMask.index[zeroMeanMask].tolist()

# Report how many such columns exist; note that data.shape[1] also counts 'tank'.
print(f"Number of columns with just 0: {len(meanZeroCols)}\nTotal Number of columns: {data.shape[1]}")
print(f"This is {(len(meanZeroCols)/data.shape[1])*100: .2f}% of all columns")

Number of columns with just 0: 30013
Total Number of columns: 125909
This is  23.84% of all columns


In [6]:
# Keep only the columns that hold at least one non-zero entry.
hasNonZero = data.ne(0).any(axis=0)
data = data.loc[:, hasNonZero]
data.head()

GeneExp,4ebp2_1,5ntc_1,aacs_1,aadac_1,aadacl4_1,aagab_1,aakb1_1,aakb1_6,aakb1_7,aakb1_8,...,zwilch_2,zwilch_3,zwilch_4,zyg11_1,zyx_1,zyx_2,zzef1_1,zzef1_2,zzef1_3,tank
con0,25.225544,28.08045,2.732412,2.577981,1.763941,2.941886,57.036954,0.0,0.0,5.631537,...,0.315159,0.506264,0.648684,20.87503,6.120552,16.849759,0.939789,0.999776,1.047765,con
con1,26.183206,27.814776,2.627333,2.488354,1.151104,3.088248,50.187026,0.0,0.0,6.048805,...,0.537108,0.450512,0.495588,20.891255,6.146838,17.194558,0.9376,0.88066,1.04009,con
con2,24.462669,30.805397,2.260763,2.257356,1.38263,3.376831,51.169195,4.817612,0.0,7.248526,...,0.335363,0.551145,0.79369,19.433228,10.574061,13.420285,0.886052,0.829211,0.852616,con
con3,25.182863,29.473753,2.413444,2.288551,1.275375,3.316769,52.491676,0.0,0.0,7.837344,...,0.550434,0.514201,0.492439,19.485679,6.131069,17.38473,0.945092,0.839348,0.902134,con
con4,26.305005,18.204322,2.807522,2.990708,0.949599,3.221152,47.15435,0.0,0.0,5.786782,...,0.477111,0.442123,0.387635,19.494654,6.341428,17.291068,0.748626,0.799147,0.698106,con


## Modelling Data Creation

In [7]:
%%time
dataModel = data.copy()
dataModel.replace({'tank':{'con':0, 'cro':1, "hs": 2, "ht": 3, "lt": 4, 'rw': 5}}, inplace=True)

xdata, ydata = dataModel.iloc[:,:-1], dataModel.iloc[:,-1]

sca = MinMaxScaler()
# sca = StandardScaler()

xdata_sca = sca.fit_transform(xdata)


# xtrain, xtest, ytrain, ytest = train_test_split(xdata, ydata, test_size=0.3, stratify=ydata, random_state=SEED)
# xtrain, xtest = np.array(xtrain), np.array(xtest)

xstrain, xstest, ytrain, ytest = train_test_split(xdata_sca, ydata, test_size=0.3, stratify=ydata, random_state=SEED)

  dataModel.replace({'tank':{'con':0, 'cro':1, "hs": 2, "ht": 3, "lt": 4, 'rw': 5}}, inplace=True)


## Feature Selection

In [10]:
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

In [11]:
%time
### ANOVA Test
selector = SelectKBest(f_classif, k='all')
selector.fit(xdata, ydata)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 10 µs


In [12]:
# Tabulate the F statistic and p-value of every transcript from the fitted selector,
# ordered by descending F value (ties broken by ascending p-value).
featureScoresANOVA = pd.DataFrame(
    {"feature": xdata.columns, "Fval": selector.scores_, "Pval": selector.pvalues_}
).sort_values(by=['Fval', 'Pval'], ascending=[False, True])

In [13]:
# Preview the first five rows of the sorted ANOVA table.
featureScoresANOVA.head(n=5)

Unnamed: 0,feature,Fval,Pval
65912,LOC110532373_1,831.502309,1.651911e-34
69915,LOC110535399_1,635.58093,1.519803e-32
56092,LOC110525377_2,623.75603,2.083705e-32
1854,bckdha_1,577.05717,7.698214e-32
73595,LOC110538141_1,576.369035,7.853951000000001e-32


In [15]:
# Preview the transcripts whose ANOVA p-value is below 0.05.
# NOTE(review): this threshold is applied per transcript with no multiple-testing
# correction visible in this notebook -- confirm that is intended.
featureScoresANOVA.loc[featureScoresANOVA['Pval'].lt(0.05)].head(n=5)

Unnamed: 0,feature,Fval,Pval
65912,LOC110532373_1,831.502309,1.651911e-34
69915,LOC110535399_1,635.58093,1.519803e-32
56092,LOC110525377_2,623.75603,2.083705e-32
1854,bckdha_1,577.05717,7.698214e-32
73595,LOC110538141_1,576.369035,7.853951000000001e-32


In [None]:
%%time
# Mutual Information
mi = mutual_info_classif(xdata, ydata)

In [None]:
# Pair every transcript with its mutual-information score, highest score first.
featureScoresMI = pd.DataFrame(
    {"feature": xdata.columns, "score": mi}
).sort_values(by='score', ascending=False)

### Common Transcripts/Features
Features selected by both methods: the ANOVA F-test and mutual information.

In [None]:
# Transcripts passing the ANOVA cut-off (p-value below 0.05).
anovaPasses = featureScoresANOVA['Pval'].lt(0.05)
fSelect1_ = featureScoresANOVA.loc[anovaPasses, 'feature']

# Transcripts passing the mutual-information cut-off (score above 0.25).
miPasses = featureScoresMI['score'].gt(0.25)
fSelect2_ = featureScoresMI.loc[miPasses, 'feature']

In [None]:
%%time
commonFeatures = list(set(fSelect1_) & set(fSelect2_))
print(f"{len(commonFeatures)*100/featureScoresMI.shape[0]:.2f}% of previous column set")

### Storing the common features