# Rainbow Trout Meat Quality Classification using Genomic Data 

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
import json

%autosave 30

Autosaving every 30 seconds


In [2]:
import warnings
warnings.filterwarnings("ignore")

## Meta Data Creation

In [3]:
# %%time
# metadf_init = pd.read_excel("./data/GeneExpressionData/Meta_data.xlsx", usecols=['genetic line','Orso_Id'])
# print(f"Initial Dataframe Size:{metadf_init.shape}")

# #Dropping rows with NaN values.
# metadf_init.dropna(inplace=True)
# print(f"Dataframe Size post cleaning:{metadf_init.shape}")
# metadf_init.head()

In [4]:
# def processRows(row):
#     geneLine, fishId = row['genetic line'].split('-')[-1], row['Orso_Id'].split('_')[-1]
#     # print(geneLine, fishId)
#     return pd.Series(data = [f"F{fishId}", 'high' if geneLine=='H' else 'low'], index=['FishID', 'GeneLine'])

# metadf_init.apply(processRows, axis=1).to_csv("./metaFishData.csv", index=False)

## Meta Data

In [5]:
metadf = pd.read_csv("./metaFishData.csv")
metadf.head()

Unnamed: 0,FishID,GeneLine
0,F75,high
1,F66,low
2,F76,high
3,F72,low
4,F78,high


In [6]:
metadf['GeneLine'].value_counts(normalize=True)

GeneLine
high    0.504298
low     0.495702
Name: proportion, dtype: float64

## Metaphlan Profile

In [7]:
%%time
mp_df = pd.read_csv("./data/GeneExpressionData/merged_metaphlan_profile.tsv", sep='\t', header=0).reset_index()
print(f"Metaphlan Profile Data Shape: {mp_df.shape}")

#Generating consistent Fish Identification Marking
fishIdentifierFnx = lambda row: f'F{row.split("_")[3]}'
mp_df.iloc[0, 1:] = mp_df.iloc[0, 1:].apply(fishIdentifierFnx)

#Setting Column Names to the fish identification markings
mp_df.columns = mp_df.iloc[0]
mp_df.drop(mp_df.index[0], inplace=True)
mp_df.head()

Metaphlan Profile Data Shape: (1567, 352)
CPU times: total: 578 ms
Wall time: 589 ms


Unnamed: 0,clade_name,F100,F101,F102,F103,F104,F105,F106,F107,F108,...,F91,F92,F93,F94,F95,F96,F97,F98,F99,F9
1,k__Bacteria,100.0,99.97197,100.0,100.0,100.0,99.99576,99.98729,100.0,100.0,...,100.0,100.0,99.99293,100.0,100.0,100.0,100.0,100.0,100.0,99.99385
2,k__Bacteria|p__Proteobacteria,96.23163,1.28278,2.46132,4.19295,0.61851,0.09897,7.26275,0.26161,4.19682,...,12.66281,0.96307,18.82358,0.32609,0.02745,0.0,44.74296,0.15063,0.0,0.66998
3,k__Bacteria|p__Fusobacteria,1.90922,0.02158,96.92149,0.24615,91.75045,0.10025,0.05683,43.08036,95.27552,...,82.35353,98.30658,79.81764,36.31168,99.72513,1.57367,17.83804,98.75345,97.18932,79.67032
4,k__Bacteria|p__Firmicutes,1.85915,97.78226,0.4784,87.89138,6.77893,80.14233,92.20322,56.095,0.52766,...,4.60755,0.63897,1.25554,61.489,0.1855,98.42633,34.07137,1.09592,2.81068,18.22957
5,k__Bacteria|p__Proteobacteria|c__Gammaproteoba...,95.58687,0.0,1.11736,0.05299,0.0497,0.0,0.00083,0.0,2.79746,...,11.30934,0.37437,18.81727,0.06843,0.0,0.0,44.66672,0.15063,0.0,0.56759


## Clade Family

In [8]:
from treelib import Node, Tree

In [9]:
clade = mp_df.iloc[:, 0]

cladeFnxLambda = lambda row: len(row.split('|'))
clade.apply(cladeFnxLambda).value_counts()

clade_name
8    560
7    539
6    260
5    101
4     54
3     37
2     11
1      4
Name: count, dtype: int64

In [10]:
# cladeFnxLambda = lambda row: row.split('|')
# clade.apply(cladeFnxLambda).value_counts()

In [23]:
%%time
tree = Tree()
tree.create_node('Main', 'main')
def cladeFnxTree(row):
    row = row.split('|')
    if len(row)==1:
        tree.create_node(row[0], row[0], parent='main')
    else:
        tree.create_node(row[-1], row[-1], parent=row[-2])
             
_ = clade.apply(cladeFnxTree)

print("Depth of tree:", tree.depth())
# tree.show()      

Depth of tree: 8
CPU times: total: 31.2 ms
Wall time: 21.5 ms


In [60]:
treeFamily = json.loads(tree.to_json(with_data=False))

In [64]:
familyMap = {f'level{r}':[] for r in range(0,8)}

def cladeFnx(row):
    row = row.split('|')
    i = 0
    for rw in row:
        familyMap[f'level{i}'].append(rw)
        i+=1
    while i<8:
        familyMap[f'level{i}'].append(None)
        i+=1

_ = clade.apply(cladeFnx)

In [65]:
print(pd.DataFrame(familyMap).shape, mp_df.shape)
# pd.concat([mp_df.reset_index(drop=True), pd.DataFrame(familyMap).reset_index(drop=True)], axis=1, ignore_index=True)

(1566, 8) (1566, 352)


In [66]:
for i in familyMap:
    familyMap[i] = set(familyMap[i])
# hmap

familyMapRev = {}
for i in familyMap:
    for j in familyMap[i]:
        familyMapRev[j] = i
# hmapRev

## Initial Exploration

In [125]:
mp_fin = mp_df.set_index('clade_name').T
mp_fin = mp_fin.apply(pd.to_numeric, downcast='float')

mp_fin = pd.merge(left=mp_fin, right=metadf, left_index=True, right_on='FishID', how='outer').drop('FishID', axis=1)
print(mp_fin.shape)

(351, 1567)


In [126]:
mp_fin.head()

Unnamed: 0,k__Bacteria,k__Bacteria|p__Proteobacteria,k__Bacteria|p__Fusobacteria,k__Bacteria|p__Firmicutes,k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria,k__Bacteria|p__Fusobacteria|c__Fusobacteriia,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Clostridia,k__Bacteria|p__Proteobacteria|c__Betaproteobacteria,k__Bacteria|p__Firmicutes|c__Tissierellia,...,k__Bacteria|p__Actinobacteria|c__Actinomycetia|o__Micrococcales|f__Micrococcaceae|g__Kocuria|s__Kocuria_rhizophila,k__Bacteria|p__Actinobacteria|c__Actinomycetia|o__Micrococcales|f__Micrococcaceae|g__Kocuria|s__Kocuria_rhizophila|t__SGB16671,k__Eukaryota|p__Basidiomycota,k__Eukaryota|p__Basidiomycota|c__Malasseziomycetes,k__Eukaryota|p__Basidiomycota|c__Malasseziomycetes|o__Malasseziales,k__Eukaryota|p__Basidiomycota|c__Malasseziomycetes|o__Malasseziales|f__Malasseziaceae,k__Eukaryota|p__Basidiomycota|c__Malasseziomycetes|o__Malasseziales|f__Malasseziaceae|g__Malassezia,k__Eukaryota|p__Basidiomycota|c__Malasseziomycetes|o__Malasseziales|f__Malasseziaceae|g__Malassezia|s__Malassezia_restricta,k__Eukaryota|p__Basidiomycota|c__Malasseziomycetes|o__Malasseziales|f__Malasseziaceae|g__Malassezia|s__Malassezia_restricta|t__EUK76775,GeneLine
57.0,100.0,0.31231,99.515503,0.10813,0.03561,99.515503,0.03217,0.07596,0.27646,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,high
149.0,100.0,11.63874,0.0,87.405212,11.55073,0.0,74.730431,12.64867,0.08801,0.0261,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,high
181.0,100.0,96.231628,1.90922,1.85915,95.586868,1.90922,1.11486,0.74234,0.64476,0.00196,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,high
209.0,99.97197,1.28278,0.02158,97.782257,0.0,0.02158,68.893913,27.90214,1.28278,0.60573,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,high
246.0,100.0,2.46132,96.921494,0.4784,1.11736,96.921494,0.32089,0.15752,1.34395,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,high


### Outlier Analysis

In [127]:
# Number of outliers in each column
def outliersFnx(data, m = 2.):
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    s = d/mdev if mdev else np.zeros(len(d))
    return len(data[s>m])

In [128]:
fnx = lambda row: type(row.values)
fnx = lambda row: outliersFnx(row.values)
outlierCount = mp_fin.apply(fnx, axis=0).values

TypeError: '<' not supported between instances of 'str' and 'float'

In [None]:
OUTLIER_CUTOFF = 40
plt.figure(figsize=(10,5), dpi = 100)
sns.histplot(data=outlierCount[outlierCount!=0]*100/mp_df.shape[1], bins=30)
plt.axvline(x=OUTLIER_CUTOFF, color='black', linestyle='--', label='40% Cutoff')
plt.legend()
plt.title("Histogram of % of statistical outliers in each column.")
plt.xlabel("% of statistical outliers.")
plt.show()

The dashed vertical lines represents the cutoff mark, beyond which there are features with more than the cutoff statistical outlier values.

In [None]:
var_ = outlierCount*100/mp_df.shape[1]
len(var_[var_>40])

print(f"{len(var_[var_>40])} out of {len(var_)} features have more than {OUTLIER_CUTOFF}% datapoints as statistical outliers.")

In [None]:
hmap = {'feature':[], 'outlierCount':[], 'outlierCount%':[], 'level':[]}
for i,j  in zip(mp_fin.columns, outlierCount):
    hmap['feature'].append(i)
    hmap['outlierCount'].append(j)
    hmap['outlierCount%'].append(j*100/mp_df.shape[1])
    hmap['level'].append(len(i.split('|')))
    
outlierDf = pd.DataFrame(hmap)

In [None]:
outlierDf[outlierDf['outlierCount%']>OUTLIER_CUTOFF].head()

In [None]:
fig, ax = plt.subplots(nrows=1,ncols=2, figsize=(12,4), sharey=False)
outlierDf['level'].value_counts().plot(kind='bar', ax=ax[0])
outlierDf[outlierDf['outlierCount%']>OUTLIER_CUTOFF]['level'].value_counts().plot(kind='bar', ax=ax[1])

ax[0].set_title('Level Distribution')
ax[1].set_title('Level Distribution in Outliers')
plt.show()

In [None]:
mp_fin

In [None]:
metadf
