# Rainbow Trout Meat Quality Classification using Genomic Data 

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os

%autosave 30

Autosaving every 30 seconds


In [12]:
import warnings
warnings.filterwarnings("ignore")

## Meta Data Creation

In [2]:
# %%time
# metadf_init = pd.read_excel("./data/GeneExpressionData/Meta_data.xlsx", usecols=['genetic line','Orso_Id'])
# print(f"Initial Dataframe Size:{metadf_init.shape}")

# #Dropping rows with NaN values.
# metadf_init.dropna(inplace=True)
# print(f"Dataframe Size post cleaning:{metadf_init.shape}")
# metadf_init.head()

In [3]:
# def processRows(row):
#     geneLine, fishId = row['genetic line'].split('-')[-1], row['Orso_Id'].split('_')[-1]
#     # print(geneLine, fishId)
#     return pd.Series(data = [f"F{fishId}", 'high' if geneLine=='H' else 'low'], index=['FishID', 'GeneLine'])

# metadf_init.apply(processRows, axis=1).to_csv("./metaFishData.csv", index=False)

## Meta Data

In [4]:
metadf = pd.read_csv("./metaFishData.csv")
metadf.head()

Unnamed: 0,FishID,GeneLine
0,F75,high
1,F66,low
2,F76,high
3,F72,low
4,F78,high


In [5]:
metadf['GeneLine'].value_counts(normalize=True)

GeneLine
high    0.504298
low     0.495702
Name: proportion, dtype: float64

## Metaphlan Profile

In [39]:
%%time
mp_df = pd.read_csv("./data/GeneExpressionData/merged_metaphlan_profile.tsv", sep='\t', header=0).reset_index()
print(f"Metaphlan Profile Data Shape: {mp_df.shape}")

#Generating consistent Fish Identification Marking
fishIdentifierFnx = lambda row: f'F{row.split("_")[3]}'
mp_df.iloc[0, 1:] = mp_df.iloc[0, 1:].apply(fishIdentifierFnx)

#Setting Column Names to the fish identification markings
mp_df.columns = mp_df.iloc[0]
mp_df.drop(mp_df.index[0], inplace=True)
mp_df.head()

Metaphlan Profile Data Shape: (1567, 352)
CPU times: total: 609 ms
Wall time: 604 ms


Unnamed: 0,clade_name,F100,F101,F102,F103,F104,F105,F106,F107,F108,...,F91,F92,F93,F94,F95,F96,F97,F98,F99,F9
1,k__Bacteria,100.0,99.97197,100.0,100.0,100.0,99.99576,99.98729,100.0,100.0,...,100.0,100.0,99.99293,100.0,100.0,100.0,100.0,100.0,100.0,99.99385
2,k__Bacteria|p__Proteobacteria,96.23163,1.28278,2.46132,4.19295,0.61851,0.09897,7.26275,0.26161,4.19682,...,12.66281,0.96307,18.82358,0.32609,0.02745,0.0,44.74296,0.15063,0.0,0.66998
3,k__Bacteria|p__Fusobacteria,1.90922,0.02158,96.92149,0.24615,91.75045,0.10025,0.05683,43.08036,95.27552,...,82.35353,98.30658,79.81764,36.31168,99.72513,1.57367,17.83804,98.75345,97.18932,79.67032
4,k__Bacteria|p__Firmicutes,1.85915,97.78226,0.4784,87.89138,6.77893,80.14233,92.20322,56.095,0.52766,...,4.60755,0.63897,1.25554,61.489,0.1855,98.42633,34.07137,1.09592,2.81068,18.22957
5,k__Bacteria|p__Proteobacteria|c__Gammaproteoba...,95.58687,0.0,1.11736,0.05299,0.0497,0.0,0.00083,0.0,2.79746,...,11.30934,0.37437,18.81727,0.06843,0.0,0.0,44.66672,0.15063,0.0,0.56759


## Clade Family

In [51]:
from treelib import Node, Tree

In [102]:
clade = mp_df.iloc[:, 0]

cladeFnxLambda = lambda row: len(row.split('|'))
clade.apply(cladeFnxLambda).value_counts()

clade_name
8    560
7    539
6    260
5    101
4     54
3     37
2     11
1      4
Name: count, dtype: int64

In [103]:
# cladeFnxLambda = lambda row: row.split('|')
# clade.apply(cladeFnxLambda).value_counts()

In [104]:
%%time
tree = Tree()
tree.create_node('Main', 'main')
def cladeFnxTree(row):
    row = row.split('|')
    if len(row)==1:
        tree.create_node(row[0], row[0], parent='main')
    else:
        tree.create_node(row[-1], row[-1], parent=row[-2])
             
_ = clade.apply(cladeFnxTree)

# tree.show()      

CPU times: total: 15.6 ms
Wall time: 14.3 ms


In [110]:
hmap = {f'level{r}':[] for r in range(0,8)}

def cladeFnx(row):
    row = row.split('|')
    i = 0
    for rw in row:
        hmap[f'level{i}'].append(rw)
        i+=1
    while i<8:
        hmap[f'level{i}'].append(None)
        i+=1

_ = clade.apply(cladeFnx)

In [115]:
pd.DataFrame(hmap).shape, mp_df.shape

((1566, 8), (1566, 352))

In [114]:


pd.concat([mp_df, pd.DataFrame(hmap)], axis=1)

Unnamed: 0,clade_name,F100,F101,F102,F103,F104,F105,F106,F107,F108,...,F99,F9,level0,level1,level2,level3,level4,level5,level6,level7
1,k__Bacteria,100.0,99.97197,100.0,100.0,100.0,99.99576,99.98729,100.0,100.0,...,100.0,99.99385,k__Bacteria,p__Proteobacteria,,,,,,
2,k__Bacteria|p__Proteobacteria,96.23163,1.28278,2.46132,4.19295,0.61851,0.09897,7.26275,0.26161,4.19682,...,0.0,0.66998,k__Bacteria,p__Fusobacteria,,,,,,
3,k__Bacteria|p__Fusobacteria,1.90922,0.02158,96.92149,0.24615,91.75045,0.10025,0.05683,43.08036,95.27552,...,97.18932,79.67032,k__Bacteria,p__Firmicutes,,,,,,
4,k__Bacteria|p__Firmicutes,1.85915,97.78226,0.4784,87.89138,6.77893,80.14233,92.20322,56.095,0.52766,...,2.81068,18.22957,k__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,,,,,
5,k__Bacteria|p__Proteobacteria|c__Gammaproteoba...,95.58687,0.0,1.11736,0.05299,0.0497,0.0,0.00083,0.0,2.79746,...,0.0,0.56759,k__Bacteria,p__Fusobacteria,c__Fusobacteriia,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1563,k__Eukaryota|p__Basidiomycota|c__Malasseziomyc...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,k__Eukaryota,p__Basidiomycota,c__Malasseziomycetes,o__Malasseziales,f__Malasseziaceae,g__Malassezia,,
1564,k__Eukaryota|p__Basidiomycota|c__Malasseziomyc...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,k__Eukaryota,p__Basidiomycota,c__Malasseziomycetes,o__Malasseziales,f__Malasseziaceae,g__Malassezia,s__Malassezia_restricta,
1565,k__Eukaryota|p__Basidiomycota|c__Malasseziomyc...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,k__Eukaryota,p__Basidiomycota,c__Malasseziomycetes,o__Malasseziales,f__Malasseziaceae,g__Malassezia,s__Malassezia_restricta,t__EUK76775
1566,k__Eukaryota|p__Basidiomycota|c__Malasseziomyc...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,


In [109]:
mp_df

Unnamed: 0,clade_name,F100,F101,F102,F103,F104,F105,F106,F107,F108,...,F91,F92,F93,F94,F95,F96,F97,F98,F99,F9
1,k__Bacteria,100.0,99.97197,100.0,100.0,100.0,99.99576,99.98729,100.0,100.0,...,100.0,100.0,99.99293,100.0,100.0,100.0,100.0,100.0,100.0,99.99385
2,k__Bacteria|p__Proteobacteria,96.23163,1.28278,2.46132,4.19295,0.61851,0.09897,7.26275,0.26161,4.19682,...,12.66281,0.96307,18.82358,0.32609,0.02745,0.0,44.74296,0.15063,0.0,0.66998
3,k__Bacteria|p__Fusobacteria,1.90922,0.02158,96.92149,0.24615,91.75045,0.10025,0.05683,43.08036,95.27552,...,82.35353,98.30658,79.81764,36.31168,99.72513,1.57367,17.83804,98.75345,97.18932,79.67032
4,k__Bacteria|p__Firmicutes,1.85915,97.78226,0.4784,87.89138,6.77893,80.14233,92.20322,56.095,0.52766,...,4.60755,0.63897,1.25554,61.489,0.1855,98.42633,34.07137,1.09592,2.81068,18.22957
5,k__Bacteria|p__Proteobacteria|c__Gammaproteoba...,95.58687,0.0,1.11736,0.05299,0.0497,0.0,0.00083,0.0,2.79746,...,11.30934,0.37437,18.81727,0.06843,0.0,0.0,44.66672,0.15063,0.0,0.56759
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,k__Eukaryota|p__Basidiomycota|c__Malasseziomyc...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.00707,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1563,k__Eukaryota|p__Basidiomycota|c__Malasseziomyc...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.00707,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1564,k__Eukaryota|p__Basidiomycota|c__Malasseziomyc...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.00707,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1565,k__Eukaryota|p__Basidiomycota|c__Malasseziomyc...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.00707,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Initial Exploration