In [2]:
# +-----------------------------------------------------------------------------+
# | MasterTable Script Step 2, Written in Python
# | Post-Filtering : FPKM >= threshold (default 1) in at least 1 sample
# | Post-Filtering Method 1 : Standard deviation
# | Post-Filtering Method 2 : VMR (variance-to-mean ratio)
# | maintainer: jinhokim.07@cau.ac.kr
# | Last Update: 24.02.20
# +-----------------------------------------------------------------------------+

# +---System Library Import-----------------------------------------------------+
import os, sys
# +---Basic Library Import------------------------------------------------------+
import numpy as np
import pandas as pd
# +---Graph Library Import------------------------------------------------------+
import matplotlib.pyplot as plt
import seaborn as sns

# Function

In [106]:
def TableLoad(route):
    """Read a MasterTable CSV and return only its complete rows.

    Parameters
    ----------
    route : str, path-like or file-like
        Handed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The table without the ``"Unnamed: 0"`` column (when the file has
        one), with every row that contains a missing value removed and the
        index renumbered from 0.
    """
    Data = pd.read_csv(route)
    # Drop the "Unnamed: 0" column only if it exists. errors="ignore" skips a
    # missing label without hiding unrelated failures the way a bare
    # `except: pass` would.
    Data = Data.drop(columns=["Unnamed: 0"], errors="ignore")
    # Data Drop -> Prefiltered Data (MasterTable)
    Data_DropNA = Data.dropna().copy().reset_index(drop=True)
    return Data_DropNA

In [107]:
def GroupInfo_Checker(Data, group_info):
    """Print whether every group's column names can be selected from ``Data``.

    For each list in ``group_info`` the columns are selected with
    ``Data.loc``. A ``KeyError`` during selection, or a selection whose
    column count differs from the length of the list, is reported with
    ``print``. If no group was reported, "All columns matched" is printed.

    Parameters
    ----------
    Data : pandas.DataFrame
        Table whose columns are checked.
    group_info : list of list
        One list of column labels per group.

    Returns
    -------
    None
        The result is only printed.
    """
    all_ok = True
    for group in group_info:
        expected = len(group)
        try:
            selected = Data.loc[:, group].shape[1]
        except KeyError:
            # At least one label of this group is not a column of Data.
            print("Index is Wrong", group)
            all_ok = False
            continue

        # The selection succeeded but returned a different number of columns
        # than were requested.
        if expected != selected:
            print(group, "Total length of Index is wrong")
            all_ok = False

    if all_ok:
        print("All columns matched")
    return None

In [108]:
def FPKMoverx_atleastonesample(Data, groupinfo_f, thres = 1):
    """Keep the rows whose largest value across the given columns is >= ``thres``.

    Parameters
    ----------
    Data : pandas.DataFrame
        Input table; it is not modified.
    groupinfo_f : list
        Column labels over which the row-wise maximum is taken.
    thres : number, default 1
        Inclusive lower bound for the row-wise maximum.

    Returns
    -------
    pandas.DataFrame
        Copy of the kept rows with the index renumbered from 0.
    """
    # A row passes when at least one of the selected columns reaches the threshold.
    passes = Data.loc[:, groupinfo_f].max(axis=1).ge(thres)
    kept_rows = Data.loc[passes].copy()
    return kept_rows.reset_index(drop=True)

In [109]:
def FilteringCondition(Data, groupinfo, groupname):
    """Return a copy of ``Data`` with per-group dispersion columns appended.

    For every group two row-wise statistics are computed over the group's
    columns and added as new columns:

    * ``<name>_std_%`` : standard deviation / mean * 100
    * ``<name>_VMR``   : variance / mean

    Variance and standard deviation use the pandas defaults. Rows whose
    group mean is 0 yield inf or NaN, because the division is not guarded.

    Parameters
    ----------
    Data : pandas.DataFrame
        Input table; it is not modified.
    groupinfo : list of list
        One list of column labels per group.
    groupname : list of str
        Prefix for the new columns of each group, in the same order as
        ``groupinfo``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``Data`` with two extra columns per group.
    """
    # Use the function's own arguments. The previous body read the notebook
    # globals `group_info` / `group_name` and ignored what the caller passed.
    if len(groupinfo) != len(groupname):
        # Warning only: zip() below stops at the shorter of the two lists.
        print("Check your `group_info` and `group_name`")
        print("Length of these list are not matched!")

    Data_Added = Data.copy()
    for g_list, g_name in zip(groupinfo, groupname):
        # Slice the group's columns once and reuse them for all statistics.
        group_values = Data.loc[:, g_list]
        mean_ = group_values.mean(axis=1)
        Data_Added[g_name + "_std_%"] = group_values.std(axis=1) / mean_ * 100
        Data_Added[g_name + "_VMR"] = group_values.var(axis=1) / mean_

    return Data_Added

# FilteringCondition(Data, group_info, group_name)

# Calculation Part

In [119]:
# Input: raw MasterTable CSV.
route = "./300_MasterTable/MasterTable_DESeq2_Default_Raw.csv"

# Output workbook path, derived from the input file name.
# For Transcript
res_route = route.replace("_Raw.csv","_Data.xlsx")
# For Gene
# res_route = route.replace("_Raw_Gene.csv","_Data_Gene.xlsx")

# str.replace returns the string unchanged when the suffix is absent (for
# example a "_Raw_Gene.csv" route with the transcript pattern). The export
# cells would then write on top of the raw input file, so stop here instead.
if res_route == route:
    raise ValueError(
        "Could not derive an output path from route; "
        "check that the replace() pattern matches the input file name: " + route
    )
print(route)
print(res_route)

# Table Load
Data = TableLoad(route)
print(Data.columns)

./300_MasterTable/MasterTable_DESeq2_Default_Raw.csv
./300_MasterTable/MasterTable_DESeq2_Default_Data.xlsx
Index(['TransIDs', 'TransName', 'TransBioType', 'Chr', 'Start', 'End',
       'Strand', 'GeneIDs', 'GeneNames', 'GeneBioType', 'WT_S1', 'WT_S2',
       'Smug1_KO_S1', 'Smug1_KO_S2', 'log2FC_DESeq2', 'Pvalue_DESeq2',
       'Padj_DESeq2'],
      dtype='object')


In [111]:
# Group information: one inner list of sample columns per group.
# The names must match entries shown by print(Data.columns).
group_info = [
    ["WT_S1", "WT_S2"],
    ["Smug1_KO_S1", "Smug1_KO_S2"],
]
# Group labels, in the same order as group_info.
group_name = ["WT", "Smug1_KO"]
# Every sample column as one flat list.
group_info_flatten = [column for members in group_info for column in members]
GroupInfo_Checker(Data, group_info)

All columns matched


In [116]:
# Minimum value a row must reach in at least one sample column to be kept.
thres = 1
# Step 1: append the per-group "_std_%" and "_VMR" columns.
Data_Filtering = FilteringCondition(Data, group_info, group_name)
# Step 2: keep the rows that reach `thres` in at least one sample column.
Data_Filtering_FPKMoverX = FPKMoverx_atleastonesample(
    Data_Filtering,
    group_info_flatten,
    thres=thres,
)

In [117]:
# Shapes of the loaded table, the table with statistics columns, and the
# thresholded table, printed on one line.
print(*[table.shape for table in (Data, Data_Filtering, Data_Filtering_FPKMoverX)])

(49705, 17) (49705, 21) (39744, 21)


In [120]:
# CSV export of the thresholded table; the threshold is embedded in the file name.
BI_res_route = res_route.replace("_Data.xlsx", f"_Data_ov{thres}_BI.csv")
Data_Filtering_FPKMoverX.to_csv(BI_res_route, index=False)

In [121]:
# Excel export with two sheets:
#   "Raw"              - every row, including the added statistics columns
#   "FPKMover<thres>"  - only the rows that passed the threshold filter
with pd.ExcelWriter(res_route) as writer:
    Data_Filtering.to_excel(writer, sheet_name="Raw", index = False)
    # Write the thresholded table here. Previously this sheet was a second
    # copy of Data_Filtering, so both sheets held identical unfiltered data.
    Data_Filtering_FPKMoverX.to_excel(writer, sheet_name="FPKMover"+str(thres), index = False)