In [1]:
# imports
import pandas as pd
import numpy as np

# adapt the bacterial coverage table to a MAG table from a vOTU table
# you already have a viral vOTU table that is clean

# The coverage of each population genome was calculated as the average of 
# all of its binned contig coverages, weighting each contig by its length in base pairs
df_bin_len = pd.read_csv('200114_contigs_len.csv',sep='\t')

# total length per column: sum() adds up every column of the table (this is a
# total, not a mean); put the totals in a dict and the dict in a one-row df
df_total_len = pd.DataFrame(df_bin_len.sum().to_dict(),index=[df_bin_len.index.values[-1]])

# make the column names (which correspond with the bin names) strings.
# rename() returns a new dataframe, so the result has to be assigned back,
# otherwise this step silently does nothing
df_total_len = df_total_len.rename(columns=str)

# transpose the dataframe so that the bin names become the index
df_total_len = df_total_len.transpose()


# add a column with the bin names because they were the index
df_total_len['index_col'] = df_total_len.index

# give the columns header names
df_total_len.columns = ['total_len', 'bin_name']

# add a number index (the old bin-name index moves into a column called 'index')
df_total_len = df_total_len.reset_index()

# remove the original index that was bin names
del df_total_len['index']

# normalized by the number of metagenomic reads in each sample, 
# calculated as described above for the viral OTU table. Only average coverage 
# values of ≥ 0.25× were retained; lower values were converted to zero

In [7]:
# read the tab-separated contig table and keep only the contig name and its
# length, in that order
df_otu_len = pd.read_csv(
    '200117_map_to_bact_contigs_nofilter.tsv', sep='\t'
).loc[:, ['bact_contig', 'length']]

# set the two headers explicitly
df_otu_len.columns = ['bact_contig', 'length']
#len(df_otu_len)

In [4]:
# # open the dataframe with all contigs matched to bins
# df_bin_otu = pd.read_csv('191112_all_contigs_inbin.csv', sep ='|')

# #  clean this file
# df_bin_otu.columns = df_bin_otu.columns.str.replace(".txt", "")
# df_bin_otu = df_bin_otu.replace({'>':''}, regex=True)



In [1]:
from datetime import datetime

# NOTE: get_column is defined in a later cell and df_bin_otu must already be
# loaded, so this cell only runs after those cells have been executed.
# NOTE(review): get_column returns the name of the column in which a value is
# found, so df_bin_otu presumably has to be the table with one column per bin
# here -- confirm which file it was loaded from.

results = []

# the contig names are stored in the 'bact_contig' column of df_otu_len
bact_contig_list = df_otu_len['bact_contig'].tolist()

# get all column names from the dataframe
columns = list(df_bin_otu.columns)

startTime = datetime.now()
ci_time = datetime.now()

for contig in bact_contig_list:

    # look up the column that holds this contig and store (contig, column name)
    results.append((contig, get_column(df_bin_otu, contig, columns)))

    # progress report every 1000 contigs: number done, total elapsed time
    # and time spent since the previous report
    if len(results) % 1000 == 0:
        print(len(results))
        print("total_time: " + str(datetime.now() - startTime))
        print("current time: " + str(datetime.now() - ci_time))
        ci_time = datetime.now()


# make a dataframe from the results
df_contig_mag = pd.DataFrame.from_records(results, columns=["bact_contig", "magbin"])

# write the contig -> bin matches to file
df_contig_mag.to_csv('bins_matched_to_contigDONTDOTHIS.csv', index=False)

len(df_contig_mag)

In [66]:
# now merge all those dataframes:

# merge the 2 dataframes
#df_otu_bin_len = pd.merge(df_otu_len, df_bin_otu, on='contig', how='outer')

#df_otu_bin_len

In [10]:
# function get_column
def get_column(df, search_value, column_values):
    """Return the label of the column in which ``search_value`` first occurs.

    The rows of ``df`` are scanned from top to bottom; inside a row the first
    cell equal to ``search_value`` decides the answer.

    Parameters
    ----------
    df : object providing ``iterrows()`` (e.g. a pandas DataFrame)
        Table that is searched cell by cell.
    search_value : object
        Value to look for.
    column_values : sequence
        Labels indexed by cell position within a row; the entry at the
        position of the hit is returned.

    Returns
    -------
    object
        ``column_values[position]`` for the first hit, or the string
        ``"not found"`` when no cell equals ``search_value``.
    """
    for _, record in df.iterrows():
        cells = list(record)
        try:
            position = cells.index(search_value)
        except ValueError:
            # value is not in this row, try the next one
            continue
        return column_values[position]

    # every row was checked without a hit
    return "not found"

In [3]:
# combine contig length and bin membership in one dataframe
# load the bins-matched-to-contig file (comma separated, the read_csv default)
df_bin_otu = pd.read_csv('bins_matched_to_contig.csv')

# make sure the contig/length table carries the headers used for the merge
df_otu_len.columns = ['bact_contig', 'length']

# outer join on the bacterial contig name, so rows that occur in only one of
# the two tables are kept as well
df_otu_bin_len = df_otu_len.merge(df_bin_otu, on='bact_contig', how='outer')


In [34]:
# now merge with the total lengths of each bin
# rename the columns so that the bin name column matches the 'magbin' merge key
df_total_len.columns = ['total_len_bin', 'magbin']

# clean the bin names: strip the literal text ".txt.new".
# regex=False is needed here: read as a regular expression every "." would
# match any character, and the default of `regex` differs between pandas
# versions
df_total_len['magbin'] = df_total_len['magbin'].str.replace(".txt.new", "", regex=False)


# merge (outer join: rows without a partner on either side are kept)
df_otu_bin_len_mag = pd.merge(df_otu_bin_len, df_total_len, on='magbin', how='outer')

# calculate the normalization factor for the reads (= contig_len / total_bin_len)
df_otu_bin_len_mag['bin_norm_factor'] = (
    df_otu_bin_len_mag['length'] / df_otu_bin_len_mag['total_len_bin']
)


# len(df_otu_bin_len_mag)

In [17]:
# now merge with the other one (the coverage table)
# open up the coverage table so we can add the bin normalization factor to it
df_covtab = pd.read_csv('200117_map_to_bact_contigs_nofilter.tsv', sep='\t')

# make the columns readable: keep only the part of every header that comes
# before the literal text '_sortedIndexed.bam'. Plain str.split is used so
# that the "." is not interpreted as a regular-expression wildcard
df_covtab.columns = [col.split('_sortedIndexed.bam')[0] for col in df_covtab.columns]


# remove lengths from the coverage table, otherwise we have col length twice
del df_covtab['length']

In [18]:
# merge coverage table and the norm factors
df_covtab_avg_len = pd.merge(df_covtab, df_otu_bin_len_mag, on='bact_contig', how='outer')

# work on a copy so the merged table stays available unchanged
df_copy = df_covtab_avg_len.copy()

# drop certain columns from this df so we don't have them duplicate
del df_copy['length']
del df_copy['total_len_bin']


# multiply all numbers in df with the bin normalization factor for that
# scaffold. The factor column itself is numeric too, so it is left out of the
# selection; otherwise it would be multiplied by itself and end up squared
numeric_cols = df_copy.select_dtypes(include=['number']).columns
coverage_cols = [col for col in numeric_cols if col != 'bin_norm_factor']
df_copy_num = df_copy[coverage_cols].mul(df_copy.bin_norm_factor, axis=0)

# write the scaled numbers back; the other columns of df_copy stay as they are
df_copy[df_copy_num.columns] = df_copy_num


In [20]:
# transpose df_copy: the contig names are set as index first, so after the
# transpose they are the column headers and every former column is a row.
# reset_index() then moves the former column names out of the index into an
# ordinary column, leaving a plain number index
df2 = df_copy.set_index('bact_contig').transpose().reset_index()



In [21]:
# label the column axis 'col_name' so that it is no longer named bact_contig,
# because bact_contig is the key we have to merge on
df2.columns = df2.columns.rename('col_name')


In [22]:
# open normalization_reads for num of reads per dataset
df_norm = pd.read_csv('normalization_reads.csv', sep=',')

# df2 already has a plain number index from the reset_index() done right
# after the transpose. Resetting it a second time would only add a stray
# numeric 'level_0' column and make this cell give a different result every
# time it is re-run, so only the rename is needed here:
# the 'index' column becomes the 'bact_contig' merge key
df2 = df2.rename(columns={'index': 'bact_contig'})



In [23]:
# outer-join the read normalization table with df2 on 'bact_contig' so that
# the normalization values sit next to the numbers they belong to
df_norm_otu = df_norm.merge(df2, on='bact_contig', how='outer')



In [26]:
# # write to file 
# df_norm_otu.to_csv('norm_factor_inclded.csv', index=False) 

# del df_norm_otu['level_0']
# df_norm_otu

In [27]:

# multiply every numeric column with the read normalization factor of its row.
# The factor column itself is left out of the selection so that it is not
# multiplied by itself
numeric_cols = df_norm_otu.select_dtypes(include=['number']).columns
cols_to_scale = [col for col in numeric_cols if col != 'norm_factor_reads']
df_norm_otu_normalized = df_norm_otu[cols_to_scale].mul(df_norm_otu.norm_factor_reads, axis=0)

# NOTE: the result only lives in df_norm_otu_normalized; df_norm_otu itself is
# not modified. (An earlier version re-assigned df_copy at this point, a line
# copied from the bin normalization cell that changed nothing.)
# NOTE(review): only columns with a numeric dtype are scaled -- check the
# dtypes of df_norm_otu to see which columns that really covers.

In [28]:

# df_read[df_read.columns] = df_read

# df_read

#df_norm_otu.apply(pd.to_numeric, errors='ignore')

# flip rows and columns of the merged table
df_norm_otu2 = df_norm_otu.transpose()

In [30]:
# export the transposed table; the normalization for the number of reads was
# done by hand in Excel on this file because it could not be worked out in
# pandas
df_norm_otu2.to_csv(path_or_buf='200205_norm_factor_incldedTrans.csv')

In [32]:
# load the tab-separated table that was produced in Excel
df_normalized = pd.read_csv('200205_bOTU_table_normalized.txt', delimiter='\t')

In [33]:
# Make a list of all magbins
unique_bins = df_normalized["magbin"].unique()

# make 0 into NaN so that zeros are left out of the mean below
# (np.nan instead of np.NaN: the NaN alias no longer exists in NumPy 2.0)
df_normalized = df_normalized.replace(0, np.nan)

# calculate mean abundance for every bin; numeric_only=True skips text
# columns, which would otherwise raise an error in recent pandas versions
df_bin_abundance = df_normalized.groupby(['magbin']).mean(numeric_only=True)

# make nans into zeros again
df_bin_abundance = df_bin_abundance.fillna(0)


# values of ≥ 0.25× were retained; lower values were converted to zero.
# The threshold is applied to an explicit copy of the numeric columns and then
# written back, instead of relying on the private _get_numeric_data() handing
# out a view whose changes happen to show up in df_bin_abundance
num = df_bin_abundance.select_dtypes(include=['number']).copy()
num[num < 0.25] = 0
df_bin_abundance[num.columns] = num

# write bin abundance table to csv
df_bin_abundance.to_csv('200205_bin_abundances.csv')