# Imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
import subprocess
import re

# Analysis

In [3]:
# Load csv
df_bru_a1 = pd.read_csv("./counts_bru_a1.csv")
df_bru_a2 = pd.read_csv("./counts_bru_a2.csv")
df_bru_b = pd.read_csv("./counts_bru_b.csv")

df_bru_a1.head()

df_list = [(df_bru_a1, 'bru_a1'), (df_bru_a2, 'bru_a2'), (df_bru_b, 'bru_b')]

In [4]:
# take the species then convert to tsv

for (df, name) in df_list:
    df_species = df.copy()

    for i, row in df_species.iterrows():
        title = df_species.at[i,"organism"]
        new_title = "_".join(title.split("|")[-1].split()[:2]) 
        df_species.at[i,"organism"] = new_title # crude parsing

    df_species = df_species.groupby('organism', as_index=False).sum().sort_values(by="count", ascending=False)

    # convert to tsv
    df_species['organism'].to_csv(f"species_{name}.tsv", sep="\t")

In [5]:
# df_species['organism'].to_csv("test.tsv", sep="\t")

## Analisis Kelimpahan dan Keanekaragaman Mikroba dengan Metode Statistika

In [None]:
# Shannon Diversity Index

df_shannon = df_species.copy()

# calculate proportions
total = df_shannon['count'].sum()
# print(total)
for i, row in df_shannon.iterrows():
    species = df_shannon.at[i,"organism"]
    count = df_shannon.at[i,"count"]
    proportion = count/total
    df_shannon.at[i,"proportion"] = proportion

    # calculate natural log of proportion
    nat_log_p = np.log(proportion)
    df_shannon.at[i,"ln(p)"] = nat_log_p

    # calculate proportion * their natural log
    p_nat_log_p = nat_log_p*proportion
    df_shannon.at[i,"p*ln(p)"] = p_nat_log_p


Unnamed: 0,organism,count,proportion,ln(p),p*ln(p)
1540,Hydrocarboniphaga_daqingensis,2480,0.033597,-3.393317,-0.114005
2081,Methylotenera_mobilis,1353,0.018329,-3.999251,-0.073304
2281,Nevskia_persephonica,1318,0.017855,-4.02546,-0.071875
2683,Pelotomaculum_thermopropionicum,1266,0.017151,-4.065713,-0.06973
2748,Piscinibacter_terrae,1140,0.015444,-4.170547,-0.064409


In [None]:
df_shannon.head()

Unnamed: 0,organism,count,proportion,ln(p),p*ln(p)
1540,Hydrocarboniphaga_daqingensis,2480,0.033597,-3.393317,-0.114005
2081,Methylotenera_mobilis,1353,0.018329,-3.999251,-0.073304
2281,Nevskia_persephonica,1318,0.017855,-4.02546,-0.071875
2683,Pelotomaculum_thermopropionicum,1266,0.017151,-4.065713,-0.06973
2748,Piscinibacter_terrae,1140,0.015444,-4.170547,-0.064409


In [None]:
# calculate shannon diversity
shannon_diversity = -df_shannon["p*ln(p)"].sum()
print(shannon_diversity)

6.5217510917527095


## Analisis Fungsi Mikroba yang Terdapat pada Dataset dengan Prediksi Jalur Fungsional

In [6]:
# run faprotax to match species with function (source: https://pages.uoregon.edu/slouca/LoucaLab/archive/FAPROTAX/lib/php/index.php)

for (_, name) in df_list:
    cmd = [
        "python", "collapse_table.py",
        "-i", f"species_{name}.tsv",
        "-o", f"out_{name}.tsv",
        "-g", "FAPROTAX.txt",
        "-d", "organism",
        "-v",
        "-r", f"report_{name}.txt",
        "-f"
    ]
    
    subprocess.run(cmd)

In [7]:
# use regex to get result

# get summary of functions
pattern = r"(\w+):\s+(\d+)\s+records"
matches_bru_a1 = []
for i, line in enumerate(open(f'report_bru_a1.txt')):
    for match in re.finditer(pattern, line):
        # print(f'Found on line {i+1}: {match.group()}')
        matches_bru_a1.append(match)

matches_bru_a2 = []
for i, line in enumerate(open(f'report_bru_a2.txt')):
    for match in re.finditer(pattern, line):
        # print(f'Found on line {i+1}: {match.group()}')
        matches_bru_a2.append(match)

matches_bru_b = []
for i, line in enumerate(open(f'report_bru_b.txt')):
    for match in re.finditer(pattern, line):
        # print(f'Found on line {i+1}: {match.group()}')
        matches_bru_b.append(match)

In [None]:
df_function = pd.DataFrame(columns=['function','count'])

for match in matches_bru_a1:
    function = match.group(1)
    num_record = int(match.group(2))
    
    if (int(num_record) == 0):
        continue

    # append dataframe
    df_function.loc[len(df_function)] = [function,num_record]

df_function = df_function.sort_values(by='count', ascending=False)
df_function.head()

Unnamed: 0,function,count
71,chemoheterotrophy,254
35,aerobic_chemoheterotrophy,109
56,nitrate_reduction,94
34,fermentation,78
46,animal_parasites_or_symbionts,58


## Correlate Function with Microbe Count

In [112]:
df_correlation = df_function.copy().set_index('function')
df_correlation['sum_proportion'] = 0.0
df_shannon_copy = df_shannon.copy().set_index('organism')

# df_correlation.head()
# df_shannon_copy.head()

function_pattern = r"#\s+(\w+)\s+\(\d+\s+records\):"
species_pattern = r"\s+(\w+)"
for i, line in enumerate(open(f'report_bru_a1.txt')):
    match_function = re.match(function_pattern, line)
    if (match_function):
        # remember function
        function = match_function.group(1)

    species_match = re.match(species_pattern, line)
    if (species_match):
        species = species_match.group().strip()
        # print(function, species)

        # get species proportion, function should match last stored function
        result = df_shannon_copy.loc[species]
        proportion = result['proportion']

        # add proportion to df
        df_correlation.at[function, 'sum_proportion'] += proportion
    
print(df_correlation['sum_proportion'].sum()) # check sum of proportion is one (or close to one if some organism isnt found)

df_correlation = df_correlation.sort_values(by='sum_proportion', ascending=False)
df_correlation.head()

df_correlation.to_csv("functions_bru_a1.csv")

0.9950281781727537


### Other dataset

In [113]:
# bru_a2
df_function = pd.DataFrame(columns=['function','count'])

for match in matches_bru_a2:
    function = match.group(1)
    num_record = int(match.group(2))
    
    if (int(num_record) == 0):
        continue

    # append dataframe
    df_function.loc[len(df_function)] = [function,num_record]

df_function = df_function.sort_values(by='count', ascending=False)
df_function.head()
    
df_correlation = df_function.copy().set_index('function')
df_correlation['sum_proportion'] = 0.0
df_shannon_copy = df_shannon.copy().set_index('organism')

# df_correlation.head()
# df_shannon_copy.head()

function_pattern = r"#\s+(\w+)\s+\(\d+\s+records\):"
species_pattern = r"\s+(\w+)"
for i, line in enumerate(open(f'report_bru_a2.txt')):
    match_function = re.match(function_pattern, line)
    if (match_function):
        # remember function
        function = match_function.group(1)

    species_match = re.match(species_pattern, line)
    if (species_match):
        species = species_match.group().strip()
        # print(function, species)

        # get species proportion, function should match last stored function
        result = df_shannon_copy.loc[species]
        proportion = result['proportion']

        # add proportion to df
        df_correlation.at[function, 'sum_proportion'] += proportion
    
print(df_correlation['sum_proportion'].sum()) # check sum of proportion is one (or close to one if some organism isnt found)

df_correlation = df_correlation.sort_values(by='sum_proportion', ascending=False)
df_correlation.head()

df_correlation.to_csv("functions_bru_a2.csv")

1.0498401430584154


In [114]:
# bru_b
df_function = pd.DataFrame(columns=['function','count'])

for match in matches_bru_b:
    function = match.group(1)
    num_record = int(match.group(2))
    
    if (int(num_record) == 0):
        continue

    # append dataframe
    df_function.loc[len(df_function)] = [function,num_record]

df_function = df_function.sort_values(by='count', ascending=False)
df_function.head()
    
df_correlation = df_function.copy().set_index('function')
df_correlation['sum_proportion'] = 0.0
df_shannon_copy = df_shannon.copy().set_index('organism')

# df_correlation.head()
# df_shannon_copy.head()

function_pattern = r"#\s+(\w+)\s+\(\d+\s+records\):"
species_pattern = r"\s+(\w+)"
for i, line in enumerate(open(f'report_bru_b.txt')):
    match_function = re.match(function_pattern, line)
    if (match_function):
        # remember function
        function = match_function.group(1)

    species_match = re.match(species_pattern, line)
    if (species_match):
        species = species_match.group().strip()
        # print(function, species)

        # get species proportion, function should match last stored function
        result = df_shannon_copy.loc[species]
        proportion = result['proportion']

        # add proportion to df
        df_correlation.at[function, 'sum_proportion'] += proportion
    
print(df_correlation['sum_proportion'].sum()) # check sum of proportion is one (or close to one if some organism isnt found)

df_correlation = df_correlation.sort_values(by='sum_proportion', ascending=False)
df_correlation.head()

df_correlation.to_csv("functions_bru_b.csv")

1.0710279614175784
