In [1]:
# NOTE(review): machine-specific absolute path -- point it at your local checkout before running.
# The load_df cell at the end of the notebook reads example.zip from this same directory.
%cd /Users/yehormishchyriak/Desktop/BonhamLab/summer2025/microbiome2function

/Users/yehormishchyriak/Desktop/BonhamLab/summer2025/microbiome2function


In [2]:
import pandas as pd
import M2F

/Users/yehormishchyriak/Desktop/BonhamLab/summer2025/microbiome2function/M2F/dependencies/go-basic.obo: fmt(1.2) rel(2025-07-22) 43,230 Terms


In [3]:
# ENV: load environment variables (load_dotenv presumably reads a local .env file -- confirm).
# Later cells read LOGS_DIR, FETCHED_DATA, OPENAI_API_KEY and DB through getenv.
from os import getenv
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
# Configure M2F logging with the value of the LOGS_DIR environment variable.
logs_dir = getenv("LOGS_DIR")
M2F.configure_logging(logs_dir)

# Load and clean the data

In [5]:
# Read the fetched dataset; the CSV location comes from the FETCHED_DATA environment variable.
fetched_data_path = getenv("FETCHED_DATA")
df = pd.read_csv(fetched_data_path)

In [6]:
# "Protein families" and "Rhea ID" are redundant columns -- we won't use them in training.
# Rebinding the result (instead of inplace=True) together with errors="ignore" keeps this
# cell re-runnable: a second execution no longer raises KeyError for already-dropped columns.
df = df.drop(columns=["Protein families", "Rhea ID"], errors="ignore")

In [7]:
# Per-column flag handed to M2F.clean_cols as `apply_norms`
# (presumably toggles normalization for that column -- confirm against M2F.clean_cols).
# The dict's insertion order also supplies `col_names`, so the two arguments cannot drift apart.
norm_flag_by_col = {
    "Domain [FT]": False,
    "Domain [CC]": True,
    "Gene Ontology (molecular function)": False,
    "Gene Ontology (biological process)": False,
    "Function [CC]": True,
    "Catalytic activity": False,
    "EC number": False,
    "Pathway": True,
    "Cofactor": False,
    "Sequence": False,
}

clean_df = M2F.clean_cols(
    df,
    col_names=list(norm_flag_by_col),
    apply_norms=norm_flag_by_col,
)

In [8]:
# Take an independent copy of the first rows of clean_df; the later cells modify
# clean_df_portion with inplace=True, and the copy keeps clean_df itself untouched.
n_sample_rows = 25
clean_df_portion = clean_df.head(n_sample_rows).copy()

# Initialize the embedding utilities and encode every column of the 25-row sample

In [9]:
# Free-text embedder: the API key is read from OPENAI_API_KEY at call time (never bound to a
# notebook variable) and the cache file path comes from the DB environment variable.
embedding_cache_path = getenv("DB")
txt_embedder = M2F.FreeTXTEmbedder(
    getenv("OPENAI_API_KEY"),
    model="LARGE_OPENAI_MODEL",
    cache_file_path=embedding_cache_path,
    caching_mode="APPEND",
)

# Embedder passed to the "Domain [FT]" and sequence embedding steps in the next cell.
aa_embedder = M2F.AAChainEmbedder()

In [10]:
# Column groups that share an encoder.
free_text_cols = ["Domain [CC]", "Function [CC]", "Catalytic activity", "Pathway"]
go_cols = (
    "Gene Ontology (molecular function)",
    "Gene Ontology (biological process)",
)

# Every M2F call below is invoked with inplace=True on clean_df_portion.
M2F.embed_ft_domains(clean_df_portion, aa_embedder, inplace=True)
M2F.embed_AAsequences(clean_df_portion, aa_embedder, inplace=True)
M2F.embed_freetxt_cols(clean_df_portion, free_text_cols, txt_embedder, inplace=True)
for go_col in go_cols:
    M2F.encode_go(clean_df_portion, go_col, coverage_target=0.9, inplace=True)
M2F.encode_ec(clean_df_portion, "EC number", inplace=True)
M2F.encode_multihot(clean_df_portion, "Cofactor", inplace=True)

# Order the rows by "Entry" and the columns by label.
clean_df_portion.sort_values(by="Entry", inplace=True)
clean_df_portion.sort_index(axis=1, inplace=True)

In [11]:
# Display the encoded sample (rows sorted by "Entry", columns sorted by label in the previous cell).
clean_df_portion

Unnamed: 0,Catalytic activity,Cofactor,Domain [CC],Domain [FT],EC number,Entry,Function [CC],Gene Ontology (biological process),Gene Ontology (molecular function),Pathway,Sequence
7,,"(1, 2)",,"[0.13379566, -0.060112715, 0.49114436, 0.06966...","(2,)",A0A0J1ZC23,"[-0.05099844, -0.00023270496, -0.0110715125, -...","(1,)","(1, 5, 6)",,"[0.10255164, -0.10563319, 0.38230208, 0.127237..."
23,"[-0.0154730175, -0.0021807319, -0.0101796165, ...",,,"[0.40896338, -0.035719194, -0.1859013, -0.2392...","(2,)",A0A0N7J7Y2,,"(1,)","(1,)",,"[0.2541889, -0.339463, 0.113045506, -0.2062210..."
20,,"(0,)",,"[0.36987135, -0.13736856, -0.16509527, -0.2448...","(2,)",A0A0P0GHV0,,"(1,)","(1,)",,"[0.2930923, -0.4519677, -0.12289334, 0.1196395..."
14,"[-0.010813522, -0.00087630766, 0.0063198884, 0...",,,"[0.057893768, -0.26055378, -0.5186857, -0.2708...","(4,)",A0A173XL87,,"(1,)","(1, 4, 7)",,"[0.10617826, -0.22706063, -0.49131176, -0.0819..."
21,,,,,,A0A173YBD4,,"(1, 11)","(1,)",,"[0.5116541, -0.27717066, -0.15654404, 0.004170..."
9,"[0.001421212, 0.038292687, -0.005355382, 0.003...",,,"[-1.0119891, -0.85459846, -0.8319432, 0.615693...","(1,)",A0A174FMU0,"[0.017286675, 0.028886942, 0.001251799, -0.001...","(1,)","(2, 3)","[-0.009604247, 0.056476507, -0.011268787, 0.00...","[0.152202, -0.07339708, -0.23733713, 0.5565645..."
10,"[-0.01820535, -0.013099452, -0.003678478, 0.03...",,,"[0.6780791, -0.62408143, -0.74252397, 0.113041...","(2,)",A0A174G8M5,,"(1,)","(1,)",,"[0.5707834, -0.6171031, -0.6198442, 0.01490820..."
15,"[-0.010813522, -0.00087630766, 0.0063198884, 0...",,,"[0.14752838, -0.19741392, -0.13672641, -0.2462...","(2,)",A0A176U1F6,"[-0.016581077, 0.014960919, 0.004221898, -0.03...","(1,)","(4, 7)",,"[0.22552899, -0.06948886, -0.18292694, -0.0361..."
16,"[-0.0065220785, 0.016731102, 0.003068746, 0.03...",,,"[-0.020345965, -0.21926206, -0.50934, 0.237177...","(1,)",A0A1I0N6K0,,,"(0,)",,"[-0.024644883, -0.54752254, -0.4563743, 0.0705..."
6,"[0.0042206035, 0.018596297, -0.0064877863, 0.0...",,,,"(1,)",A0A2D1TVM1,"[-0.02175193, 0.00042575714, -0.0032149185, 0....","(1,)","(1,)","[-0.015242205, 0.031689215, -0.0056352727, 0.0...","[0.54539245, -0.032479122, -0.1317679, 0.93089..."


# Save the DataFrame, then reload it to inspect the cleaned, numerically encoded data

In [12]:
# Persist the encoded sample via M2F.save_df under the name "example"
# (presumably producing example.zip, the file the next cell loads -- confirm against M2F.save_df).
M2F.save_df(clean_df_portion, name="example")

In [13]:
# Reload the saved archive (presumably the one written by M2F.save_df in the previous cell).
# The path is relative to the working directory set by the %cd at the top of the notebook,
# so it resolves to the same example.zip without hard-coding a machine-specific absolute path.
M2F.load_df("example.zip")

Unnamed: 0,Catalytic activity,Cofactor,Domain [CC],Domain [FT],EC number,Entry,Function [CC],Gene Ontology (biological process),Gene Ontology (molecular function),Pathway,Sequence
0,,"(1, 2)",,"[0.13379566, -0.060112715, 0.49114436, 0.06966...","(2,)",A0A0J1ZC23,"[-0.05099844, -0.00023270496, -0.0110715125, -...","(1,)","(1, 5, 6)",,"[0.10255164, -0.10563319, 0.38230208, 0.127237..."
1,"[-0.0154730175, -0.0021807319, -0.0101796165, ...",,,"[0.40896338, -0.035719194, -0.1859013, -0.2392...","(2,)",A0A0N7J7Y2,,"(1,)","(1,)",,"[0.2541889, -0.339463, 0.113045506, -0.2062210..."
2,,"(0,)",,"[0.36987135, -0.13736856, -0.16509527, -0.2448...","(2,)",A0A0P0GHV0,,"(1,)","(1,)",,"[0.2930923, -0.4519677, -0.12289334, 0.1196395..."
3,"[-0.010813522, -0.00087630766, 0.0063198884, 0...",,,"[0.057893768, -0.26055378, -0.5186857, -0.2708...","(4,)",A0A173XL87,,"(1,)","(1, 4, 7)",,"[0.10617826, -0.22706063, -0.49131176, -0.0819..."
4,,,,,,A0A173YBD4,,"(1, 11)","(1,)",,"[0.5116541, -0.27717066, -0.15654404, 0.004170..."
5,"[0.001421212, 0.038292687, -0.005355382, 0.003...",,,"[-1.0119891, -0.85459846, -0.8319432, 0.615693...","(1,)",A0A174FMU0,"[0.017286675, 0.028886942, 0.001251799, -0.001...","(1,)","(2, 3)","[-0.009604247, 0.056476507, -0.011268787, 0.00...","[0.152202, -0.07339708, -0.23733713, 0.5565645..."
6,"[-0.01820535, -0.013099452, -0.003678478, 0.03...",,,"[0.6780791, -0.62408143, -0.74252397, 0.113041...","(2,)",A0A174G8M5,,"(1,)","(1,)",,"[0.5707834, -0.6171031, -0.6198442, 0.01490820..."
7,"[-0.010813522, -0.00087630766, 0.0063198884, 0...",,,"[0.14752838, -0.19741392, -0.13672641, -0.2462...","(2,)",A0A176U1F6,"[-0.016581077, 0.014960919, 0.004221898, -0.03...","(1,)","(4, 7)",,"[0.22552899, -0.06948886, -0.18292694, -0.0361..."
8,"[-0.0065220785, 0.016731102, 0.003068746, 0.03...",,,"[-0.020345965, -0.21926206, -0.50934, 0.237177...","(1,)",A0A1I0N6K0,,,"(0,)",,"[-0.024644883, -0.54752254, -0.4563743, 0.0705..."
9,"[0.0042206035, 0.018596297, -0.0064877863, 0.0...",,,,"(1,)",A0A2D1TVM1,"[-0.02175193, 0.00042575714, -0.0032149185, 0....","(1,)","(1,)","[-0.015242205, 0.031689215, -0.0056352727, 0.0...","[0.54539245, -0.032479122, -0.1317679, 0.93089..."
