In [1]:
import rdflib
from rdflib import Graph, Namespace, RDF, RDFS, OWL, URIRef, Literal, XSD

# Create graph
g = Graph()

# Define Namespaces
schema  = Namespace("http://schema.org/")
rdf     = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
rdfs    = Namespace("http://www.w3.org/2000/01/rdf-schema#")
owl     = Namespace("http://www.w3.org/2002/07/owl#")
xsd     = Namespace("http://www.w3.org/2001/XMLSchema#")
fo      = Namespace("https://purl.org/ontology/fo/")
dbr     = Namespace("http://dbpedia.org/resource/")
ex      = Namespace("http://example.org/")
foodon = Namespace("http://purl.obolibrary.org/obo/FOODON_")
wikidata = Namespace("http://www.wikidata.org/entity/")
OBO = Namespace("http://purl.obolibrary.org/obo/")

g.bind("schema", schema)
g.bind("rdf", rdf)
g.bind("rdfs", rdfs)
g.bind("owl", owl)
g.bind("xsd", xsd)
g.bind("fo", fo)
g.bind("dbr",dbr)
g.bind("ex", ex)
g.bind("obo", OBO)
g.bind("wikidata", wikidata)

# List of subclass relationships
subclass_relationships = [
    # Subclass Macronutrients
    ("Weight","Macronutrients"),("Calories","Macronutrients"),("Fat","Macronutrients"),("Protein","Macronutrients"),("Carbohydrate","Macronutrients"),("Sugars","Macronutrients"),("Fiber","Macronutrients"),("Cholesterol","Macronutrients"),("SaturatedFats","Macronutrients"),("NetCarbs","Macronutrients"),("TransFattyAcids","Macronutrients"),

    # Subclass Minerals
    ("Calcium", "Minerals"), ("Iron", "Minerals"), ("Potassium", "Minerals"),
    ("Magnesium", "Minerals"), ("Phosphorus", "Minerals"), ("Sodium", "Minerals"),
    ("Zinc", "Minerals"), ("Copper", "Minerals"), ("Manganese", "Minerals"),
    ("Selenium", "Minerals"), ("Fluoride", "Minerals"), ("Molybdenum", "Minerals"),
    ("Iodine", "Minerals"), ("Chloride", "Minerals"), ("Chromium", "Minerals"),

    # Subclass Vitamins
    ("VitaminA", "Vitamins"), ("VitaminC", "Vitamins"), ("ThiaminB1", "Vitamins"),
    ("RiboflavinB2", "Vitamins"), ("NiacinB3", "Vitamins"), ("VitaminB5", "Vitamins"),
    ("VitaminB6", "Vitamins"), ("Biotin", "Vitamins"), ("Folate", "Vitamins"),
    ("Folicacid", "Vitamins"), ("Foodfolate", "Vitamins"), ("FolateDFE", "Vitamins"),
    ("Choline", "Vitamins"), ("VitaminB12", "Vitamins"), ("Retinol", "Vitamins"),
    ("Carotenebeta", "Vitamins"), ("Carotenealpha", "Vitamins"),
    ("Cryptoxanthinbeta", "Vitamins"), ("VitaminAIU", "Vitamins"),
    ("Lycopene", "Vitamins"), ("LutZeaxanthin", "Vitamins"), ("VitaminE", "Vitamins"),
    ("VitaminD", "Vitamins"), ("VitaminD2", "Vitamins"), ("VitaminD3", "Vitamins"),
    ("VitaminDIU", "Vitamins"), ("VitaminK", "Vitamins"), ("VitaminK1", "Vitamins"),
    ("Menaquinone4", "Vitamins"),

    # Subclass Others
    ("Water", "OtherComponent"), ("Ash", "OtherComponent"), ("Alcohol", "OtherComponent"),
    ("Caffeine", "OtherComponent"), ("Theobromine", "OtherComponent"), ("PRALscore", "OtherComponent"),

    # Subclass Carbohydrates
    ("SolubleFiber", "Carbohydrates"), ("InsolubleFiber", "Carbohydrates"), ("AddedSugar", "Carbohydrates"),
    ("Sucrose", "Carbohydrates"), ("Glucose", "Carbohydrates"), ("Fructose", "Carbohydrates"),
    ("Lactose", "Carbohydrates"), ("Maltose", "Carbohydrates"), ("Galactose", "Carbohydrates"),
    ("Strarch", "Carbohydrates"), ("otherSugar", "Carbohydrates"), ("SugarAlcohols", "Carbohydrates"),

    # Subclass Fats
    ("TotalMonounsaturated","Fats"),("TotalPolyunsaturated","Fats"),("Omega3s","Fats"),("Omega6s","Fats"),("Omega3toOmega6Ratio","Fats"),("Omega6toOmega3Ratio","Fats"),

    # Subclass Omega3Fat
    ("alphaLinolenicAcid", "Omega3Fat"), ("EicosapentaenoicAcid", "Omega3Fat"), ("DocosapentaenoicAcid", "Omega3Fat"), ("DocosahexaenoicAcid", "Omega3Fat"), ("n3EicosatrienoicAcid", "Omega3Fat"),

    # Subclass Omega6Fat
    ("alphaLinolenicAcid", "Omega6Fat"),("gammaLinolenicAcid", "Omega6Fat"),("ciscisn6EicosadienoicAcid", "Omega6Fat"),("DihomogammalinolenicAcid", "Omega6Fat"),("ArachidonicAcid", "Omega6Fat"),("Butyric", "Omega6Fat"),("Caproic", "Omega6Fat"),("CaprylicAcid", "Omega6Fat"), ("CapricAcid", "Omega6Fat"), ("LauricAcid", "Omega6Fat"), ("TridecanoicAcid", "Omega6Fat"), ("MyristicAcid", "Omega6Fat"),("PentadecanoicAcid", "Omega6Fat"),("PalmiticAcid", "Omega6Fat"),("MargaricAcid", "Omega6Fat"),("StearicAcid", "Omega6Fat"),("ArachidicAcid", "Omega6Fat"),("BehenicAcid", "Omega6Fat"),("LignocericAcid", "Omega6Fat"),

    # Subclass Monounsaturated
    ("MyristoleicAcid", "MonounsaturatedFat"), ("PentadecenoicAcid", "MonounsaturatedFat"), ("PalmitoleicAcid", "MonounsaturatedFat"),
    ("cisPalmitoleicAcid", "MonounsaturatedFat"), ("HeptadecenoicAcid", "MonounsaturatedFat"), ("OleicAcid", "MonounsaturatedFat"),
    ("cisOleicAcid", "MonounsaturatedFat"), ("cisVaccenicAcid", "MonounsaturatedFat"), ("GadoleicAcid", "MonounsaturatedFat"), ("DocosenoicAcid", "MonounsaturatedFat"), ("cisDocosenoicAcid", "MonounsaturatedFat"), ("cisTetracosenoicAcid", "MonounsaturatedFat"),

    # Subclass Polyunsaturated
    ("OctadecadienoicAcid","PolyunsaturatedFat"),("ConjugatedLinoleicAcid","PolyunsaturatedFat"),("iOctadecadienoicAcid","PolyunsaturatedFat"),("undifferentiatedOctadecatrienoic","PolyunsaturatedFat"),("OctadecatrienoicAcid","PolyunsaturatedFat"),("ParinaricAcid","PolyunsaturatedFat"),("EicosatrienoicAcid","PolyunsaturatedFat"),("EicosatetraenoicAcid","PolyunsaturatedFat"),("HeneicosapentaenoicAcid","PolyunsaturatedFat"),("22:4","PolyunsaturatedFat"),

    # Subclass TransFat
    ("TransPolyenoicFats","TransFat"),("TransMonoenoicFats","TransFat"),("transPalmitoleicAcid","TransFat"),("transOleicAcid","TransFat"),("transDocosenoicAcid","TransFat"),("transOctadecadienoicAcid","TransFat"),("transtransOctadecadienoicAcid","TransFat"),

    # Subclass Phytosterols
    ("TotalPhytosterols","Phytosterols"),("Stigmasterol","Phytosterols"),("Campesterol","Phytosterols"),("Betasitosterol","Phytosterols"),

    # Subclass Fats
    ("Omega3Fat", "Fats"), ("Omega6Fat", "Fats"), ("MonounsaturatedFat", "Fats"),("PolyunsaturatedFat", "Fats"), ("TransFat", "Fats"), ("Phytosterols", "Fats"),

    # Subclass EssentialAminoAcid
    ("Histidine","EssentialAminoAcid"),("Isoleucine","EssentialAminoAcid"),("Leucine","EssentialAminoAcid"),("Lysine","EssentialAminoAcid"),("Methionine","EssentialAminoAcid"),("Phenylalanine","EssentialAminoAcid"),("Threonine","EssentialAminoAcid"),("Tryptophan","EssentialAminoAcid"),("Valine","EssentialAminoAcid"),

    # Subclass ConditionallyEssentialAminoAcid
    ("Arginine","ConditionallyEssentialAminoAcid"),("Cystine","ConditionallyEssentialAminoAcid"),("Glycine","ConditionallyEssentialAminoAcid"),("Proline","ConditionallyEssentialAminoAcid"),("Tyrosine","ConditionallyEssentialAminoAcid"),

    # Subclass NonEssentialAminoAcid
    ("Alanine","NonEssentialAminoAcid"),("Asparticacid","NonEssentialAminoAcid"),("Betaine","NonEssentialAminoAcid"),("Glutamicacid","NonEssentialAminoAcid"),("Hydroxyproline","NonEssentialAminoAcid"),("Serine","NonEssentialAminoAcid"),

    # Subclass Amino Acids
    ("EssentialAminoAcid", "AminoAcids"),
    ("ConditionallyEssentialAminoAcid", "AminoAcids"),
    ("NonEssentialAminoAcid", "AminoAcids"),

    # General classes
    ("Macronutrients", "Food"),
    ("Minerals", "Food"),
    ("Vitamins", "Food"),
    ("OtherComponent", "Food"),
    ("Carbohydrates", "Food"),
    ("Fats", "Food"),
    ("AminoAcids", "Food"),
]

# Create and define each class
all_classes = set()
for subclass, superclass in subclass_relationships:
    subclass_uri = getattr(ex, subclass)
    superclass_uri = getattr(ex, superclass)

    # Declare both subclass and superclass as OWL classes
    if subclass not in all_classes:
        g.add((subclass_uri, RDF.type, OWL.Class))
        all_classes.add(subclass)
        g.add((ex.Weight, RDFS.subClassOf, subclass_uri))
        g.add((ex.DailyValue, RDFS.subClassOf, subclass_uri))
        g.add((ex.unite, RDFS.subClassOf, subclass_uri))

    if superclass not in all_classes:
        g.add((superclass_uri, RDF.type, OWL.Class))
        all_classes.add(superclass)

    # Establish hierarchy
    g.add((subclass_uri, RDFS.subClassOf, superclass_uri))

# Define data property for food name
g.add((ex.hasName, RDF.type, OWL.DatatypeProperty))
g.add((ex.hasName, RDFS.range, xsd.string))

# Define data property for food weight (e.g., in grams)
g.add((ex.hasWeight, RDF.type, OWL.DatatypeProperty))
g.add((ex.hasWeight, RDFS.range, xsd.float))

# Define data property for food weight (e.g., in grams)
g.add((ex.hasUnite, RDF.type, OWL.DatatypeProperty))
g.add((ex.hasUnite, RDFS.range, xsd.string))

g.add((ex.hasDailyValue, RDF.type, OWL.DatatypeProperty))
g.add((ex.hasDailyValue, RDFS.range, xsd.float))

<Graph identifier=Nf24f24f26e024d2eb114a501680a1088 (<class 'rdflib.graph.Graph'>)>

# Defining vitamins, amino acids, minerals, etc from existing ontologies

Macronutrients

In [2]:
calories = URIRef(OBO["NCIT_C15646"])
fats = URIRef(OBO["NCIT_C15224"])
protein = URIRef(OBO["NCIT_C15920"])
carbohydrates = URIRef(OBO["NCIT_C15223"])
sugars = URIRef(OBO["NCIT_C71939"])
fiber = URIRef(OBO["NCIT_C15225"])
cholesterol = URIRef(OBO["NCIT_C15205"])
saturated_fats = URIRef(OBO["NCIT_C68421"])
net_carbs = URIRef(OBO["NCIT_C68492"])
trans_fatty_acids = URIRef(OBO["NCIT_C68440"])

In [3]:
g.add((ex.Calories, OWL.equivalentClass, calories))
g.add((ex.Fat, OWL.equivalentClass, fats))
g.add((ex.Protein, OWL.equivalentClass, protein))
g.add((ex.Carbohydrate, OWL.equivalentClass, carbohydrates))
g.add((ex.Sugars, OWL.equivalentClass, sugars))
g.add((ex.Fiber, OWL.equivalentClass, fiber))
g.add((ex.Cholesterol, OWL.equivalentClass, cholesterol))
g.add((ex.SaturatedFats, OWL.equivalentClass, saturated_fats))
g.add((ex.NetCarbs, OWL.equivalentClass, net_carbs))
g.add((ex.TransFattyAcids, OWL.equivalentClass, trans_fatty_acids))

<Graph identifier=Nf24f24f26e024d2eb114a501680a1088 (<class 'rdflib.graph.Graph'>)>

Minerals

In [4]:
# calcium = URIRef(OBO["NCIT_C68241"])
# iron = URIRef(OBO["NCIT_C68256"])
# potassium = URIRef(OBO["NCIT_C68279"])
# magnesium = URIRef(OBO["NCIT_C68264"])
# phosphorus = URIRef(OBO["NCIT_C68277"])
# sodium = URIRef(OBO["NCIT_C68287"])
# zinc = URIRef(OBO["NCIT_C68295"])
# copper = URIRef(OBO["NCIT_C68249"])
# manganese = URIRef(OBO["NCIT_C68266"])
# selenium = URIRef(OBO["NCIT_C68281"])
# fluoride = URIRef(OBO["NCIT_C68252"])
# molybdenum = URIRef(OBO["NCIT_C68272"])
# iodine = URIRef(OBO["NCIT_C68254"])
# chloride = URIRef(OBO["NCIT_C68243"])
# chromium = URIRef(OBO["NCIT_C68245"])

In [5]:
# g.add((ex.Calcium, OWL.equivalentClass, calcium))
# g.add((ex.Iron, OWL.equivalentClass, iron))
# g.add((ex.Potassium, OWL.equivalentClass, potassium))
# g.add((ex.Magnesium, OWL.equivalentClass, magnesium))
# g.add((ex.Phosphorus, OWL.equivalentClass, phosphorus))
# g.add((ex.Sodium, OWL.equivalentClass, sodium))
# g.add((ex.Zinc, OWL.equivalentClass, zinc))
# g.add((ex.Copper, OWL.equivalentClass, copper))
# g.add((ex.Manganese, OWL.equivalentClass, manganese))
# g.add((ex.Selenium, OWL.equivalentClass, selenium))
# g.add((ex.Fluoride, OWL.equivalentClass, fluoride))
# g.add((ex.Molybdenum, OWL.equivalentClass, molybdenum))
# g.add((ex.Iodine, OWL.equivalentClass, iodine))
# g.add((ex.Chloride, OWL.equivalentClass, chloride))
# g.add((ex.Chromium, OWL.equivalentClass, chromium))

Vitamins

In [6]:
# vitaminA = URIRef(OBO["NCIT_C938"])
# vitaminC = URIRef(OBO["NCIT_C68507"])
# thiaminB1 = URIRef(OBO["NCIT_C874"])
# riboflavinB2 = URIRef(OBO["NCIT_C808"])
# niacinB3 = URIRef(OBO["NCIT_C689"])
# vitaminB5 = URIRef(OBO["NCIT_C47783"])
# vitaminB6 = URIRef(OBO["NCIT_C1334"])
# biotin = URIRef(OBO["NCIT_C309"])
# folate = URIRef(OBO["NCIT_C1444"])
# folicacid = URIRef(OBO["NCIT_C510"])
# foodfolate = URIRef(OBO["NCIT_C68512"])
# folateDFE = URIRef(OBO["NCIT_C68513"])
# choline = URIRef(OBO["NCIT_C61674"])
# vitaminB12 = URIRef(OBO["NCIT_C173805"])
# retinol = URIRef(OBO["NCIT_C68302"])
# carotenebeta = URIRef(OBO["NCIT_C1016"])
# carotenealpha = URIRef(OBO["NCIT_C68304"])
# cryptoxanthinbeta = URIRef(OBO["NCIT_C68306"])
# #vitaminAIU = URIRef(OBO[""])
# lycopene = URIRef(OBO["NCIT_C2226"])
# lutzeaxanthin = URIRef(OBO["NCIT_C68310"])
# vitaminE = URIRef(OBO["NCIT_C942"])
# vitaminD = URIRef(OBO["NCIT_C941"])
# vitaminD2 = URIRef(OBO["NCIT_C29029"])
# vitaminD3 = URIRef(OBO["NCIT_C48194"])
# #vitaminDIU = URIRef(OBO[""])
# vitaminK = URIRef(OBO["NCIT_C943"])
# vitaminK1 = URIRef(OBO["NCIT_C29365"])
# menaquinone4 = URIRef(OBO["NCIT_C68319"])

In [7]:
# g.add((ex.VitaminA, OWL.equivalentClass, vitaminA))
# g.add((ex.VitaminC, OWL.equivalentClass, vitaminC))
# g.add((ex.ThiaminB1, OWL.equivalentClass, thiaminB1))
# g.add((ex.RiboflavinB2, OWL.equivalentClass, riboflavinB2))
# g.add((ex.NiacinB3, OWL.equivalentClass, niacinB3))
# g.add((ex.VitaminB5, OWL.equivalentClass, vitaminB5))
# g.add((ex.VitaminB6, OWL.equivalentClass, vitaminB6))
# g.add((ex.Biotin, OWL.equivalentClass, biotin))
# g.add((ex.Folate, OWL.equivalentClass, folate))
# g.add((ex.Folicacid, OWL.equivalentClass, folicacid))
# g.add((ex.Foodfolate, OWL.equivalentClass, foodfolate))
# g.add((ex.FolateDFE, OWL.equivalentClass, folateDFE))
# g.add((ex.Choline, OWL.equivalentClass, choline))
# g.add((ex.VitaminB12, OWL.equivalentClass, vitaminB12))
# g.add((ex.Retinol, OWL.equivalentClass, retinol))
# g.add((ex.Carotenebeta, OWL.equivalentClass, carotenebeta))
# g.add((ex.Carotenealpha, OWL.equivalentClass, carotenealpha))
# g.add((ex.Cryptoxanthinbeta, OWL.equivalentClass, cryptoxanthinbeta))
# #g.add((ex.VitaminAIU, OWL.equivalentClass, vitaminAIU))
# g.add((ex.Lycopene, OWL.equivalentClass, lycopene))
# g.add((ex.LutZeaxanthin, OWL.equivalentClass, lutzeaxanthin))
# g.add((ex.VitaminE, OWL.equivalentClass, vitaminE))
# g.add((ex.VitaminD, OWL.equivalentClass, vitaminD))
# g.add((ex.VitaminD2, OWL.equivalentClass, vitaminD2))
# g.add((ex.VitaminD3, OWL.equivalentClass, vitaminD3))
# #g.add((ex.VitaminDIU, OWL.equivalentClass, vitaminDIU))
# g.add((ex.VitaminK, OWL.equivalentClass, vitaminK))
# g.add((ex.VitaminK1, OWL.equivalentClass, vitaminK1))
# g.add((ex.Menaquinone4, OWL.equivalentClass, menaquinone4))

Essential amino acids

In [8]:
histidine = URIRef(OBO["OMIT_0007798"])
isoleucine = URIRef(OBO["OMIT_0008647"])
leucine = URIRef(OBO["OMIT_0009021"])
lysine = URIRef(OBO["OMIT_0009315"])
methionine = URIRef(OBO["OMIT_0009774"])
phenylalanine = URIRef(OBO["OMIT_0011637"])
threonine = URIRef(OBO["OMIT_0014757"])
tryptophan = URIRef(OBO["OMIT_0015201"])
valine = URIRef(OBO["OMIT_0015467"])

In [9]:
g.add((ex.Histidine, OWL.equivalentClass, histidine))
g.add((ex.Isoleucine, OWL.equivalentClass, isoleucine))
g.add((ex.Leucine, OWL.equivalentClass, leucine))
g.add((ex.Lysine, OWL.equivalentClass, lysine))
g.add((ex.Methionine, OWL.equivalentClass, methionine))
g.add((ex.Phenylalanine, OWL.equivalentClass, phenylalanine))
g.add((ex.Threonine, OWL.equivalentClass, threonine))
g.add((ex.Tryptophan, OWL.equivalentClass, tryptophan))
g.add((ex.Valine, OWL.equivalentClass, valine))

<Graph identifier=Nf24f24f26e024d2eb114a501680a1088 (<class 'rdflib.graph.Graph'>)>

# Generate graph

In [15]:
# Serialize and save the ontology
g.serialize("C:/Users/laure/OneDrive/Documents/GitHub/KG_project/nutrition_structure_short.ttl", format="turtle")

<Graph identifier=Nf24f24f26e024d2eb114a501680a1088 (<class 'rdflib.graph.Graph'>)>

In [16]:
import networkx as nx
import matplotlib.pyplot as plt

# Create a NetworkX directed graph
G = nx.DiGraph()

# Add nodes and edges from RDF graph
for subj, pred, obj in g:
    G.add_node(str(subj), label=str(subj).split("/")[-1])  # Use last part of URI as label
    G.add_node(str(obj), label=str(obj).split("/")[-1])
    G.add_edge(str(subj), str(obj), label=str(pred).split("#")[-1])  # Use last part of predicate as edge label

# Draw the graph
nx.write_gexf(G, "C:/Users/laure/OneDrive/Documents/GitHub/KG_project/structure_short.gexf")

# Add items to the graph

In [10]:
def separate_values_and_units(text):
    if(text == '' or text == '--%'):
        return [(0.0,'%')]
    elif(text == '--' or text == '--mcg' or text == '--mg' or text == '--g' or text == '--IU'):
        return [(0.0,'g')]
    pattern = r"(\d*\.?\d*)([a-zA-Z%]*)"
    matches = re.findall(pattern, text)
    return [(value.strip(), unit.strip()) for value, unit in matches if value or unit]

In [11]:
Match_data_with_csv = [
    # Subclass Macronutrients
    ("Weight","Macronutrients"),("Calories","Macronutrients"),("Fat","Macronutrients"),("Protein","Macronutrients"),("Carbohydrate","Macronutrients"),("Sugars","Macronutrients"),("Fiber","Macronutrients"),("Cholesterol","Macronutrients"),("SaturatedFats","Macronutrients"),("NetCarbs","Macronutrients"),("TransFattyAcids","Macronutrients"),

    # Subclass Minerals
    ("Calcium", "Minerals"), ("Iron", "Minerals"), ("Potassium", "Minerals"),
    ("Magnesium", "Minerals"), ("Phosphorus", "Minerals"), ("Sodium", "Minerals"),
    ("Zinc", "Minerals"), ("Copper", "Minerals"), ("Manganese", "Minerals"),
    ("Selenium", "Minerals"), ("Fluoride", "Minerals"), ("Molybdenum", "Minerals"),
    ("Iodine", "Minerals"), ("Chloride", "Minerals"), ("Chromium", "Minerals"),

    # Subclass Vitamins
    ("VitaminA", "Vitamins"), ("VitaminC", "Vitamins"), ("ThiaminB1", "Vitamins"),
    ("RiboflavinB2", "Vitamins"), ("NiacinB3", "Vitamins"), ("VitaminB5", "Vitamins"),
    ("VitaminB6", "Vitamins"), ("Biotin", "Vitamins"), ("Folate", "Vitamins"),
    ("Folicacid", "Vitamins"), ("Foodfolate", "Vitamins"), ("FolateDFE", "Vitamins"),
    ("Choline", "Vitamins"), ("VitaminB12", "Vitamins"), ("Retinol", "Vitamins"),
    ("Carotenebeta", "Vitamins"), ("Carotenealpha", "Vitamins"),
    ("Cryptoxanthinbeta", "Vitamins"), ("VitaminAIU", "Vitamins"),
    ("Lycopene", "Vitamins"), ("LutZeaxanthin", "Vitamins"), ("VitaminE", "Vitamins"),
    ("VitaminD", "Vitamins"), ("VitaminD2", "Vitamins"), ("VitaminD3", "Vitamins"),
    ("VitaminDIU", "Vitamins"), ("VitaminK", "Vitamins"), ("VitaminK1", "Vitamins"),
    ("Menaquinone4", "Vitamins"),

    # Subclass Others
    ("Water", "OtherComponent"), ("Ash", "OtherComponent"), ("Alcohol", "OtherComponent"),
    ("Caffeine", "OtherComponent"), ("Theobromine", "OtherComponent"), ("PRALscore", "OtherComponent"),

    # Subclass Carbohydrates
    ("SolubleFiber", "Carbohydrates"), ("InsolubleFiber", "Carbohydrates"), ("AddedSugar", "Carbohydrates"),
    ("Sucrose", "Carbohydrates"), ("Glucose", "Carbohydrates"), ("Fructose", "Carbohydrates"),
    ("Lactose", "Carbohydrates"), ("Maltose", "Carbohydrates"), ("Galactose", "Carbohydrates"),
    ("Strarch", "Carbohydrates"), ("otherSugar", "Carbohydrates"), ("SugarAlcohols", "Carbohydrates"),

    # Subclass Fats
    ("TotalMonounsaturated","Fats"),("TotalPolyunsaturated","Fats"),("Omega3s","Fats"),("Omega6s","Fats"),("Omega3toOmega6Ratio","Fats"),("Omega6toOmega3Ratio","Fats"),

    # Subclass Omega3Fat
    ("alphaLinolenicAcid", "Omega3Fat"), ("EicosapentaenoicAcid", "Omega3Fat"), ("DocosapentaenoicAcid", "Omega3Fat"), ("DocosahexaenoicAcid", "Omega3Fat"), ("n3EicosatrienoicAcid", "Omega3Fat"),

    # Subclass Omega6Fat
    ("alphaLinolenicAcid", "Omega6Fat"),("gammaLinolenicAcid", "Omega6Fat"),("ciscisn6EicosadienoicAcid", "Omega6Fat"),("DihomogammalinolenicAcid", "Omega6Fat"),("ArachidonicAcid", "Omega6Fat"),("Butyric", "Omega6Fat"),("Caproic", "Omega6Fat"),("CaprylicAcid", "Omega6Fat"), ("CapricAcid", "Omega6Fat"), ("LauricAcid", "Omega6Fat"), ("TridecanoicAcid", "Omega6Fat"), ("MyristicAcid", "Omega6Fat"),("PentadecanoicAcid", "Omega6Fat"),("PalmiticAcid", "Omega6Fat"),("MargaricAcid", "Omega6Fat"),("StearicAcid", "Omega6Fat"),("ArachidicAcid", "Omega6Fat"),("BehenicAcid", "Omega6Fat"),("LignocericAcid", "Omega6Fat"),

    # Subclass Monounsaturated
    ("MyristoleicAcid", "MonounsaturatedFat"), ("PentadecenoicAcid", "MonounsaturatedFat"), ("PalmitoleicAcid", "MonounsaturatedFat"),
    ("cisPalmitoleicAcid", "MonounsaturatedFat"), ("HeptadecenoicAcid", "MonounsaturatedFat"), ("OleicAcid", "MonounsaturatedFat"),
    ("cisOleicAcid", "MonounsaturatedFat"), ("cisVaccenicAcid", "MonounsaturatedFat"), ("GadoleicAcid", "MonounsaturatedFat"), ("DocosenoicAcid", "MonounsaturatedFat"), ("cisDocosenoicAcid", "MonounsaturatedFat"), ("cisTetracosenoicAcid", "MonounsaturatedFat"),

    # Subclass Polyunsaturated
    ("OctadecadienoicAcid","PolyunsaturatedFat"),("ConjugatedLinoleicAcid","PolyunsaturatedFat"),("iOctadecadienoicAcid","PolyunsaturatedFat"),("undifferentiatedOctadecatrienoic","PolyunsaturatedFat"),("OctadecatrienoicAcid","PolyunsaturatedFat"),("ParinaricAcid","PolyunsaturatedFat"),("EicosatrienoicAcid","PolyunsaturatedFat"),("EicosatetraenoicAcid","PolyunsaturatedFat"),("HeneicosapentaenoicAcid","PolyunsaturatedFat"),("22:4","PolyunsaturatedFat"),

    # Subclass TransFat
    ("TransPolyenoicFats","TransFat"),("TransMonoenoicFats","TransFat"),("transPalmitoleicAcid","TransFat"),("transOleicAcid","TransFat"),("transDocosenoicAcid","TransFat"),("transOctadecadienoicAcid","TransFat"),("transtransOctadecadienoicAcid","TransFat"),

    # Subclass Phytosterols
    ("TotalPhytosterols","Phytosterols"),("Stigmasterol","Phytosterols"),("Campesterol","Phytosterols"),("Betasitosterol","Phytosterols"),

    # Subclass EssentialAminoAcid
    ("Histidine","EssentialAminoAcid"),("Isoleucine","EssentialAminoAcid"),("Leucine","EssentialAminoAcid"),("Lysine","EssentialAminoAcid"),("Methionine","EssentialAminoAcid"),("Phenylalanine","EssentialAminoAcid"),("Threonine","EssentialAminoAcid"),("Tryptophan","EssentialAminoAcid"),("Valine","EssentialAminoAcid"),

    # Subclass ConditionallyEssentialAminoAcid
    ("Arginine","ConditionallyEssentialAminoAcid"),("Cystine","ConditionallyEssentialAminoAcid"),("Glycine","ConditionallyEssentialAminoAcid"),("Proline","ConditionallyEssentialAminoAcid"),("Tyrosine","ConditionallyEssentialAminoAcid"),

    # Subclass NonEssentialAminoAcid
    ("Alanine","NonEssentialAminoAcid"),("Asparticacid","NonEssentialAminoAcid"),("Betaine","NonEssentialAminoAcid"),("Glutamicacid","NonEssentialAminoAcid"),("Hydroxyproline","NonEssentialAminoAcid"),("Serine","NonEssentialAminoAcid"),
]

In [12]:
from hashlib import sha256
from rdflib import URIRef

def generate_unique_iri(base_uri, food_uri, nutrient_name, measure):
    unique_id = sha256(f"{food_uri}_{nutrient_name}_{measure}".encode()).hexdigest()[:10]
    return URIRef(f"{base_uri}{nutrient_name.replace(' ', '_')}_{unique_id}")

In [13]:
def separate_values_and_units_boolean(text):
    if(text == '' or text == '--%'):
        return True
    elif(text == '--' or text == '--mcg' or text == '--mg' or text == '--g' or text == '--IU'):
        return True

    return False

In [14]:
import csv
import re
import os
from rdflib import Graph, URIRef, Literal, XSD, RDF

folder_path = 'C:/Users/laure/OneDrive/Documents/GitHub/KG_project/Nutrition/'
count = 0
correct_count = 0

# Define required nutrients
REQUIRED_NUTRIENTS = {
    "Weight", "Calories (kcal)", "Protein", "Histidine", "Isoleucine", "Leucine",
    "Lysine", "Methionine", "Phenylalanine", "Threonine", "Tryptophan", "Valine"
}

# Iterate over all files in the specified folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        url = os.path.join(folder_path, filename)
        c = 0

        # Extract food name from filename
        match = re.search(r'([^/]+)(?=_\d+\.csv$)', filename)
        if match:
            result = match.group(1)

        # Read the CSV file
        with open(url, mode='r') as file:
            csvFile = csv.reader(file)
            next(csvFile)  # Skip header

            # Track nutrients and triples
            nutrient_tracker = set()
            nutrient_triples = []  # Temporary storage for triples

            food_uri = URIRef(ex[result])

            # Iterate over CSV rows
            for lines in csvFile:
                category_name = lines[0]
                if category_name not in {"Macronutrients", "Essential Amino Acids"}:
                    continue

                value_name = lines[1]
                if not(separate_values_and_units_boolean(lines[2])):
                    measureUnity = separate_values_and_units(lines[2])
                    measure = float(measureUnity[0][0])
                    measure_unit = measureUnity[0][1]
                    daily_percentage = float(separate_values_and_units(lines[3])[0][0])

                    if value_name in REQUIRED_NUTRIENTS:
                        nutrient_tracker.add(value_name)

                        feature_uri = ex[Match_data_with_csv[c][0]]
                        nutrient_uri = generate_unique_iri(ex, food_uri, value_name, measure)

                        # Store triples in a list first
                        nutrient_triples.extend([
                            (nutrient_uri, RDF.type, feature_uri),
                            (nutrient_uri, ex.hasWeight, Literal(measure, datatype=XSD.float)),
                            (nutrient_uri, ex.hasUnit, Literal(measure_unit, datatype=XSD.string)),
                            (nutrient_uri, ex.hasDailyValue, Literal(daily_percentage, datatype=XSD.float)),
                            (food_uri, ex.hasNutrient, nutrient_uri)
                        ])

            # Only add if all required nutrients are present
            if REQUIRED_NUTRIENTS.issubset(nutrient_tracker):
                correct_count+=1
                g.add((food_uri, RDF.type, ex.Food))
                g.add((food_uri, ex.hasName, Literal(result, datatype=XSD.string)))

                # Add all stored triples
                for triple in nutrient_triples:
                    g.add(triple)
            else:
                # Remove stored triples if food does not meet the criteria
                for triple in nutrient_triples:
                    g.remove(triple)

    count += 1
    if count % 1000 == 0:
        break

print(correct_count)
# 80806 values in the end

30


# Checking the graph nodes and edges

In [26]:
food_name = "http://example.org/-_Annies_Dressing_Tuscany_Italian"  # Adjust to actual node name
food_name = "http://example.org/-_Annies_Dressing_Tuscany_Italian"

if food_name in G:
    neighbors = list(G.neighbors(food_name))
    print(f"Food Item: {food_name} has relationships with: {neighbors}")
else:
    print("Food not found in graph.")

Food Item: http://example.org/-_Annies_Dressing_Tuscany_Italian has relationships with: ['http://example.org/Leucine_6ce69b8b17', 'http://example.org/Histidine_92f37625a5', '-_Annies_Dressing_Tuscany_Italian', 'http://example.org/Lysine_3caa5fab81', 'http://example.org/Protein_c91116ed2a', 'http://example.org/Phenylalanine_ecd093668f', 'http://example.org/Isoleucine_e8993410f0', 'http://example.org/Threonine_984a59cb3d', 'http://example.org/Valine_946d5dd270', 'http://example.org/Tryptophan_e50cbab6b1', 'http://example.org/Food', 'http://example.org/Calories_(kcal)_838b9ec3f4', 'http://example.org/Methionine_d6f42e16e0']


In [9]:
food_name = "http://example.org/Rb__Confections_Lc__-_Mint_Nonpareils"

if food_name in g:
    neighbors = list(g.neighbors(food_name))
    print(f"Food Item: {food_name} has relationships with: {neighbors}")

ValueError: too many values to unpack (expected 3)

In [27]:
import re

food_name = "http://example.org/-_Annies_Dressing_Tuscany_Italian"  # Adjust to actual node name
search_nutrient = "Leucine"  # User input

if food_name in G:
    neighbors = list(G.neighbors(food_name))

    # Extract only the nutrient name (remove prefix and suffix)
    pattern = r"http://example\.org/([^_]+)_.*"  # Extracts the part between '/' and '_'

    matched_nutrient = [
        n for n in neighbors
        if re.match(pattern, n) and re.match(pattern, n).group(1).lower() == search_nutrient.lower()
    ]

    if matched_nutrient:
        print(f"Food Item: {food_name} has the following '{search_nutrient}' relationships:")
        print(matched_nutrient)

        # Print the neighbors of the matched nutrient
        for nutrient in matched_nutrient:
            if nutrient in G:
                nutrient_neighbors = list(G.neighbors(nutrient))
                print(f"\nNutrient: {nutrient} has relationships with:")
                print(nutrient_neighbors)
            else:
                print(f"\nNo relationships found for {nutrient}.")
    else:
        print(f"No match found for '{search_nutrient}' in {food_name}'s relationships.")
else:
    print("Food not found in graph.")


Food Item: http://example.org/-_Annies_Dressing_Tuscany_Italian has the following 'Leucine' relationships:
['http://example.org/Leucine_6ce69b8b17']

Nutrient: http://example.org/Leucine_6ce69b8b17 has relationships with:
['mg', '0.0', 'http://example.org/Weight']


In [29]:
food_name = "http://example.org/Rb__Confections_Lc__-_Mint_Nonpareils"  # Adjust to actual node name

if food_name in G:
    edges = []

    for _, target, edge_data in G.edges(food_name, data=True):  # ✅ Loop only once
        edge_label = edge_data.get("label", "No label")
        edges.append((food_name, edge_label, target))

    # Print the edges
    print(f"Edges connected to {food_name}:")
    for source, label, target in edges:
        print(f"{source} --[{label}]--> {target}")
else:
    print("Food not found in graph.")


Food not found in graph.


In [30]:
for i in g:
    print(i)

(rdflib.term.URIRef('http://example.org/VitaminB6'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Class'))
(rdflib.term.URIRef('http://example.org/TotalPolyunsaturated'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Class'))
(rdflib.term.URIRef('http://example.org/MargaricAcid'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Class'))
(rdflib.term.URIRef('http://example.org/unite'), rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'), rdflib.term.URIRef('http://example.org/Lycopene'))
(rdflib.term.URIRef('http://example.org/Weight'), rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'), rdflib.term.URIRef('http://example.org/transPalmitoleicAcid'))
(rdflib.term.URIRef('http://example.org/Threonine'), rdflib.term.URIRef('http://ww

# Counting the number of duplicated products

In [1]:
import os
import re
from collections import defaultdict


def process_file_names_with_counter(folder_path, output_file):
    # Dictionary to count occurrences of cleaned names
    name_counter = defaultdict(int)

    # Open the output file
    with open(output_file, "w") as file:
        # Iterate through all files in the folder
        for filename in os.listdir(folder_path):
            # Remove the extension to work only on the name
            name, ext = os.path.splitext(filename)

            # Use regex to match the name without the trailing number
            match = re.match(r"(.*?)[0-9]+$", name)

            if match:
                # Extract the part before the number
                cleaned_name = match.group(1)
            else:
                # If no number at the end, keep the original name
                cleaned_name = name

            # Increment the count for this cleaned name
            name_counter[cleaned_name] += 1

        # Sort the names by their counts in descending order
        sorted_names = sorted(name_counter.items(), key=lambda item: item[1], reverse=True)

        # Write the sorted count of similar cleaned names to the file
        file.write("\nCounts of similar cleaned names (sorted):\n")
        for name, count in sorted_names:
            file.write(f"{name}: {count}\n")

    print(f"Output written to {output_file}")


# Example usage
folder_path = "C:/Users/laure/OneDrive/Documents/GitHub/Dataset_KG/Nutrition"
output_file = "C:/Users/laure/OneDrive/Documents/GitHub/KG_project/Cleaning_steps/output.txt"
process_file_names_with_counter(folder_path, output_file)

#39031 have at least 2 duplicates
# Milk_Whole_ has 247 duplicates

Output written to C:/Users/laure/OneDrive/Documents/GitHub/KG_project/Cleaning_steps/output.txt


# Count how many items have an undefined calorie amount

In [2]:
def separate_values_and_units_boolean(text):
    if(text == '' or text == '--%'):
        return True
    elif(text == '--' or text == '--mcg' or text == '--mg' or text == '--g' or text == '--IU'):
        return True

    return False

In [3]:
import os
import csv


# Function to count files where "calories" is not defined
def count_files_with_undefined_calories(folder_path):
    undefined_calories_count = 0
    files_checked = 0

    # Iterate through all files in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".csv"):  # Only process CSV files
            file_path = os.path.join(folder_path, file_name)
            files_checked += 1
            calories_defined = False  # Tracks if "calories" are defined in the file

            # Open and read the CSV file
            with open(file_path, mode='r') as file:
                csv_reader = csv.reader(file)

                # Skip the header row
                next(csv_reader)

                # Check each row in the file
                for row in csv_reader:
                    if len(row) < 3:
                        continue  # Skip rows with missing columns

                    value_name = row[1].strip()  # Second column for "value_name"
                    #measure = row[2].strip()  # Third column for "measure" value
                    measureUnity = separate_values_and_units_boolean(row[2])

                    if value_name.lower() == "calories (kcal)":
                        if measureUnity:  # If measure is not empty or invalid
                            calories_defined = False
                        else:
                            calories_defined = True
                        break  # Stop checking further since we found "calories"

            # Increment the count if calories are not defined
            if not calories_defined:
                undefined_calories_count += 1

    print(f"Checked {files_checked} files.")
    return undefined_calories_count


# Specify the folder containing the CSV files
folder_path = "C:/Users/laure/OneDrive/Documents/GitHub/Dataset_KG/Nutrition"

# Call the function and print the result
undefined_count = count_files_with_undefined_calories(folder_path)
print(f"Number of files where 'Calories' is not defined: {undefined_count}")

# Checked 327139 files.
# Number of files where 'Calories' is not defined: 28152

Checked 327139 files.
Number of files where 'Calories' is not defined: 28152


# Count the number of products with a complete Essential amino acids profil

In [4]:
import os
import csv

# List of required essential amino acids
essential_amino_acids = [
    "Histidine", "Isoleucine", "Leucine", "Lysine",
    "Methionine", "Phenylalanine", "Threonine",
    "Tryptophan", "Valine"
]


# Function to count products with all essential amino acids values filled
def count_products_with_all_essential_amino_acids(folder_path):
    products_with_all_essential_amino_acids = 0
    files_checked = 0

    # Iterate through all files in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".csv"):  # Only process CSV files
            file_path = os.path.join(folder_path, file_name)
            files_checked += 1
            essential_acids_status = {acid: False for acid in essential_amino_acids}

            # Open and read the CSV file
            with open(file_path, mode='r') as file:
                csv_reader = csv.reader(file)

                # Skip the header row
                next(csv_reader)

                # Check each row in the file
                for row in csv_reader:
                    if len(row) < 3:
                        continue  # Skip rows with missing columns

                    category_name = row[0].strip()  # First column for "category_name"
                    value_name = row[1].strip()  # Second column for "value_name"
                    measure = row[2].strip()  # Third column for "measure"
                    measureUnity = separate_values_and_units_boolean(row[2])

                    # Track if the current row corresponds to an essential amino acid entry
                    if category_name.lower() == "essential amino acids" and value_name in essential_amino_acids:
                        if measureUnity:  # If measure is non-empty, mark the amino acid as filled
                            essential_acids_status[value_name] = False
                            break
                        else:
                            essential_acids_status[value_name] = True

            # If all essential amino acids are filled, count this product
            if all(essential_acids_status.values()):

                products_with_all_essential_amino_acids += 1
                #print(file_name)

            if(files_checked%10000 == 0):
                print(files_checked)

    print(f"Checked {files_checked} files.")
    return products_with_all_essential_amino_acids


# Specify the folder containing the CSV files
folder_path = "C:/Users/laure/OneDrive/Documents/GitHub/Dataset_KG/Nutrition"

# Call the function and print the result
filled_count = count_products_with_all_essential_amino_acids(folder_path)
print(f"Number of products with all essential amino acids values filled: {filled_count}")

# Checked 327139 files.
# Number of products with all essential amino acids values filled: 83375

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
Checked 327139 files.
Number of products with all essential amino acids values filled: 83375


# SPARSQL Queries

## An average human of 70kg should have a daily intake of essential amino acids of : Histidine: 980 mg Isoleucine: 1330 mg Leucine: 2940 mg Lysine: 2660 mg Methionine: 1330 mg Phenylalanine: 2310 mg Threonine: 1400 mg Tryptophan: 350 mg Valine: 1680 mg

https://my.clevelandclinic.org/health/articles/22243-amino-acids

In [41]:
import csv

# Define required minimum values for a "complete" amino acid profile (in mg)
REQUIRED_AMINO_ACIDS = {
    "Histidine": 980,
    "Isoleucine": 1330,
    "Leucine": 2940,
    "Lysine": 2660,
    "Methionine": 1330,
    "Phenylalanine": 2310,
    "Threonine": 1400,
    "Tryptophan": 350,
    "Valine": 1680
}

# Read the food amino acids CSV file
file_path = "C:/Users/laure/OneDrive/Documents/GitHub/KG_project/food_amino_acids_cleaned.csv"
complete_proteins = []
count_complete_proteins = 0
incomplete_proteins = []
count_incomplete_proteins = 0

with open(file_path, mode="r", encoding="utf-8") as file:
    reader = csv.reader(file)
    header = next(reader)  # Read the header row

    # Get column indices of essential amino acids
    amino_acid_indices = {aa: header.index(aa) for aa in REQUIRED_AMINO_ACIDS.keys()}

    # Check each food item
    for row in reader:
        food_name = row[0]
        amino_acid_values = {aa: float(row[amino_acid_indices[aa]]) for aa in REQUIRED_AMINO_ACIDS.keys()}

        # Check if this food meets or exceeds all required amino acids
        if all(amino_acid_values[aa] >= REQUIRED_AMINO_ACIDS[aa] for aa in REQUIRED_AMINO_ACIDS.keys()):
            complete_proteins.append(food_name)
            count_complete_proteins += 1
        else:
            incomplete_proteins.append(food_name)
            count_incomplete_proteins += 1

# Print results
print("Foods with a complete amino acid profile:")
if complete_proteins:
    for food in complete_proteins:
        print(f"- {food}")
    print(f"Total foods with a complete amino acid profile: {count_complete_proteins}")
else:
    print("No individual foods meet the complete amino acid profile.")

print("\nFoods with an incomplete amino acid profile:")
if incomplete_proteins:
    for food in incomplete_proteins:
        print(f"- {food}")
    print(f"Total foods with an incomplete amino acid profile: {count_incomplete_proteins}")
else:
    print("All foods are complete proteins.")


Foods with a complete amino acid profile:
- Apples_Canned_Sweetened_Sliced_Drained_Unheated
- Cream_Of_Rice_Cooked_With_Water_Without_Salt
- Dried_Eggs
- Dried_Salted_Atlantic_Cod
- Egg_white_dried
- Egg_white_dried_flakes_stabilized_glucose_reduced
- Egg_white_dried_powder_stabilized_glucose_reduced
- Egg_White_Powder
- Egg_whole_dried_stabilized_glucose_reduced
- Fish_whitefish_dried__Alaska_Native_
- Nutribiotic_Systems_-_Rice_Protein
- Seal_bearded__Oogruk__meat_dried__Alaska_Native_
- Soup_Swanson_Chicken_Broth_99__Fat_Free
- Tangerine_Juice_Canned_Sweetened
- Vital_wheat_gluten
- Whale_beluga_meat_dried__Alaska_Native_
Total foods with a complete amino acid profile: 16

Foods with an incomplete amino acid profile:
- Acorn_stew__Apache_
- Agave_raw__Southwest_
- Alaskan_King_Crab
- All_Purpose_Flour__Enrinched_and_Unbleached_
- Almonds
- Almond_Butter
- Amaranth_grain_uncooked
- Amaranth_leaves_cooked_boiled_drained_without_salt
- Amaranth_leaves_cooked_boiled_drained_with_salt
- 

In [62]:
import csv
import itertools

# Define essential amino acids and their required minimum values (in mg)
REQUIRED_AMINO_ACIDS = {
    "Histidine": 980,
    "Isoleucine": 1330,
    "Leucine": 2940,
    "Lysine": 2660,
    "Methionine": 1330,
    "Phenylalanine": 2310,
    "Threonine": 1400,
    "Tryptophan": 350,
    "Valine": 1680
}

# Read the food amino acids CSV file
file_path = "C:/Users/laure/OneDrive/Documents/GitHub/KG_project/food_amino_acids_cleaned.csv"
food_data = []

with open(file_path, mode="r", encoding="utf-8") as file:
    reader = csv.reader(file)
    header = next(reader)  # Read the header row

    # Get column indices of essential amino acids
    amino_acid_indices = {aa: header.index(aa) for aa in REQUIRED_AMINO_ACIDS.keys()}

    # Read each row (food item)
    for row in reader:
        food_name = row[0]
        amino_acid_values = {aa: float(row[amino_acid_indices[aa]]) for aa in REQUIRED_AMINO_ACIDS.keys()}
        food_data.append((food_name, amino_acid_values))

# Find up to 10 valid combinations
valid_combinations = []
for food1, food2 in itertools.combinations(food_data, 2):
    name1, amino_acids1 = food1
    name2, amino_acids2 = food2

    # Combine the amino acid values of the two foods
    combined_amino_acids = {aa: amino_acids1[aa] + amino_acids2[aa] for aa in REQUIRED_AMINO_ACIDS.keys()}

    # Check if the combination meets or exceeds all required amounts
    if all(combined_amino_acids[aa] >= REQUIRED_AMINO_ACIDS[aa] for aa in REQUIRED_AMINO_ACIDS.keys()):
        valid_combinations.append((name1, name2, combined_amino_acids))

        # Stop if we have found 10 combinations
        if len(valid_combinations) == 10000:
            break

#Print the results
if valid_combinations:
    print(f"Found {len(valid_combinations)} valid food combinations:\n")
    for i, (name1, name2, combined_amino_acids) in enumerate(valid_combinations, 1):
        print(f"Combination {i}: {name1} + {name2}")
        for aa, value in combined_amino_acids.items():
            print(f"  {aa}: {value:.2f} mg")
        print("\n" + "-" * 40 + "\n")
else:
    print("No valid food combinations found.")
#print(len(valid_combinations))


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




  Histidine: 1529.00 mg
  Isoleucine: 4814.00 mg
  Leucine: 8170.00 mg
  Lysine: 7001.00 mg
  Methionine: 1646.00 mg
  Phenylalanine: 2588.00 mg
  Threonine: 5737.00 mg
  Tryptophan: 1090.00 mg
  Valine: 4319.00 mg

----------------------------------------

Combination 8972: Beans_french_mature_seeds_cooked_boiled_without_salt + Seal_bearded__Oogruk__meat_dried__Alaska_Native_
  Histidine: 3143.00 mg
  Isoleucine: 3445.00 mg
  Leucine: 6949.00 mg
  Lysine: 7174.00 mg
  Methionine: 1790.00 mg
  Phenylalanine: 3656.00 mg
  Threonine: 3525.00 mg
  Tryptophan: 878.00 mg
  Valine: 3597.00 mg

----------------------------------------

Combination 8973: Beans_french_mature_seeds_cooked_boiled_without_salt + Seal_bearded__Oogruk__meat_partially_dried__Alaska_Native_
  Histidine: 2422.00 mg
  Isoleucine: 2678.00 mg
  Leucine: 5385.00 mg
  Lysine: 5536.00 mg
  Methionine: 1378.00 mg
  Phenylalanine: 2854.00 mg
  Threonine: 2735.00 mg
  Tryptophan: 684.00 mg
  Valine: 2807.00 mg

---------------

In [47]:
import csv

# Define required minimum values for essential amino acids (in mg)
REQUIRED_AMINO_ACIDS = {
    "Histidine": 980,
    "Isoleucine": 1330,
    "Leucine": 2940,
    "Lysine": 2660,
    "Methionine": 1330,
    "Phenylalanine": 2310,
    "Threonine": 1400,
    "Tryptophan": 350,
    "Valine": 1680
}

# File paths
file_path = "C:/Users/laure/OneDrive/Documents/GitHub/KG_project/food_amino_acids_cleaned.csv"
output_file = "C:/Users/laure/OneDrive/Documents/GitHub/KG_project/foods_with_missing_amino_acids.csv"

# Store foods that lack certain amino acids
foods_with_deficiencies = []

# Read the CSV file
with open(file_path, mode="r", encoding="utf-8") as file:
    reader = csv.reader(file)
    header = next(reader)  # Read the header row

    # Get column indices of essential amino acids
    amino_acid_indices = {aa: header.index(aa) for aa in REQUIRED_AMINO_ACIDS.keys()}

    # Check each food item
    for row in reader:
        food_name = row[0]
        amino_acid_values = {aa: float(row[amino_acid_indices[aa]]) for aa in REQUIRED_AMINO_ACIDS.keys()}

        # Identify missing (insufficient) amino acids
        lacking_amino_acids = [aa for aa in REQUIRED_AMINO_ACIDS.keys() if amino_acid_values[aa] < REQUIRED_AMINO_ACIDS[aa]]

        if lacking_amino_acids:
            foods_with_deficiencies.append([food_name] + lacking_amino_acids)

# Save results to a new CSV file
with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Food Name", "Lacking Amino Acids"])  # Header
    writer.writerows(foods_with_deficiencies)

# Print results
print(f"Foods with missing amino acids saved to '{output_file}'.")


Foods with missing amino acids saved to 'C:/Users/laure/OneDrive/Documents/GitHub/KG_project/foods_with_missing_amino_acids.csv'.


In [48]:
import csv
from collections import Counter

# Define required minimum values for essential amino acids (in mg)
REQUIRED_AMINO_ACIDS = {
    "Histidine": 980,
    "Isoleucine": 1330,
    "Leucine": 2940,
    "Lysine": 2660,
    "Methionine": 1330,
    "Phenylalanine": 2310,
    "Threonine": 1400,
    "Tryptophan": 350,
    "Valine": 1680
}

# File paths
input_file = "C:/Users/laure/OneDrive/Documents/GitHub/KG_project/foods_with_missing_amino_acids.csv"
output_file = "C:/Users/laure/OneDrive/Documents/GitHub/KG_project/amino_acid_missing_counts.csv"

# Initialize a counter for missing amino acids
missing_amino_acids_counter = Counter()

# Read the CSV file with foods that have missing amino acids
with open(input_file, mode="r", encoding="utf-8") as file:
    reader = csv.reader(file)
    header = next(reader)  # Skip the header row

    # Iterate through each food item row
    for row in reader:
        food_name = row[0]  # Food name is the first column
        lacking_amino_acids = row[1:]  # The rest are the lacking amino acids

        # Remove empty strings or any extra spaces
        lacking_amino_acids = [aa for aa in lacking_amino_acids if aa.strip()]

        # Update the counter with each missing amino acid
        missing_amino_acids_counter.update(lacking_amino_acids)

# Save the amino acid missing counts to a new CSV file
with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Amino Acid", "Count"])  # Header row
    for amino_acid, count in missing_amino_acids_counter.items():
        writer.writerow([amino_acid, count])

# Print results
print(f"Missing amino acid counts saved to '{output_file}'.")

Missing amino acid counts saved to 'C:/Users/laure/OneDrive/Documents/GitHub/KG_project/amino_acid_missing_counts.csv'.


In [54]:
import csv

# Define the required essential amino acids (in mg)
REQUIRED_AMINO_ACIDS = [
    "Histidine", "Isoleucine", "Leucine", "Lysine", "Methionine",
    "Phenylalanine", "Threonine", "Tryptophan", "Valine"
]

# Define the threshold for high protein (in grams)
PROTEIN_THRESHOLD = 10  # Protein above 10 grams

# File path to the CSV with the amino acid profiles
input_file = "C:/Users/laure/OneDrive/Documents/GitHub/KG_project/food_amino_acids.csv"

# Read the file and process the foods
with open(input_file, mode="r", encoding="utf-8") as file:
    reader = csv.reader(file)
    header = next(reader)  # Skip the header row

    # Find the column index for "Protein" and other amino acids
    protein_index = header.index("Protein")  # Assuming "Protein" column is labeled as "Protein"
    amino_acid_indices = {aa: header.index(aa) for aa in REQUIRED_AMINO_ACIDS}

    # Go over each row and filter foods with high protein but missing amino acids
    foods_with_missing_amino_acids = []

    for row in reader:
        food_name = row[0]  # Food name is the first column
        protein_amount = float(row[protein_index])  # Protein amount in grams

        # Check if the food has protein above the threshold
        if protein_amount > PROTEIN_THRESHOLD:
            # Check which amino acids are missing
            lacking_amino_acids = [aa for aa in REQUIRED_AMINO_ACIDS if not row[amino_acid_indices[aa]].strip()]

            if lacking_amino_acids:
                # Save food name and missing amino acids
                food_info = {
                    "name": food_name,
                    "protein": protein_amount,
                    "lacking_amino_acids": lacking_amino_acids
                }
                foods_with_missing_amino_acids.append(food_info)

# Print the results
if foods_with_missing_amino_acids:
    print(f"Foods with more than {PROTEIN_THRESHOLD}g of protein but missing amino acids:\n")
    for food in foods_with_missing_amino_acids:
        print(f"Food: {food['name']}, Protein: {food['protein']}g, Missing Amino Acids: {', '.join(food['lacking_amino_acids'])}")
else:
    print(f"No foods with more than {PROTEIN_THRESHOLD}g of protein and missing amino acids found.")


No foods with more than 10g of protein and missing amino acids found.


In [60]:
import csv

# File path to the cleaned amino acids CSV
input_file = "C:/Users/laure/OneDrive/Documents/GitHub/KG_project/food_amino_acids_cleaned.csv"

# Initialize dictionaries to store best and worst values
best_acids = {}
worst_acids = {}

# Read the file and process the amino acid data
with open(input_file, mode="r", encoding="utf-8") as file:
    reader = csv.reader(file)
    header = next(reader)  # Get the header

    # Initialize best and worst tracking dictionaries
    for acid in header[3:]:  # Assuming amino acids columns are after the third column
        best_acids[acid] = {"name": None, "value": float("-inf")}
        worst_acids[acid] = {"name": None, "value": float("inf")}

    for row in reader:
        food_name = row[0]  # Food name is the first column
        protein_value = float(row[3])  # Assuming protein content is in the second column

        if protein_value >= 10:
            for index, acid in enumerate(header[3:], start=3):
                try:
                    value = float(row[index])
                    if value <= 50000:
                        if value > best_acids[acid]["value"]:
                            best_acids[acid] = {"name": food_name, "value": value}
                        if value < worst_acids[acid]["value"]:
                            worst_acids[acid] = {"name": food_name, "value": value}
                except ValueError:
                    continue  # Skip invalid or missing values

# Print the results
for acid in header[3:]:
    print(f"{acid}:")
    print(f"  Best: {best_acids[acid]['name']} - {best_acids[acid]['value']}")
    print(f"  Worst: {worst_acids[acid]['name']} - {worst_acids[acid]['value']}")


Protein:
  Best: Soy_protein_isolate_potassium_type - 88.3
  Worst: Bread_stuffing_cornbread_dry_mix - 10.0
Histidine:
  Best: Whale_beluga_meat_dried__Alaska_Native_ - 2999.0
  Worst: Pork_cured_feet_pickled - 127.97356828193833
Isoleucine:
  Best: Egg_white_dried_powder_stabilized_glucose_reduced - 5028.571428571428
  Worst: Onion_Powder - 142.85714285714286
Leucine:
  Best: Rexall_Sundown_Inc__-_Whey_Protein_Powder - 7606.999999999999
  Worst: Onion_Powder - 214.28571428571428
Lysine:
  Best: Seal_bearded__Oogruk__meat_dried__Alaska_Native_ - 6690.000000000001
  Worst: Beverages_coffee_instant_decaffeinated_powder - 92.0
Methionine:
  Best: Egg_white_dried_powder_stabilized_glucose_reduced - 3200.0
  Worst: Beverages_coffee_instant_decaffeinated_powder - 22.0
Phenylalanine:
  Best: Egg_white_dried_powder_stabilized_glucose_reduced - 5185.714285714285
  Worst: Barley_malt_flour - 225.0
Threonine:
  Best: Rexall_Sundown_Inc__-_Whey_Protein_Powder - 5440.0
  Worst: Cooked_Wild_Eastern_