In [2]:
import rdflib
from rdflib import Graph, Namespace, RDF, RDFS, OWL, URIRef, Literal, XSD

# Create graph
g = Graph()

# Define Namespaces
schema  = Namespace("http://schema.org/")
rdf     = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
rdfs    = Namespace("http://www.w3.org/2000/01/rdf-schema#")
owl     = Namespace("http://www.w3.org/2002/07/owl#")
xsd     = Namespace("http://www.w3.org/2001/XMLSchema#")
fo      = Namespace("https://purl.org/ontology/fo/")
dbr     = Namespace("http://dbpedia.org/resource/")
ex      = Namespace("http://example.org/")
foodon = Namespace("http://purl.obolibrary.org/obo/FOODON_")
wikidata = Namespace("http://www.wikidata.org/entity/")
OBO = Namespace("http://purl.obolibrary.org/obo/")

g.bind("schema", schema)
g.bind("rdf", rdf)
g.bind("rdfs", rdfs)
g.bind("owl", owl)
g.bind("xsd", xsd)
g.bind("fo", fo)
g.bind("dbr",dbr)
g.bind("ex", ex)
g.bind("obo", OBO)
g.bind("wikidata", wikidata)

# List of subclass relationships
subclass_relationships = [
    # Subclass Macronutrients
    ("Weight","Macronutrients"),("Calories","Macronutrients"),("Fat","Macronutrients"),("Protein","Macronutrients"),("Carbohydrate","Macronutrients"),("Sugars","Macronutrients"),("Fiber","Macronutrients"),("Cholesterol","Macronutrients"),("SaturatedFats","Macronutrients"),("NetCarbs","Macronutrients"),("TransFattyAcids","Macronutrients"),

    # Subclass Minerals
    ("Calcium", "Minerals"), ("Iron", "Minerals"), ("Potassium", "Minerals"),
    ("Magnesium", "Minerals"), ("Phosphorus", "Minerals"), ("Sodium", "Minerals"),
    ("Zinc", "Minerals"), ("Copper", "Minerals"), ("Manganese", "Minerals"),
    ("Selenium", "Minerals"), ("Fluoride", "Minerals"), ("Molybdenum", "Minerals"),
    ("Iodine", "Minerals"), ("Chloride", "Minerals"), ("Chromium", "Minerals"),

    # Subclass Vitamins
    ("VitaminA", "Vitamins"), ("VitaminC", "Vitamins"), ("ThiaminB1", "Vitamins"),
    ("RiboflavinB2", "Vitamins"), ("NiacinB3", "Vitamins"), ("VitaminB5", "Vitamins"),
    ("VitaminB6", "Vitamins"), ("Biotin", "Vitamins"), ("Folate", "Vitamins"),
    ("Folicacid", "Vitamins"), ("Foodfolate", "Vitamins"), ("FolateDFE", "Vitamins"),
    ("Choline", "Vitamins"), ("VitaminB12", "Vitamins"), ("Retinol", "Vitamins"),
    ("Carotenebeta", "Vitamins"), ("Carotenealpha", "Vitamins"),
    ("Cryptoxanthinbeta", "Vitamins"), ("VitaminAIU", "Vitamins"),
    ("Lycopene", "Vitamins"), ("LutZeaxanthin", "Vitamins"), ("VitaminE", "Vitamins"),
    ("VitaminD", "Vitamins"), ("VitaminD2", "Vitamins"), ("VitaminD3", "Vitamins"),
    ("VitaminDIU", "Vitamins"), ("VitaminK", "Vitamins"), ("VitaminK1", "Vitamins"),
    ("Menaquinone4", "Vitamins"),

    # Subclass Others
    ("Water", "OtherComponent"), ("Ash", "OtherComponent"), ("Alcohol", "OtherComponent"),
    ("Caffeine", "OtherComponent"), ("Theobromine", "OtherComponent"), ("PRALscore", "OtherComponent"),

    # Subclass Carbohydrates
    ("SolubleFiber", "Carbohydrates"), ("InsolubleFiber", "Carbohydrates"), ("AddedSugar", "Carbohydrates"),
    ("Sucrose", "Carbohydrates"), ("Glucose", "Carbohydrates"), ("Fructose", "Carbohydrates"),
    ("Lactose", "Carbohydrates"), ("Maltose", "Carbohydrates"), ("Galactose", "Carbohydrates"),
    ("Strarch", "Carbohydrates"), ("otherSugar", "Carbohydrates"), ("SugarAlcohols", "Carbohydrates"),

    # Subclass Fats
    ("TotalMonounsaturated","Fats"),("TotalPolyunsaturated","Fats"),("Omega3s","Fats"),("Omega6s","Fats"),("Omega3toOmega6Ratio","Fats"),("Omega6toOmega3Ratio","Fats"),

    # Subclass Omega3Fat
    ("alphaLinolenicAcid", "Omega3Fat"), ("EicosapentaenoicAcid", "Omega3Fat"), ("DocosapentaenoicAcid", "Omega3Fat"), ("DocosahexaenoicAcid", "Omega3Fat"), ("n3EicosatrienoicAcid", "Omega3Fat"),

    # Subclass Omega6Fat
    ("alphaLinolenicAcid", "Omega6Fat"),("gammaLinolenicAcid", "Omega6Fat"),("ciscisn6EicosadienoicAcid", "Omega6Fat"),("DihomogammalinolenicAcid", "Omega6Fat"),("ArachidonicAcid", "Omega6Fat"),("Butyric", "Omega6Fat"),("Caproic", "Omega6Fat"),("CaprylicAcid", "Omega6Fat"), ("CapricAcid", "Omega6Fat"), ("LauricAcid", "Omega6Fat"), ("TridecanoicAcid", "Omega6Fat"), ("MyristicAcid", "Omega6Fat"),("PentadecanoicAcid", "Omega6Fat"),("PalmiticAcid", "Omega6Fat"),("MargaricAcid", "Omega6Fat"),("StearicAcid", "Omega6Fat"),("ArachidicAcid", "Omega6Fat"),("BehenicAcid", "Omega6Fat"),("LignocericAcid", "Omega6Fat"),

    # Subclass Monounsaturated
    ("MyristoleicAcid", "MonounsaturatedFat"), ("PentadecenoicAcid", "MonounsaturatedFat"), ("PalmitoleicAcid", "MonounsaturatedFat"),
    ("cisPalmitoleicAcid", "MonounsaturatedFat"), ("HeptadecenoicAcid", "MonounsaturatedFat"), ("OleicAcid", "MonounsaturatedFat"),
    ("cisOleicAcid", "MonounsaturatedFat"), ("cisVaccenicAcid", "MonounsaturatedFat"), ("GadoleicAcid", "MonounsaturatedFat"), ("DocosenoicAcid", "MonounsaturatedFat"), ("cisDocosenoicAcid", "MonounsaturatedFat"), ("cisTetracosenoicAcid", "MonounsaturatedFat"),

    # Subclass Polyunsaturated
    ("OctadecadienoicAcid","PolyunsaturatedFat"),("ConjugatedLinoleicAcid","PolyunsaturatedFat"),("iOctadecadienoicAcid","PolyunsaturatedFat"),("undifferentiatedOctadecatrienoic","PolyunsaturatedFat"),("OctadecatrienoicAcid","PolyunsaturatedFat"),("ParinaricAcid","PolyunsaturatedFat"),("EicosatrienoicAcid","PolyunsaturatedFat"),("EicosatetraenoicAcid","PolyunsaturatedFat"),("HeneicosapentaenoicAcid","PolyunsaturatedFat"),("22:4","PolyunsaturatedFat"),

    # Subclass TransFat
    ("TransPolyenoicFats","TransFat"),("TransMonoenoicFats","TransFat"),("transPalmitoleicAcid","TransFat"),("transOleicAcid","TransFat"),("transDocosenoicAcid","TransFat"),("transOctadecadienoicAcid","TransFat"),("transtransOctadecadienoicAcid","TransFat"),

    # Subclass Phytosterols
    ("TotalPhytosterols","Phytosterols"),("Stigmasterol","Phytosterols"),("Campesterol","Phytosterols"),("Betasitosterol","Phytosterols"),

    # Subclass Fats
    ("Omega3Fat", "Fats"), ("Omega6Fat", "Fats"), ("MonounsaturatedFat", "Fats"),("PolyunsaturatedFat", "Fats"), ("TransFat", "Fats"), ("Phytosterols", "Fats"),

    # Subclass EssentialAminoAcid
    ("Histidine","EssentialAminoAcid"),("Isoleucine","EssentialAminoAcid"),("Leucine","EssentialAminoAcid"),("Lysine","EssentialAminoAcid"),("Methionine","EssentialAminoAcid"),("Phenylalanine","EssentialAminoAcid"),("Threonine","EssentialAminoAcid"),("Tryptophan","EssentialAminoAcid"),("Valine","EssentialAminoAcid"),

    # Subclass ConditionallyEssentialAminoAcid
    ("Arginine","ConditionallyEssentialAminoAcid"),("Cystine","ConditionallyEssentialAminoAcid"),("Glycine","ConditionallyEssentialAminoAcid"),("Proline","ConditionallyEssentialAminoAcid"),("Tyrosine","ConditionallyEssentialAminoAcid"),

    # Subclass NonEssentialAminoAcid
    ("Alanine","NonEssentialAminoAcid"),("Asparticacid","NonEssentialAminoAcid"),("Betaine","NonEssentialAminoAcid"),("Glutamicacid","NonEssentialAminoAcid"),("Hydroxyproline","NonEssentialAminoAcid"),("Serine","NonEssentialAminoAcid"),

    # Subclass Amino Acids
    ("EssentialAminoAcid", "AminoAcids"),
    ("ConditionallyEssentialAminoAcid", "AminoAcids"),
    ("NonEssentialAminoAcid", "AminoAcids"),

    # General classes
    ("Macronutrients", "Food"),
    ("Minerals", "Food"),
    ("Vitamins", "Food"),
    ("OtherComponent", "Food"),
    ("Carbohydrates", "Food"),
    ("Fats", "Food"),
    ("AminoAcids", "Food"),
]

# Create and define each class
all_classes = set()
for subclass, superclass in subclass_relationships:
    subclass_uri = getattr(ex, subclass)
    superclass_uri = getattr(ex, superclass)

    # Declare both subclass and superclass as OWL classes
    if subclass not in all_classes:
        g.add((subclass_uri, RDF.type, OWL.Class))
        all_classes.add(subclass)
        g.add((ex.Weight, RDFS.subClassOf, subclass_uri))
        g.add((ex.DailyValue, RDFS.subClassOf, subclass_uri))
        g.add((ex.unite, RDFS.subClassOf, subclass_uri))

    if superclass not in all_classes:
        g.add((superclass_uri, RDF.type, OWL.Class))
        all_classes.add(superclass)

    # Establish hierarchy
    g.add((subclass_uri, RDFS.subClassOf, superclass_uri))

# Define data property for food name
g.add((ex.hasName, RDF.type, OWL.DatatypeProperty))
g.add((ex.hasName, RDFS.range, xsd.string))

# Define data property for food weight (e.g., in grams)
g.add((ex.hasWeight, RDF.type, OWL.DatatypeProperty))
g.add((ex.hasWeight, RDFS.range, xsd.float))

# Define data property for food weight (e.g., in grams)
g.add((ex.hasUnite, RDF.type, OWL.DatatypeProperty))
g.add((ex.hasUnite, RDFS.range, xsd.string))

g.add((ex.hasDailyValue, RDF.type, OWL.DatatypeProperty))
g.add((ex.hasDailyValue, RDFS.range, xsd.float))

<Graph identifier=N5daa25b9db6b411ba422858d648e54ac (<class 'rdflib.graph.Graph'>)>

In [1]:
import rdflib
from rdflib import Graph, Namespace, RDF, RDFS, OWL, URIRef, Literal, XSD

# Defining vitamins, amino acids, minerals, etc from existing ontologies

Macronutrients

In [3]:
calories = URIRef(OBO["NCIT_C15646"])
fats = URIRef(OBO["NCIT_C15224"])
protein = URIRef(OBO["NCIT_C15920"])
carbohydrates = URIRef(OBO["NCIT_C15223"])
sugars = URIRef(OBO["NCIT_C71939"])
fiber = URIRef(OBO["NCIT_C15225"])
cholesterol = URIRef(OBO["NCIT_C15205"])
saturated_fats = URIRef(OBO["NCIT_C68421"])
net_carbs = URIRef(OBO["NCIT_C68492"])
trans_fatty_acids = URIRef(OBO["NCIT_C68440"])

In [4]:
g.add((ex.Calories, OWL.equivalentClass, calories))
g.add((ex.Fat, OWL.equivalentClass, fats))
g.add((ex.Protein, OWL.equivalentClass, protein))
g.add((ex.Carbohydrate, OWL.equivalentClass, carbohydrates))
g.add((ex.Sugars, OWL.equivalentClass, sugars))
g.add((ex.Fiber, OWL.equivalentClass, fiber))
g.add((ex.Cholesterol, OWL.equivalentClass, cholesterol))
g.add((ex.SaturatedFats, OWL.equivalentClass, saturated_fats))
g.add((ex.NetCarbs, OWL.equivalentClass, net_carbs))
g.add((ex.TransFattyAcids, OWL.equivalentClass, trans_fatty_acids))

<Graph identifier=N5daa25b9db6b411ba422858d648e54ac (<class 'rdflib.graph.Graph'>)>

Minerals

In [5]:
# calcium = URIRef(OBO["NCIT_C68241"])
# iron = URIRef(OBO["NCIT_C68256"])
# potassium = URIRef(OBO["NCIT_C68279"])
# magnesium = URIRef(OBO["NCIT_C68264"])
# phosphorus = URIRef(OBO["NCIT_C68277"])
# sodium = URIRef(OBO["NCIT_C68287"])
# zinc = URIRef(OBO["NCIT_C68295"])
# copper = URIRef(OBO["NCIT_C68249"])
# manganese = URIRef(OBO["NCIT_C68266"])
# selenium = URIRef(OBO["NCIT_C68281"])
# fluoride = URIRef(OBO["NCIT_C68252"])
# molybdenum = URIRef(OBO["NCIT_C68272"])
# iodine = URIRef(OBO["NCIT_C68254"])
# chloride = URIRef(OBO["NCIT_C68243"])
# chromium = URIRef(OBO["NCIT_C68245"])

In [6]:
# g.add((ex.Calcium, OWL.equivalentClass, calcium))
# g.add((ex.Iron, OWL.equivalentClass, iron))
# g.add((ex.Potassium, OWL.equivalentClass, potassium))
# g.add((ex.Magnesium, OWL.equivalentClass, magnesium))
# g.add((ex.Phosphorus, OWL.equivalentClass, phosphorus))
# g.add((ex.Sodium, OWL.equivalentClass, sodium))
# g.add((ex.Zinc, OWL.equivalentClass, zinc))
# g.add((ex.Copper, OWL.equivalentClass, copper))
# g.add((ex.Manganese, OWL.equivalentClass, manganese))
# g.add((ex.Selenium, OWL.equivalentClass, selenium))
# g.add((ex.Fluoride, OWL.equivalentClass, fluoride))
# g.add((ex.Molybdenum, OWL.equivalentClass, molybdenum))
# g.add((ex.Iodine, OWL.equivalentClass, iodine))
# g.add((ex.Chloride, OWL.equivalentClass, chloride))
# g.add((ex.Chromium, OWL.equivalentClass, chromium))

Vitamins

In [7]:
# vitaminA = URIRef(OBO["NCIT_C938"])
# vitaminC = URIRef(OBO["NCIT_C68507"])
# thiaminB1 = URIRef(OBO["NCIT_C874"])
# riboflavinB2 = URIRef(OBO["NCIT_C808"])
# niacinB3 = URIRef(OBO["NCIT_C689"])
# vitaminB5 = URIRef(OBO["NCIT_C47783"])
# vitaminB6 = URIRef(OBO["NCIT_C1334"])
# biotin = URIRef(OBO["NCIT_C309"])
# folate = URIRef(OBO["NCIT_C1444"])
# folicacid = URIRef(OBO["NCIT_C510"])
# foodfolate = URIRef(OBO["NCIT_C68512"])
# folateDFE = URIRef(OBO["NCIT_C68513"])
# choline = URIRef(OBO["NCIT_C61674"])
# vitaminB12 = URIRef(OBO["NCIT_C173805"])
# retinol = URIRef(OBO["NCIT_C68302"])
# carotenebeta = URIRef(OBO["NCIT_C1016"])
# carotenealpha = URIRef(OBO["NCIT_C68304"])
# cryptoxanthinbeta = URIRef(OBO["NCIT_C68306"])
# #vitaminAIU = URIRef(OBO[""])
# lycopene = URIRef(OBO["NCIT_C2226"])
# lutzeaxanthin = URIRef(OBO["NCIT_C68310"])
# vitaminE = URIRef(OBO["NCIT_C942"])
# vitaminD = URIRef(OBO["NCIT_C941"])
# vitaminD2 = URIRef(OBO["NCIT_C29029"])
# vitaminD3 = URIRef(OBO["NCIT_C48194"])
# #vitaminDIU = URIRef(OBO[""])
# vitaminK = URIRef(OBO["NCIT_C943"])
# vitaminK1 = URIRef(OBO["NCIT_C29365"])
# menaquinone4 = URIRef(OBO["NCIT_C68319"])

In [8]:
# g.add((ex.VitaminA, OWL.equivalentClass, vitaminA))
# g.add((ex.VitaminC, OWL.equivalentClass, vitaminC))
# g.add((ex.ThiaminB1, OWL.equivalentClass, thiaminB1))
# g.add((ex.RiboflavinB2, OWL.equivalentClass, riboflavinB2))
# g.add((ex.NiacinB3, OWL.equivalentClass, niacinB3))
# g.add((ex.VitaminB5, OWL.equivalentClass, vitaminB5))
# g.add((ex.VitaminB6, OWL.equivalentClass, vitaminB6))
# g.add((ex.Biotin, OWL.equivalentClass, biotin))
# g.add((ex.Folate, OWL.equivalentClass, folate))
# g.add((ex.Folicacid, OWL.equivalentClass, folicacid))
# g.add((ex.Foodfolate, OWL.equivalentClass, foodfolate))
# g.add((ex.FolateDFE, OWL.equivalentClass, folateDFE))
# g.add((ex.Choline, OWL.equivalentClass, choline))
# g.add((ex.VitaminB12, OWL.equivalentClass, vitaminB12))
# g.add((ex.Retinol, OWL.equivalentClass, retinol))
# g.add((ex.Carotenebeta, OWL.equivalentClass, carotenebeta))
# g.add((ex.Carotenealpha, OWL.equivalentClass, carotenealpha))
# g.add((ex.Cryptoxanthinbeta, OWL.equivalentClass, cryptoxanthinbeta))
# #g.add((ex.VitaminAIU, OWL.equivalentClass, vitaminAIU))
# g.add((ex.Lycopene, OWL.equivalentClass, lycopene))
# g.add((ex.LutZeaxanthin, OWL.equivalentClass, lutzeaxanthin))
# g.add((ex.VitaminE, OWL.equivalentClass, vitaminE))
# g.add((ex.VitaminD, OWL.equivalentClass, vitaminD))
# g.add((ex.VitaminD2, OWL.equivalentClass, vitaminD2))
# g.add((ex.VitaminD3, OWL.equivalentClass, vitaminD3))
# #g.add((ex.VitaminDIU, OWL.equivalentClass, vitaminDIU))
# g.add((ex.VitaminK, OWL.equivalentClass, vitaminK))
# g.add((ex.VitaminK1, OWL.equivalentClass, vitaminK1))
# g.add((ex.Menaquinone4, OWL.equivalentClass, menaquinone4))

Essential amino acids

In [9]:
histidine = URIRef(OBO["OMIT_0007798"])
isoleucine = URIRef(OBO["OMIT_0008647"])
leucine = URIRef(OBO["OMIT_0009021"])
lysine = URIRef(OBO["OMIT_0009315"])
methionine = URIRef(OBO["OMIT_0009774"])
phenylalanine = URIRef(OBO["OMIT_0011637"])
threonine = URIRef(OBO["OMIT_0014757"])
tryptophan = URIRef(OBO["OMIT_0015201"])
valine = URIRef(OBO["OMIT_0015467"])

In [10]:
g.add((ex.Histidine, OWL.equivalentClass, histidine))
g.add((ex.Isoleucine, OWL.equivalentClass, isoleucine))
g.add((ex.Leucine, OWL.equivalentClass, leucine))
g.add((ex.Lysine, OWL.equivalentClass, lysine))
g.add((ex.Methionine, OWL.equivalentClass, methionine))
g.add((ex.Phenylalanine, OWL.equivalentClass, phenylalanine))
g.add((ex.Threonine, OWL.equivalentClass, threonine))
g.add((ex.Tryptophan, OWL.equivalentClass, tryptophan))
g.add((ex.Valine, OWL.equivalentClass, valine))

<Graph identifier=N5daa25b9db6b411ba422858d648e54ac (<class 'rdflib.graph.Graph'>)>

# Generate graph

In [16]:
# Serialize and save the ontology
g.serialize("C:/Users/laure/OneDrive/Documents/GitHub/KG_project/nutrition_ontology.ttl", format="turtle")

<Graph identifier=N5daa25b9db6b411ba422858d648e54ac (<class 'rdflib.graph.Graph'>)>

In [17]:
import networkx as nx
import matplotlib.pyplot as plt

# Create a NetworkX directed graph
G = nx.DiGraph()

# Add nodes and edges from RDF graph
for subj, pred, obj in g:
    G.add_node(str(subj), label=str(subj).split("/")[-1])  # Use last part of URI as label
    G.add_node(str(obj), label=str(obj).split("/")[-1])
    G.add_edge(str(subj), str(obj), label=str(pred).split("#")[-1])  # Use last part of predicate as edge label

# Draw the graph
nx.write_gexf(G, "C:/Users/laure/OneDrive/Documents/GitHub/KG_project/ontology.gexf")

# Add items to the graph

In [11]:
def separate_values_and_units(text):
    if(text == '' or text == '--%'):
        return [(0.0,'%')]
    elif(text == '--' or text == '--mcg' or text == '--mg' or text == '--g' or text == '--IU'):
        return [(0.0,'g')]
    pattern = r"(\d*\.?\d*)([a-zA-Z%]*)"
    matches = re.findall(pattern, text)
    return [(value.strip(), unit.strip()) for value, unit in matches if value or unit]

In [12]:
Match_data_with_csv = [
    # Subclass Macronutrients
    ("Weight","Macronutrients"),("Calories","Macronutrients"),("Fat","Macronutrients"),("Protein","Macronutrients"),("Carbohydrate","Macronutrients"),("Sugars","Macronutrients"),("Fiber","Macronutrients"),("Cholesterol","Macronutrients"),("SaturatedFats","Macronutrients"),("NetCarbs","Macronutrients"),("TransFattyAcids","Macronutrients"),

    # Subclass Minerals
    ("Calcium", "Minerals"), ("Iron", "Minerals"), ("Potassium", "Minerals"),
    ("Magnesium", "Minerals"), ("Phosphorus", "Minerals"), ("Sodium", "Minerals"),
    ("Zinc", "Minerals"), ("Copper", "Minerals"), ("Manganese", "Minerals"),
    ("Selenium", "Minerals"), ("Fluoride", "Minerals"), ("Molybdenum", "Minerals"),
    ("Iodine", "Minerals"), ("Chloride", "Minerals"), ("Chromium", "Minerals"),

    # Subclass Vitamins
    ("VitaminA", "Vitamins"), ("VitaminC", "Vitamins"), ("ThiaminB1", "Vitamins"),
    ("RiboflavinB2", "Vitamins"), ("NiacinB3", "Vitamins"), ("VitaminB5", "Vitamins"),
    ("VitaminB6", "Vitamins"), ("Biotin", "Vitamins"), ("Folate", "Vitamins"),
    ("Folicacid", "Vitamins"), ("Foodfolate", "Vitamins"), ("FolateDFE", "Vitamins"),
    ("Choline", "Vitamins"), ("VitaminB12", "Vitamins"), ("Retinol", "Vitamins"),
    ("Carotenebeta", "Vitamins"), ("Carotenealpha", "Vitamins"),
    ("Cryptoxanthinbeta", "Vitamins"), ("VitaminAIU", "Vitamins"),
    ("Lycopene", "Vitamins"), ("LutZeaxanthin", "Vitamins"), ("VitaminE", "Vitamins"),
    ("VitaminD", "Vitamins"), ("VitaminD2", "Vitamins"), ("VitaminD3", "Vitamins"),
    ("VitaminDIU", "Vitamins"), ("VitaminK", "Vitamins"), ("VitaminK1", "Vitamins"),
    ("Menaquinone4", "Vitamins"),

    # Subclass Others
    ("Water", "OtherComponent"), ("Ash", "OtherComponent"), ("Alcohol", "OtherComponent"),
    ("Caffeine", "OtherComponent"), ("Theobromine", "OtherComponent"), ("PRALscore", "OtherComponent"),

    # Subclass Carbohydrates
    ("SolubleFiber", "Carbohydrates"), ("InsolubleFiber", "Carbohydrates"), ("AddedSugar", "Carbohydrates"),
    ("Sucrose", "Carbohydrates"), ("Glucose", "Carbohydrates"), ("Fructose", "Carbohydrates"),
    ("Lactose", "Carbohydrates"), ("Maltose", "Carbohydrates"), ("Galactose", "Carbohydrates"),
    ("Strarch", "Carbohydrates"), ("otherSugar", "Carbohydrates"), ("SugarAlcohols", "Carbohydrates"),

    # Subclass Fats
    ("TotalMonounsaturated","Fats"),("TotalPolyunsaturated","Fats"),("Omega3s","Fats"),("Omega6s","Fats"),("Omega3toOmega6Ratio","Fats"),("Omega6toOmega3Ratio","Fats"),

    # Subclass Omega3Fat
    ("alphaLinolenicAcid", "Omega3Fat"), ("EicosapentaenoicAcid", "Omega3Fat"), ("DocosapentaenoicAcid", "Omega3Fat"), ("DocosahexaenoicAcid", "Omega3Fat"), ("n3EicosatrienoicAcid", "Omega3Fat"),

    # Subclass Omega6Fat
    ("alphaLinolenicAcid", "Omega6Fat"),("gammaLinolenicAcid", "Omega6Fat"),("ciscisn6EicosadienoicAcid", "Omega6Fat"),("DihomogammalinolenicAcid", "Omega6Fat"),("ArachidonicAcid", "Omega6Fat"),("Butyric", "Omega6Fat"),("Caproic", "Omega6Fat"),("CaprylicAcid", "Omega6Fat"), ("CapricAcid", "Omega6Fat"), ("LauricAcid", "Omega6Fat"), ("TridecanoicAcid", "Omega6Fat"), ("MyristicAcid", "Omega6Fat"),("PentadecanoicAcid", "Omega6Fat"),("PalmiticAcid", "Omega6Fat"),("MargaricAcid", "Omega6Fat"),("StearicAcid", "Omega6Fat"),("ArachidicAcid", "Omega6Fat"),("BehenicAcid", "Omega6Fat"),("LignocericAcid", "Omega6Fat"),

    # Subclass Monounsaturated
    ("MyristoleicAcid", "MonounsaturatedFat"), ("PentadecenoicAcid", "MonounsaturatedFat"), ("PalmitoleicAcid", "MonounsaturatedFat"),
    ("cisPalmitoleicAcid", "MonounsaturatedFat"), ("HeptadecenoicAcid", "MonounsaturatedFat"), ("OleicAcid", "MonounsaturatedFat"),
    ("cisOleicAcid", "MonounsaturatedFat"), ("cisVaccenicAcid", "MonounsaturatedFat"), ("GadoleicAcid", "MonounsaturatedFat"), ("DocosenoicAcid", "MonounsaturatedFat"), ("cisDocosenoicAcid", "MonounsaturatedFat"), ("cisTetracosenoicAcid", "MonounsaturatedFat"),

    # Subclass Polyunsaturated
    ("OctadecadienoicAcid","PolyunsaturatedFat"),("ConjugatedLinoleicAcid","PolyunsaturatedFat"),("iOctadecadienoicAcid","PolyunsaturatedFat"),("undifferentiatedOctadecatrienoic","PolyunsaturatedFat"),("OctadecatrienoicAcid","PolyunsaturatedFat"),("ParinaricAcid","PolyunsaturatedFat"),("EicosatrienoicAcid","PolyunsaturatedFat"),("EicosatetraenoicAcid","PolyunsaturatedFat"),("HeneicosapentaenoicAcid","PolyunsaturatedFat"),("22:4","PolyunsaturatedFat"),

    # Subclass TransFat
    ("TransPolyenoicFats","TransFat"),("TransMonoenoicFats","TransFat"),("transPalmitoleicAcid","TransFat"),("transOleicAcid","TransFat"),("transDocosenoicAcid","TransFat"),("transOctadecadienoicAcid","TransFat"),("transtransOctadecadienoicAcid","TransFat"),

    # Subclass Phytosterols
    ("TotalPhytosterols","Phytosterols"),("Stigmasterol","Phytosterols"),("Campesterol","Phytosterols"),("Betasitosterol","Phytosterols"),

    # Subclass EssentialAminoAcid
    ("Histidine","EssentialAminoAcid"),("Isoleucine","EssentialAminoAcid"),("Leucine","EssentialAminoAcid"),("Lysine","EssentialAminoAcid"),("Methionine","EssentialAminoAcid"),("Phenylalanine","EssentialAminoAcid"),("Threonine","EssentialAminoAcid"),("Tryptophan","EssentialAminoAcid"),("Valine","EssentialAminoAcid"),

    # Subclass ConditionallyEssentialAminoAcid
    ("Arginine","ConditionallyEssentialAminoAcid"),("Cystine","ConditionallyEssentialAminoAcid"),("Glycine","ConditionallyEssentialAminoAcid"),("Proline","ConditionallyEssentialAminoAcid"),("Tyrosine","ConditionallyEssentialAminoAcid"),

    # Subclass NonEssentialAminoAcid
    ("Alanine","NonEssentialAminoAcid"),("Asparticacid","NonEssentialAminoAcid"),("Betaine","NonEssentialAminoAcid"),("Glutamicacid","NonEssentialAminoAcid"),("Hydroxyproline","NonEssentialAminoAcid"),("Serine","NonEssentialAminoAcid"),
]

In [125]:
# import csv
# import re
# url = 'C:/Users/laure/OneDrive/Documents/GitHub/KG_project/Nutrition/1st_National_Bagel_Company_Inc_-_1st_National_Bagel_Company_Plain_Presliced_Bagels_366375.csv'
# c = 0
# # Use regex to extract the desired part of the URL
# match = re.search(r'([^/]+)(?=_\d+\.csv$)', url)
# if match:
#     result = match.group(1)
#     print(result)
# with open(url, mode ='r')as file:
#     csvFile = csv.reader(file)
#     # Skip the first row (header)
#     next(csvFile)
#     food_uri = URIRef(ex[result])
#     g.add((food_uri, RDF.type, ex.Food))
#     g.add((food_uri, ex.hasName, Literal(result, datatype=XSD.string)))
#     for lines in csvFile:
#         measureUnity = separate_values_and_units(lines[2])
#         category_name = lines[0]
#         value_name = lines[1]
#         measure = float(measureUnity[0][0])
#         measure_unit = measureUnity[0][1]
#         daily_percentage = float(separate_values_and_units(lines[3])[0][0])
#
#         feature_uri = ex[Match_data_with_csv[c][0]]
#         g.add((food_uri, RDF.type, feature_uri))
#
#         g.add((feature_uri, ex.hasWeight, Literal(measure, datatype=XSD.float)))
#         g.add((feature_uri, ex.hasUnite, Literal(measure_unit, datatype=XSD.string)))
#         g.add((feature_uri, ex.hasDailyValue, Literal(daily_percentage, datatype=XSD.float)))
#
#         c+=1

In [126]:
# import csv
# import re
# import os
# from rdflib import Graph, URIRef, Literal, XSD
#
# folder_path = 'C:/Users/laure/OneDrive/Documents/GitHub/KG_project/Nutrition/'
# count = 0
#
# # Iterate over all files in the specified folder
# for filename in os.listdir(folder_path):
#     if filename.endswith('.csv'):
#         url = os.path.join(folder_path, filename)
#         c = 0
#         # Use regex to extract the desired part of the filename
#         match = re.search(r'([^/]+)(?=_\d+\.csv$)', filename)
#         if match:
#             result = match.group(1)
#             print(result)
#         with open(url, mode='r') as file:
#             csvFile = csv.reader(file)
#             # Skip the first row (header)
#             next(csvFile)
#
#             food_uri = URIRef(ex[result])
#             g.add((food_uri, RDF.type, ex.Food))
#             g.add((food_uri, ex.hasName, Literal(result, datatype=XSD.string)))
#
#             for lines in csvFile:
#                 category_name = lines[0]
#                 value_name = lines[1]
#                 measureUnity = separate_values_and_units(lines[2])
#                 measure = float(measureUnity[0][0])
#                 measure_unit = measureUnity[0][1]
#                 daily_percentage = float(separate_values_and_units(lines[3])[0][0])
#
#                 feature_uri = ex[Match_data_with_csv[c][0]]  # Assuming this part is correct
#
#                 hasWeight_property = URIRef(str(feature_uri) + "/hasWeight")
#                 g.add((food_uri, hasWeight_property, Literal(measure, datatype=XSD.float)))
#
#                 hasUnit_property = URIRef(str(feature_uri) + "/hasUnit")
#                 g.add((food_uri, hasUnit_property, Literal(measure_unit, datatype=XSD.string)))
#
#                 hasDailyValue_property = URIRef(str(feature_uri) + "/hasDailyValue")
#                 g.add((food_uri, hasDailyValue_property, Literal(daily_percentage, datatype=XSD.float)))
#
#                 # print(measure)
#                 # print(measure_unit)
#                 # print(daily_percentage)
#                 # print("")
#
#                 c += 1
#     if count >=0:
#         break
#
#     count+=1

In [13]:
from hashlib import sha256
from rdflib import URIRef

def generate_unique_iri(base_uri, food_uri, nutrient_name, measure):
    unique_id = sha256(f"{food_uri}_{nutrient_name}_{measure}".encode()).hexdigest()[:10]
    return URIRef(f"{base_uri}{nutrient_name.replace(' ', '_')}_{unique_id}")

In [14]:
def separate_values_and_units_boolean(text):
    if(text == '' or text == '--%'):
        return True
    elif(text == '--' or text == '--mcg' or text == '--mg' or text == '--g' or text == '--IU'):
        return True

    return False

In [129]:
# import csv
# import re
# import os
# from rdflib import Graph, URIRef, Literal, XSD, BNode
#
# folder_path = 'C:/Users/laure/OneDrive/Documents/GitHub/Dataset_KG/Nutrition/'
# count = 0
#
# # Iterate over all files in the specified folder
# for filename in os.listdir(folder_path):
#     if filename.endswith('.csv'):
#         url = os.path.join(folder_path, filename)
#         c = 0
#         # Use regex to extract the desired part of the filename
#         match = re.search(r'([^/]+)(?=_\d+\.csv$)', filename)
#         if match:
#             result = match.group(1)
#             #print(result)
#         with open(url, mode='r') as file:
#             csvFile = csv.reader(file)
#             # Skip the first row (header)
#             next(csvFile)
#
#             # Create food URI (unique per food item)
#             food_uri = URIRef(ex[result])
#             g.add((food_uri, RDF.type, ex.Food))
#             g.add((food_uri, ex.hasName, Literal(result, datatype=XSD.string)))
#
#             # Iterate over the rows in the CSV file and add data to the RDF graph
#             for lines in csvFile:
#                 category_name = lines[0]
#                 if category_name == "Macronutrients" or category_name == "Essential Amino Acids":
#                     value_name = lines[1]
#                     measureUnity = separate_values_and_units(lines[2])
#                     measure = float(measureUnity[0][0])
#                     measure_unit = measureUnity[0][1]
#                     daily_percentage = float(separate_values_and_units(lines[3])[0][0])
#                 else:
#                     c+=1
#                     continue
#
#                 # Create a unique feature URI for each line of data
#                 feature_uri = ex[Match_data_with_csv[c][0]]  # Assuming this part is correct
#
#                 # Generate a unique IRI
#                 nutrient_uri = generate_unique_iri(ex, food_uri, value_name, measure)
#
#                 g.add((nutrient_uri, RDF.type, feature_uri))
#                 g.add((nutrient_uri, ex.hasWeight, Literal(measure, datatype=XSD.float)))
#                 g.add((nutrient_uri, ex.hasUnit, Literal(measure_unit, datatype=XSD.string)))
#                 g.add((nutrient_uri, ex.hasDailyValue, Literal(daily_percentage, datatype=XSD.float)))
#                 g.add((food_uri, ex.hasNutrient, nutrient_uri))  # Connect blank node to food item
#
#                 c += 1
#     if count >= 5000:
#         break
#
#     count += 1
#     if count%1000==0:
#         print(count)


In [15]:
import csv
import re
import os
from rdflib import Graph, URIRef, Literal, XSD, RDF

folder_path = 'C:/Users/laure/OneDrive/Documents/GitHub/Dataset_KG/Nutrition/'
count = 0
correct_count = 0

# Define required nutrients
REQUIRED_NUTRIENTS = {
    "Weight", "Calories (kcal)", "Protein", "Histidine", "Isoleucine", "Leucine",
    "Lysine", "Methionine", "Phenylalanine", "Threonine", "Tryptophan", "Valine"
}

# Iterate over all files in the specified folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        url = os.path.join(folder_path, filename)
        c = 0

        # Extract food name from filename
        match = re.search(r'([^/]+)(?=_\d+\.csv$)', filename)
        if match:
            result = match.group(1)

        # Read the CSV file
        with open(url, mode='r') as file:
            csvFile = csv.reader(file)
            next(csvFile)  # Skip header

            # Track nutrients and triples
            nutrient_tracker = set()
            nutrient_triples = []  # Temporary storage for triples

            food_uri = URIRef(ex[result])

            # Iterate over CSV rows
            for lines in csvFile:
                category_name = lines[0]
                if category_name not in {"Macronutrients", "Essential Amino Acids"}:
                    continue

                value_name = lines[1]
                if not(separate_values_and_units_boolean(lines[2])):
                    measureUnity = separate_values_and_units(lines[2])
                    measure = float(measureUnity[0][0])
                    measure_unit = measureUnity[0][1]
                    daily_percentage = float(separate_values_and_units(lines[3])[0][0])

                    if value_name in REQUIRED_NUTRIENTS:
                        nutrient_tracker.add(value_name)

                        feature_uri = ex[Match_data_with_csv[c][0]]
                        nutrient_uri = generate_unique_iri(ex, food_uri, value_name, measure)

                        # Store triples in a list first
                        nutrient_triples.extend([
                            (nutrient_uri, RDF.type, feature_uri),
                            (nutrient_uri, ex.hasWeight, Literal(measure, datatype=XSD.float)),
                            (nutrient_uri, ex.hasUnit, Literal(measure_unit, datatype=XSD.string)),
                            (nutrient_uri, ex.hasDailyValue, Literal(daily_percentage, datatype=XSD.float)),
                            (food_uri, ex.hasNutrient, nutrient_uri)
                        ])

            # Only add if all required nutrients are present
            if REQUIRED_NUTRIENTS.issubset(nutrient_tracker):
                correct_count+=1
                g.add((food_uri, RDF.type, ex.Food))
                g.add((food_uri, ex.hasName, Literal(result, datatype=XSD.string)))

                # Add all stored triples
                for triple in nutrient_triples:
                    g.add(triple)
            else:
                # Remove stored triples if food does not meet the criteria
                for triple in nutrient_triples:
                    g.remove(triple)

    count += 1
    if count % 2000 == 0:
        print(correct_count)

print(correct_count)
# 80806 values in the end

515
1006
1584
2030
2363
2743
3287
3762
4138
4609
4795
6552
6953
7566
8051
8367
8678
9229
9770
10044
10717
11001
11222
11645
12388
12937
13393
13728
13839
14132
14877
15466
16146
16454
17004
17450
17871
18798
19173
19692
20313
21105
21908
22369
23042
23570
23784
24296
24299
24322
24705
25096
25551
25996
26546
27119
27610
28146
28645
29107
29443
29840
30305
30866
31402
31887
32438
32741
33152
33237
33662
34173
34829
34878
35601
36320
36711
37329
37867
38306
38725
38996
40087
40456
40938
41281
41625
41867
42573
43259
43466
43872
44199
45021
45301
45946
46648
47272
47655
47994
48353
48862
49595
50298
50790
51311
51652
52317
52783
53189
53665
54326
54657
55155
55619
55944
56421
56858
57345
57920
58281
58870
59567
60105
60638
61374
61855
62479
63147
63517
63870
64201
64563
64915
65356
65552
66055
66672
67170
67445
68052
68523
69066
69641
70150
70708
71265
71919
72496
73101
73628
74131
74559
75131
75615
76163
76826
77335
77767
78198
79188
79984
80436
80806


# Checking the graph nodes and edges

In [26]:
food_name = "http://example.org/-_Annies_Dressing_Tuscany_Italian"  # Adjust to actual node name
food_name = "http://example.org/-_Annies_Dressing_Tuscany_Italian"

if food_name in G:
    neighbors = list(G.neighbors(food_name))
    print(f"Food Item: {food_name} has relationships with: {neighbors}")
else:
    print("Food not found in graph.")

Food Item: http://example.org/-_Annies_Dressing_Tuscany_Italian has relationships with: ['http://example.org/Leucine_6ce69b8b17', 'http://example.org/Histidine_92f37625a5', '-_Annies_Dressing_Tuscany_Italian', 'http://example.org/Lysine_3caa5fab81', 'http://example.org/Protein_c91116ed2a', 'http://example.org/Phenylalanine_ecd093668f', 'http://example.org/Isoleucine_e8993410f0', 'http://example.org/Threonine_984a59cb3d', 'http://example.org/Valine_946d5dd270', 'http://example.org/Tryptophan_e50cbab6b1', 'http://example.org/Food', 'http://example.org/Calories_(kcal)_838b9ec3f4', 'http://example.org/Methionine_d6f42e16e0']


In [9]:
food_name = "http://example.org/Rb__Confections_Lc__-_Mint_Nonpareils"

if food_name in g:
    neighbors = list(g.neighbors(food_name))
    print(f"Food Item: {food_name} has relationships with: {neighbors}")

ValueError: too many values to unpack (expected 3)

In [27]:
import re

food_name = "http://example.org/-_Annies_Dressing_Tuscany_Italian"  # Adjust to actual node name
search_nutrient = "Leucine"  # User input

if food_name in G:
    neighbors = list(G.neighbors(food_name))

    # Extract only the nutrient name (remove prefix and suffix)
    pattern = r"http://example\.org/([^_]+)_.*"  # Extracts the part between '/' and '_'

    matched_nutrient = [
        n for n in neighbors
        if re.match(pattern, n) and re.match(pattern, n).group(1).lower() == search_nutrient.lower()
    ]

    if matched_nutrient:
        print(f"Food Item: {food_name} has the following '{search_nutrient}' relationships:")
        print(matched_nutrient)

        # Print the neighbors of the matched nutrient
        for nutrient in matched_nutrient:
            if nutrient in G:
                nutrient_neighbors = list(G.neighbors(nutrient))
                print(f"\nNutrient: {nutrient} has relationships with:")
                print(nutrient_neighbors)
            else:
                print(f"\nNo relationships found for {nutrient}.")
    else:
        print(f"No match found for '{search_nutrient}' in {food_name}'s relationships.")
else:
    print("Food not found in graph.")


Food Item: http://example.org/-_Annies_Dressing_Tuscany_Italian has the following 'Leucine' relationships:
['http://example.org/Leucine_6ce69b8b17']

Nutrient: http://example.org/Leucine_6ce69b8b17 has relationships with:
['mg', '0.0', 'http://example.org/Weight']


In [29]:
food_name = "http://example.org/Rb__Confections_Lc__-_Mint_Nonpareils"  # Adjust to actual node name

if food_name in G:
    edges = []

    for _, target, edge_data in G.edges(food_name, data=True):  # ✅ Loop only once
        edge_label = edge_data.get("label", "No label")
        edges.append((food_name, edge_label, target))

    # Print the edges
    print(f"Edges connected to {food_name}:")
    for source, label, target in edges:
        print(f"{source} --[{label}]--> {target}")
else:
    print("Food not found in graph.")


Food not found in graph.


In [30]:
for i in g:
    print(i)

(rdflib.term.URIRef('http://example.org/VitaminB6'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Class'))
(rdflib.term.URIRef('http://example.org/TotalPolyunsaturated'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Class'))
(rdflib.term.URIRef('http://example.org/MargaricAcid'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Class'))
(rdflib.term.URIRef('http://example.org/unite'), rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'), rdflib.term.URIRef('http://example.org/Lycopene'))
(rdflib.term.URIRef('http://example.org/Weight'), rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'), rdflib.term.URIRef('http://example.org/transPalmitoleicAcid'))
(rdflib.term.URIRef('http://example.org/Threonine'), rdflib.term.URIRef('http://ww

# Counting the number of duplicated products

In [1]:
import os
import re
from collections import defaultdict


def process_file_names_with_counter(folder_path, output_file):
    # Dictionary to count occurrences of cleaned names
    name_counter = defaultdict(int)

    # Open the output file
    with open(output_file, "w") as file:
        # Iterate through all files in the folder
        for filename in os.listdir(folder_path):
            # Remove the extension to work only on the name
            name, ext = os.path.splitext(filename)

            # Use regex to match the name without the trailing number
            match = re.match(r"(.*?)[0-9]+$", name)

            if match:
                # Extract the part before the number
                cleaned_name = match.group(1)
            else:
                # If no number at the end, keep the original name
                cleaned_name = name

            # Increment the count for this cleaned name
            name_counter[cleaned_name] += 1

        # Sort the names by their counts in descending order
        sorted_names = sorted(name_counter.items(), key=lambda item: item[1], reverse=True)

        # Write the sorted count of similar cleaned names to the file
        file.write("\nCounts of similar cleaned names (sorted):\n")
        for name, count in sorted_names:
            file.write(f"{name}: {count}\n")

    print(f"Output written to {output_file}")


# Example usage
folder_path = "C:/Users/laure/OneDrive/Documents/GitHub/Dataset_KG/Nutrition"
output_file = "C:/Users/laure/OneDrive/Documents/GitHub/KG_project/Cleaning_steps/output.txt"
process_file_names_with_counter(folder_path, output_file)

#39031 have at least 2 duplicates
# Milk_Whole_ has 247 duplicates

Output written to C:/Users/laure/OneDrive/Documents/GitHub/KG_project/Cleaning_steps/output.txt


# Count how many items have an undefined calorie amount

In [2]:
def separate_values_and_units_boolean(text):
    if(text == '' or text == '--%'):
        return True
    elif(text == '--' or text == '--mcg' or text == '--mg' or text == '--g' or text == '--IU'):
        return True

    return False

In [3]:
import os
import csv


# Function to count files where "calories" is not defined
def count_files_with_undefined_calories(folder_path):
    undefined_calories_count = 0
    files_checked = 0

    # Iterate through all files in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".csv"):  # Only process CSV files
            file_path = os.path.join(folder_path, file_name)
            files_checked += 1
            calories_defined = False  # Tracks if "calories" are defined in the file

            # Open and read the CSV file
            with open(file_path, mode='r') as file:
                csv_reader = csv.reader(file)

                # Skip the header row
                next(csv_reader)

                # Check each row in the file
                for row in csv_reader:
                    if len(row) < 3:
                        continue  # Skip rows with missing columns

                    value_name = row[1].strip()  # Second column for "value_name"
                    #measure = row[2].strip()  # Third column for "measure" value
                    measureUnity = separate_values_and_units_boolean(row[2])

                    if value_name.lower() == "calories (kcal)":
                        if measureUnity:  # If measure is not empty or invalid
                            calories_defined = False
                        else:
                            calories_defined = True
                        break  # Stop checking further since we found "calories"

            # Increment the count if calories are not defined
            if not calories_defined:
                undefined_calories_count += 1

    print(f"Checked {files_checked} files.")
    return undefined_calories_count


# Specify the folder containing the CSV files
folder_path = "C:/Users/laure/OneDrive/Documents/GitHub/Dataset_KG/Nutrition"

# Call the function and print the result
undefined_count = count_files_with_undefined_calories(folder_path)
print(f"Number of files where 'Calories' is not defined: {undefined_count}")

# Checked 327139 files.
# Number of files where 'Calories' is not defined: 28152

Checked 327139 files.
Number of files where 'Calories' is not defined: 28152


# Count the number of products with a complete Essential amino acids profil

In [4]:
import os
import csv

# List of required essential amino acids
essential_amino_acids = [
    "Histidine", "Isoleucine", "Leucine", "Lysine",
    "Methionine", "Phenylalanine", "Threonine",
    "Tryptophan", "Valine"
]


# Function to count products with all essential amino acids values filled
def count_products_with_all_essential_amino_acids(folder_path):
    products_with_all_essential_amino_acids = 0
    files_checked = 0

    # Iterate through all files in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".csv"):  # Only process CSV files
            file_path = os.path.join(folder_path, file_name)
            files_checked += 1
            essential_acids_status = {acid: False for acid in essential_amino_acids}

            # Open and read the CSV file
            with open(file_path, mode='r') as file:
                csv_reader = csv.reader(file)

                # Skip the header row
                next(csv_reader)

                # Check each row in the file
                for row in csv_reader:
                    if len(row) < 3:
                        continue  # Skip rows with missing columns

                    category_name = row[0].strip()  # First column for "category_name"
                    value_name = row[1].strip()  # Second column for "value_name"
                    measure = row[2].strip()  # Third column for "measure"
                    measureUnity = separate_values_and_units_boolean(row[2])

                    # Track if the current row corresponds to an essential amino acid entry
                    if category_name.lower() == "essential amino acids" and value_name in essential_amino_acids:
                        if measureUnity:  # If measure is non-empty, mark the amino acid as filled
                            essential_acids_status[value_name] = False
                            break
                        else:
                            essential_acids_status[value_name] = True

            # If all essential amino acids are filled, count this product
            if all(essential_acids_status.values()):

                products_with_all_essential_amino_acids += 1
                #print(file_name)

            if(files_checked%10000 == 0):
                print(files_checked)

    print(f"Checked {files_checked} files.")
    return products_with_all_essential_amino_acids


# Specify the folder containing the CSV files
folder_path = "C:/Users/laure/OneDrive/Documents/GitHub/Dataset_KG/Nutrition"

# Call the function and print the result
filled_count = count_products_with_all_essential_amino_acids(folder_path)
print(f"Number of products with all essential amino acids values filled: {filled_count}")

# Checked 327139 files.
# Number of products with all essential amino acids values filled: 83375

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
Checked 327139 files.
Number of products with all essential amino acids values filled: 83375


# SPARSQL Qeries

In [2]:
from rdflib import Graph
gt = Graph()
gt.parse("C:/Users/laure/OneDrive/Documents/GitHub/KG_project/nutrition_total_ontology.ttl", format="turtle")
#print(f"Graph has {len(g)} triples.")
# 3514824 triples

<Graph identifier=Na0367fc1ea3844e39e16e4d1d8e8551c (<class 'rdflib.graph.Graph'>)>

In [18]:
# SPARQL query
query = """
PREFIX ex: <http://example.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?food ?protein ?proteinAmount ?weight ?weightAmount
WHERE {
    ?food ex:hasNutrient ?protein .
    FILTER(STRSTARTS(STR(?protein), CONCAT(STR(ex:), "Protein_")))
    ?protein ex:hasWeight ?proteinAmount .

    ?food ex:hasNutrient ?weight .
    FILTER(STRSTARTS(STR(?weight), CONCAT(STR(ex:), "Weight_")))
    ?weight ex:hasWeight ?weightAmount .

    FILTER( (?proteinAmount / ?weightAmount) * 100 > 50)
}

"""

# Execute query
results = g.query(query)

# Print results
for row in results:
    # Ensure the values are floats for the calculation
    protein_amount = float(row.proteinAmount)
    weight_amount = float(row.weightAmount)
    normalized_protein = (protein_amount / weight_amount) * 100
    print(f"Food: {row.food}, Protein Amount: {normalized_protein}")


Food: http://example.org/Beverages_Protein_powder_soy_based, Protein Amount: 55.60000000000001
Food: http://example.org/Dried_Salted_Atlantic_Cod, Protein Amount: 62.87499999999999
Food: http://example.org/Dried_Spirulina, Protein Amount: 57.14285714285714
Food: http://example.org/Dry_Powder_Gelatin, Protein Amount: 85.71428571428571
Food: http://example.org/Egg_white_dried, Protein Amount: 81.1
Food: http://example.org/Egg_white_dried_flakes_stabilized_glucose_reduced, Protein Amount: 76.91629955947137
Food: http://example.org/Egg_white_dried_powder_stabilized_glucose_reduced, Protein Amount: 82.85714285714285
Food: http://example.org/Egg_White_Powder, Protein Amount: 84.28571428571429
Food: http://example.org/Fish_whitefish_dried__Alaska_Native_, Protein Amount: 62.4
Food: http://example.org/Flour_soy_defatted, Protein Amount: 51.1
Food: http://example.org/Gelatin_desserts_dry_mix_reduced_calorie_with_aspartame_added_phosphorus_potassium_sodium_vitamin_C, Protein Amount: 55.000000000

KeyboardInterrupt: 

In [152]:
# SPARQL query
query = """
PREFIX ex: <http://example.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?food ?proteinAmount ?caloriesAmount ?weightAmount
WHERE {
    ?food ex:hasNutrient ?protein .
    FILTER(STRSTARTS(STR(?protein), CONCAT(STR(ex:), "Protein_")))
    ?protein ex:hasWeight ?proteinAmount .

    ?food ex:hasNutrient ?weight .
    FILTER(STRSTARTS(STR(?weight), CONCAT(STR(ex:), "Weight_")))
    ?weight ex:hasWeight ?weightAmount .

    OPTIONAL {
        ?food ex:hasNutrient ?calorie .
        FILTER(STRSTARTS(STR(?calorie), CONCAT(STR(ex:), "Calories_")))
        ?calorie ex:hasWeight ?caloriesAmount .
    }

    FILTER ((?proteinAmount / ?weightAmount) * 100 > 0 && bound(?caloriesAmount) && (?caloriesAmount / ?weightAmount) * 100 < 300)
}

"""

# Execute query
results = g.query(query)

# Print results
for row in results:
    # Ensure the values are floats for the calculation
    protein_amount = float(row.proteinAmount)
    weight_amount = float(row.weightAmount)
    calories_amount = float(row.caloriesAmount)

    # Calculate the normalized values
    normalized_protein = (protein_amount / weight_amount) * 100
    normalized_calories = (calories_amount / weight_amount) * 100

    # Print the values along with their normalized values
    print(f"Food: {row.food}, Protein: {protein_amount}, Normalized Protein: {normalized_protein}, Calories: {calories_amount}, Normalized Calories: {normalized_calories}")


95.0
Food: http://example.org/Acorn_stew__Apache_, Protein: 6.8, Calories: 95.0


KeyboardInterrupt: 

In [61]:
# SPARQL query
query = """
PREFIX ex: <http://example.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?food ?proteinAmount ?caloriesAmount
WHERE {
    ?food ex:hasNutrient ?protein .
    FILTER(STRSTARTS(STR(?protein), CONCAT(STR(ex:), "Protein_")))
    ?protein ex:hasWeight ?proteinAmount .

    OPTIONAL {
        ?food ex:hasNutrient ?calorie .
        FILTER(STRSTARTS(STR(?calorie), CONCAT(STR(ex:), "Calories_")))
        ?calorie ex:hasWeight ?caloriesAmount .
    }

    FILTER (
        ?proteinAmount > 0 && bound(?caloriesAmount) && ?caloriesAmount > 0 &&
        (?caloriesAmount / ?proteinAmount) < 10
    )
}

"""

# Execute query
results = g.query(query)

# Print results
for row in results:
    print(row.caloriesAmount)
    print(f"Food: {row.food}, Protein: {row.proteinAmount}, Calories: {row.caloriesAmount}")

## An average human of 70kg should have a daily intake of essential amino acids of : Histidine: 980 mg Isoleucine: 1330 mg Leucine: 2940 mg Lysine: 2660 mg Methionine: 1330 mg Phenylalanine: 2310 mg Threonine: 1400 mg Tryptophan: 350 mg Valine: 1680 mg

https://my.clevelandclinic.org/health/articles/22243-amino-acids

In [160]:

# Define the optimized SPARQL query
query = """
PREFIX ex: <http://example.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?food ?weightAmount ?proteinAmount

       (SUM(?histidine) AS ?HistidineAmount)
       (SUM(?isoleucine) AS ?IsoleucineAmount)
       (SUM(?leucine) AS ?LeucineAmount)
       (SUM(?lysine) AS ?LysineAmount)
       (SUM(?methionine) AS ?MethionineAmount)
       (SUM(?phenylalanine) AS ?PhenylalanineAmount)
       (SUM(?threonine) AS ?ThreonineAmount)
       (SUM(?tryptophan) AS ?TryptophanAmount)
       (SUM(?valine) AS ?ValineAmount)
WHERE {
    ?food ex:hasNutrient ?nutrient .
    ?nutrient ex:hasWeight ?dv .

    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Histidine_")), ?dv, 0) AS ?histidine)
    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Isoleucine_")), ?dv, 0) AS ?isoleucine)
    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Leucine_")), ?dv, 0) AS ?leucine)
    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Lysine_")), ?dv, 0) AS ?lysine)
    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Methionine_")), ?dv, 0) AS ?methionine)
    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Phenylalanine_")), ?dv, 0) AS ?phenylalanine)
    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Threonine_")), ?dv, 0) AS ?threonine)
    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Tryptophan_")), ?dv, 0) AS ?tryptophan)
    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Valine_")), ?dv, 0) AS ?valine)

    ?food ex:hasNutrient ?weight .
    FILTER(STRSTARTS(STR(?weight), CONCAT(STR(ex:), "Weight_")))
    ?weight ex:hasWeight ?weightAmount .

    ?food ex:hasNutrient ?protein .
    FILTER(STRSTARTS(STR(?protein), CONCAT(STR(ex:), "Protein_")))
    ?protein ex:hasWeight ?proteinAmount .
}
GROUP BY ?food
HAVING (
    SUM(?histidine) > 0 && SUM(?isoleucine) > 0 && SUM(?leucine) > 0 &&
    SUM(?lysine) > 0 && SUM(?methionine) > 0 && SUM(?phenylalanine) > 0 &&
    SUM(?threonine) > 0 && SUM(?tryptophan) > 0 && SUM(?valine) > 0
)

"""

# Execute query
results = g.query(query)

# Print results
print("Foods with a complete essential amino acid profile (DV > 0):")
count = 0
for row in results:
    count += 1

    # Ensure the values are floats for the calculation
    weight_amount = float(row.weightAmount)

    # Normalize the amino acids
    normalized_protein = (float(row.proteinAmount) / weight_amount) * 100
    normalized_histidine = (float(row.HistidineAmount) / weight_amount) * 100
    normalized_isoleucine = (float(row.IsoleucineAmount) / weight_amount) * 100
    normalized_leucine = (float(row.LeucineAmount) / weight_amount) * 100
    normalized_lysine = (float(row.LysineAmount) / weight_amount) * 100
    normalized_methionine = (float(row.MethionineAmount) / weight_amount) * 100
    normalized_phenylalanine = (float(row.PhenylalanineAmount) / weight_amount) * 100
    normalized_threonine = (float(row.ThreonineAmount) / weight_amount) * 100
    normalized_tryptophan = (float(row.TryptophanAmount) / weight_amount) * 100
    normalized_valine = (float(row.ValineAmount) / weight_amount) * 100

    # Print the results with the normalized values
    print(f"""
    {count}. {row.food}
    Normalized Protein : {normalized_protein}
    Normalized Histidine: {normalized_histidine}
    Normalized Isoleucine: {normalized_isoleucine}
    Normalized Leucine: {normalized_leucine}
    Normalized Lysine: {normalized_lysine}
    Normalized Methionine: {normalized_methionine}
    Normalized Phenylalanine: {normalized_phenylalanine}
    Normalized Threonine: {normalized_threonine}
    Normalized Tryptophan: {normalized_tryptophan}
    Normalized Valine: {normalized_valine}
    """)



Foods with a complete essential amino acid profile (DV > 0):

    1. http://example.org/Acorn_stew__Apache_
    Normalized Protein : 6.800000000000001
    Normalized Histidine: 229.99999999999997
    Normalized Isoleucine: 350.0
    Normalized Leucine: 620.0
    Normalized Lysine: 580.0
    Normalized Methionine: 160.0
    Normalized Phenylalanine: 330.0
    Normalized Threonine: 360.0
    Normalized Tryptophan: 40.0
    Normalized Valine: 390.0
    


In [None]:
import csv

# Define the optimized SPARQL query
query = """
PREFIX ex: <http://example.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?food ?weightAmount ?proteinAmount
       (SUM(?histidine) AS ?HistidineAmount)
       (SUM(?isoleucine) AS ?IsoleucineAmount)
       (SUM(?leucine) AS ?LeucineAmount)
       (SUM(?lysine) AS ?LysineAmount)
       (SUM(?methionine) AS ?MethionineAmount)
       (SUM(?phenylalanine) AS ?PhenylalanineAmount)
       (SUM(?threonine) AS ?ThreonineAmount)
       (SUM(?tryptophan) AS ?TryptophanAmount)
       (SUM(?valine) AS ?ValineAmount)
WHERE {
    ?food ex:hasNutrient ?nutrient .
    ?nutrient ex:hasWeight ?dv .

    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Histidine_")), ?dv, 0) AS ?histidine)
    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Isoleucine_")), ?dv, 0) AS ?isoleucine)
    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Leucine_")), ?dv, 0) AS ?leucine)
    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Lysine_")), ?dv, 0) AS ?lysine)
    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Methionine_")), ?dv, 0) AS ?methionine)
    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Phenylalanine_")), ?dv, 0) AS ?phenylalanine)
    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Threonine_")), ?dv, 0) AS ?threonine)
    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Tryptophan_")), ?dv, 0) AS ?tryptophan)
    BIND(IF(STRSTARTS(STR(?nutrient), CONCAT(STR(ex:), "Valine_")), ?dv, 0) AS ?valine)

    ?food ex:hasNutrient ?weight .
    FILTER(STRSTARTS(STR(?weight), CONCAT(STR(ex:), "Weight_")))
    ?weight ex:hasWeight ?weightAmount .

    ?food ex:hasNutrient ?protein .
    FILTER(STRSTARTS(STR(?protein), CONCAT(STR(ex:), "Protein_")))
    ?protein ex:hasWeight ?proteinAmount .
}
GROUP BY ?food
HAVING (
    SUM(?histidine) > 0 && SUM(?isoleucine) > 0 && SUM(?leucine) > 0 &&
    SUM(?lysine) > 0 && SUM(?methionine) > 0 && SUM(?phenylalanine) > 0 &&
    SUM(?threonine) > 0 && SUM(?tryptophan) > 0 && SUM(?valine) > 0
)
"""

# Execute query
results = gt.query(query)

# Open the CSV file to write the results
with open('C:/Users/laure/OneDrive/Documents/GitHub/KG_project/food_amino_acids.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Write the header row
    writer.writerow([
        "Food", "Normalized Protein", "Normalized Histidine", "Normalized Isoleucine", "Normalized Leucine",
        "Normalized Lysine", "Normalized Methionine", "Normalized Phenylalanine", "Normalized Threonine",
        "Normalized Tryptophan", "Normalized Valine"
    ])

    count = 0
    for row in results:
        count += 1

        # Ensure the values are floats for the calculation
        weight_amount = float(row.weightAmount)

        # Normalize the amino acids
        normalized_protein = (float(row.proteinAmount) / weight_amount) * 100
        normalized_histidine = (float(row.HistidineAmount) / weight_amount) * 100
        normalized_isoleucine = (float(row.IsoleucineAmount) / weight_amount) * 100
        normalized_leucine = (float(row.LeucineAmount) / weight_amount) * 100
        normalized_lysine = (float(row.LysineAmount) / weight_amount) * 100
        normalized_methionine = (float(row.MethionineAmount) / weight_amount) * 100
        normalized_phenylalanine = (float(row.PhenylalanineAmount) / weight_amount) * 100
        normalized_threonine = (float(row.ThreonineAmount) / weight_amount) * 100
        normalized_tryptophan = (float(row.TryptophanAmount) / weight_amount) * 100
        normalized_valine = (float(row.ValineAmount) / weight_amount) * 100

        # Write the data row to the CSV file
        writer.writerow([
            row.food, normalized_protein, normalized_histidine, normalized_isoleucine, normalized_leucine,
            normalized_lysine, normalized_methionine, normalized_phenylalanine, normalized_threonine,
            normalized_tryptophan, normalized_valine
        ])

print("Data saved to 'food_amino_acids.csv'")
