In [None]:
pip install -q sentence_transformers faiss-cpu huggingface_hub

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# Step 1: Load the raw dataset (replace the path with your own copy)
df = pd.read_csv("/content/full_dataset.csv")

# Step 2: Derive the two helper columns in a single pass
#   food_disease_pair - "<food> <disease>" text that gets embedded
#   label             - is_treat minus is_cause (1 recommend, -1 avoid, 0 neutral)
df = df.assign(
    food_disease_pair=df["food_entity"] + " " + df["disease_entity"],
    label=df["is_treat"] - df["is_cause"],
)

# Step 3: Embed every food-disease pair with a pre-trained sentence encoder
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
pair_texts = df["food_disease_pair"].tolist()
embeddings = np.array(model.encode(pair_texts)).astype("float32")

# Step 4: Build a flat L2-distance FAISS index sized to the embedding width
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Step 5: Persist the index and the augmented table for the inference cells
faiss.write_index(index, "food_disease_index.faiss")
df.to_csv("preprocessed_data.csv", index=False)

print("Training complete! FAISS index and preprocessed data saved.")

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# Step 1: Reload the artefacts saved by the index-building cell
INDEX_PATH = "food_disease_index.faiss"
DATA_PATH = "preprocessed_data.csv"
index = faiss.read_index(INDEX_PATH)  # FAISS index over the pair embeddings
df = pd.read_csv(DATA_PATH)  # table whose row order matches the index

# Step 2: Load the encoder used to embed user queries
model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 3: Define the recommendation function
def recommend_foods(disease: str, k: int = 5):
    """
    Recommend foods to eat or avoid based on a disease.

    The disease text is embedded with the module-level ``model``, the ``k``
    nearest entries of the module-level FAISS ``index`` are retrieved, and the
    matching rows of ``df`` are split by their ``label`` column.

    Args:
        disease (str): The disease input by the user.
        k (int): Number of nearest neighbours to retrieve. Rows whose label is
            neither 1 nor -1 are dropped, so fewer than ``k`` foods may be
            returned in total.

    Returns:
        dict: ``{"recommend": [...], "avoid": [...]}`` where each list holds
        the ``food_entity`` values of retrieved rows labelled 1 and -1
        respectively.
    """
    # Make the dtype FAISS expects explicit instead of relying on the encoder default
    query_embedding = np.asarray(model.encode([disease]), dtype="float32")

    # Perform similarity search using FAISS
    _distances, neighbour_ids = index.search(query_embedding, k)

    # FAISS pads the result with -1 when fewer than k neighbours exist.
    # Passing -1 to iloc would silently return the LAST row of df as a match,
    # so the padding is removed before the lookup.
    row_positions = neighbour_ids[0]
    row_positions = row_positions[row_positions >= 0]
    top_recommendations = df.iloc[row_positions]

    # Split by label; local names deliberately differ from the function name
    foods_to_eat = top_recommendations[top_recommendations["label"] == 1]["food_entity"].tolist()
    foods_to_avoid = top_recommendations[top_recommendations["label"] == -1]["food_entity"].tolist()

    return {"recommend": foods_to_eat, "avoid": foods_to_avoid}


In [None]:
pip install groq

In [None]:
from groq import Groq

# Initialize the Groq client.
# The API key is read from the GROQ_API_KEY environment variable so that no
# secret lives in the notebook. Any key that has ever been pasted into a
# shared notebook should be treated as leaked and rotated.
import os

client = Groq(api_key=os.environ["GROQ_API_KEY"])

def _split_reasoning(response):
    """Split a raw model reply into a ``(reasoning, summary)`` pair.

    Everything after the first ``</think>`` is the summary. The reasoning is
    the text between ``<think>`` and ``</think>``; when the opening tag is
    missing, all text before ``</think>`` is used. Without a closing tag the
    whole reply is returned as the summary. ``None`` is treated as an empty
    reply.
    """
    text = response or ""
    before_close, close_tag, after_close = text.partition("</think>")
    if not close_tag:
        return "No reasoning provided.", text.strip()

    _, open_tag, after_open = before_close.partition("<think>")
    reasoning = after_open if open_tag else before_close
    return reasoning.strip(), after_close.strip()


def generate_reasoning(disease: str, recommendations: dict):
    """
    Generate reasoning and summary using Groq's API.

    Sends one chat-completion request through the module-level ``client`` and
    splits the reply on the ``<think>`` / ``</think>`` markers.

    Args:
        disease (str): The disease input by the user.
        recommendations (dict): A dictionary with the keys ``"recommend"`` and
            ``"avoid"``, each holding the foods to mention in the prompt.

    Returns:
        dict: ``{"reasoning": str, "summary": str}``. ``reasoning`` is
        ``"No reasoning provided."`` when the reply has no ``</think>`` tag.
    """
    # Prepare the prompt
    prompt = (
        f"For a patient with {disease}, the system recommends eating {recommendations['recommend']} and avoiding {recommendations['avoid']}. "
        f"Can you explain why these recommendations are made and provide additional medical advice? "
        f"Please provide your reasoning inside <think> tags and the final summary/recommendations after the </think> tag."
    )

    # Generate the response using Groq's API
    completion = client.chat.completions.create(
        model="deepseek-r1-distill-llama-70b",  # Use the desired model
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a helpful, respectful and honest medical assistant. "
                    "Always answer as helpfully as possible, while being safe. "
                    "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. "
                    "Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. "
                    "If you don’t know the answer to a question, please don’t share false information."
                ),
            },
            {"role": "user", "content": prompt},
        ],
        temperature=0.6,  # Control the randomness of the output
        max_tokens=4096,  # Maximum number of tokens to generate
        top_p=0.95,  # Nucleus sampling parameter
        stream=False,  # Set to False for a single response
        stop=None,  # No specific stop tokens
    )

    # The message content may be None; the helper treats that as an empty
    # reply instead of failing on the tag checks.
    reasoning, summary = _split_reasoning(completion.choices[0].message.content)

    return {"reasoning": reasoning, "summary": summary}

In [None]:
# Step 1: Retrieve the nearest food-disease pairs for one example disease
disease = "diabetes"
recommendations = recommend_foods(disease)

# Step 2: Ask the LLM to explain the retrieved lists
result = generate_reasoning(disease, recommendations)

# Step 3: Print the lists first, then the two text sections returned by the LLM
print(f"For {disease}:")
for list_heading, list_key in (("- Foods to Eat:", "recommend"), ("- Foods to Avoid:", "avoid")):
    print(list_heading, recommendations[list_key])
for section_heading, section_key in (
    ("\nReasoning:", "reasoning"),
    ("\nSummary and Recommendations:", "summary"),
):
    print(section_heading)
    print(result[section_key])

In [None]:
pip install -q streamlit

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from groq import Groq
import streamlit as st

# Page-level configuration (title, icon, wide layout)
st.set_page_config(page_title="NutriMumbai AI", page_icon="🍏", layout="wide")

# Stylesheet injected into the page: background, button, text input and heading colours
CUSTOM_CSS = """
    <style>
    .stApp {
        background-color: #f5f5f5;
    }
    .stButton>button {
        background-color: #4CAF50;
        color: white;
        font-size: 16px;
        padding: 10px 24px;
        border-radius: 8px;
    }
    .stTextInput>div>div>input {
        font-size: 16px;
        padding: 10px;
    }
    .stMarkdown h1 {
        color: #4CAF50;
    }
    .stMarkdown h2 {
        color: #2E86C1;
    }
    .stMarkdown h3 {
        color: #2E86C1;
    }
    </style>
    """
# unsafe_allow_html is passed so the <style> block is emitted as raw HTML
st.markdown(CUSTOM_CSS, unsafe_allow_html=True)

# Page header: title, tagline, horizontal rule
st.title("🍏 NutriMumbai AI")
st.markdown("Your AI-powered dietary companion for managing diseases and staying healthy in Mumbai.")
st.markdown("---")

# Step 1: Load preprocessed data, FAISS index and embedding model
@st.cache_resource
def load_data_and_model():
    """Load everything the recommender needs from disk.

    Wrapped in ``st.cache_resource`` so the loaded objects are cached by
    Streamlit rather than rebuilt on every call.

    Returns:
        tuple: ``(df, index, model)`` - the preprocessed table, the FAISS
        index read from ``food_disease_index.faiss`` and the sentence encoder.
    """
    return (
        pd.read_csv("preprocessed_data.csv"),
        faiss.read_index("food_disease_index.faiss"),
        SentenceTransformer("all-MiniLM-L6-v2"),
    )

df, index, model = load_data_and_model()

# Step 2: Define the recommendation function
def recommend_foods(disease: str, k: int = 5):
    """
    Recommend foods to eat or avoid based on a disease.

    The disease text is embedded with the module-level ``model``, the ``k``
    nearest entries of the module-level FAISS ``index`` are retrieved, and the
    matching rows of ``df`` are split by their ``label`` column.

    Args:
        disease (str): The disease input by the user.
        k (int): Number of nearest neighbours to retrieve. Rows whose label is
            neither 1 nor -1 are dropped, so fewer than ``k`` foods may be
            returned in total.

    Returns:
        dict: ``{"recommend": [...], "avoid": [...]}`` where each list holds
        the ``food_entity`` values of retrieved rows labelled 1 and -1
        respectively.
    """
    # Make the dtype FAISS expects explicit instead of relying on the encoder default
    query_embedding = np.asarray(model.encode([disease]), dtype="float32")

    # Perform similarity search using FAISS
    _distances, neighbour_ids = index.search(query_embedding, k)

    # FAISS pads the result with -1 when fewer than k neighbours exist.
    # Passing -1 to iloc would silently return the LAST row of df as a match,
    # so the padding is removed before the lookup.
    row_positions = neighbour_ids[0]
    row_positions = row_positions[row_positions >= 0]
    top_recommendations = df.iloc[row_positions]

    # Split by label; local names deliberately differ from the function name
    foods_to_eat = top_recommendations[top_recommendations["label"] == 1]["food_entity"].tolist()
    foods_to_avoid = top_recommendations[top_recommendations["label"] == -1]["food_entity"].tolist()

    return {"recommend": foods_to_eat, "avoid": foods_to_avoid}

# Step 3: Initialize the Groq client
# The API key is read from the GROQ_API_KEY environment variable so that no
# secret lives in the source. Any key that has ever been committed in code
# should be treated as leaked and rotated.
import os

client = Groq(api_key=os.environ["GROQ_API_KEY"])

# Step 4: Define the reasoning function
def _split_reasoning(response):
    """Split a raw model reply into a ``(reasoning, summary)`` pair.

    Everything after the first ``</think>`` is the summary. The reasoning is
    the text between ``<think>`` and ``</think>``; when the opening tag is
    missing, all text before ``</think>`` is used. Without a closing tag the
    whole reply is returned as the summary. ``None`` is treated as an empty
    reply.
    """
    text = response or ""
    before_close, close_tag, after_close = text.partition("</think>")
    if not close_tag:
        return "No reasoning provided.", text.strip()

    _, open_tag, after_open = before_close.partition("<think>")
    reasoning = after_open if open_tag else before_close
    return reasoning.strip(), after_close.strip()


def generate_reasoning(disease: str, recommendations: dict):
    """
    Generate reasoning and summary using Groq's API.

    Sends one chat-completion request through the module-level ``client`` and
    splits the reply on the ``<think>`` / ``</think>`` markers.

    Args:
        disease (str): The disease input by the user.
        recommendations (dict): A dictionary with the keys ``"recommend"`` and
            ``"avoid"``, each holding the foods to mention in the prompt.

    Returns:
        dict: ``{"reasoning": str, "summary": str}``. ``reasoning`` is
        ``"No reasoning provided."`` when the reply has no ``</think>`` tag.
    """
    # Prepare the prompt
    prompt = (
        f"For a patient with {disease}, the system recommends eating {recommendations['recommend']} and avoiding {recommendations['avoid']}. "
        f"Can you explain why these recommendations are made and provide additional medical advice? "
        f"Please provide your reasoning inside <think> tags and the final summary/recommendations after the </think> tag."
    )

    # Generate the response using Groq's API
    completion = client.chat.completions.create(
        model="deepseek-r1-distill-llama-70b",  # Use the desired model
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a helpful, respectful and honest medical assistant. "
                    "Always answer as helpfully as possible, while being safe. "
                    "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. "
                    "Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. "
                    "If you don’t know the answer to a question, please don’t share false information."
                ),
            },
            {"role": "user", "content": prompt},
        ],
        temperature=0.6,  # Control the randomness of the output
        max_tokens=4096,  # Maximum number of tokens to generate
        top_p=0.95,  # Nucleus sampling parameter
        stream=False,  # Set to False for a single response
        stop=None,  # No specific stop tokens
    )

    # The message content may be None; the helper treats that as an empty
    # reply instead of failing on the tag checks.
    reasoning, summary = _split_reasoning(completion.choices[0].message.content)

    return {"reasoning": reasoning, "summary": summary}

# Step 5: Sidebar controls - disease text box and neighbour-count slider
sidebar = st.sidebar
sidebar.title("Settings")
disease = sidebar.text_input("Enter the disease (e.g., diabetes):", "diabetes")
k = sidebar.slider("Number of recommendations:", min_value=1, max_value=10, value=5)

# Step 6: On click, run retrieval + LLM explanation and render the results
if sidebar.button("Get Recommendations"):
    with st.spinner("Generating recommendations..."):
        recommendations = recommend_foods(disease, k)
        result = generate_reasoning(disease, recommendations)

    # Two side-by-side bullet lists: foods to eat on the left, foods to avoid on the right
    st.subheader(f"Recommendations for {disease}:")
    food_panels = (
        ("### 🍎 Foods to Eat", "recommend"),
        ("### 🚫 Foods to Avoid", "avoid"),
    )
    for column, (panel_heading, panel_key) in zip(st.columns(2), food_panels):
        with column:
            st.markdown(panel_heading)
            for food in recommendations[panel_key]:
                st.markdown(f"- {food}")

    # Text sections returned by generate_reasoning, each preceded by a rule
    text_sections = (
        ("Reasoning", "reasoning"),
        ("Summary and Recommendations", "summary"),
    )
    for section_heading, section_key in text_sections:
        st.markdown("---")
        st.subheader(section_heading)
        st.markdown(result[section_key])

# Footer
st.markdown("---")
st.markdown("Built with ❤️ by NutriMumbai AI")

# Knowledge Graph Based Model

In [None]:
import pandas as pd

# Load the dataset
# NOTE: this rebinds `df` to the raw table, which has no `label` column; reload
# preprocessed_data.csv before calling recommend_foods again in this kernel.
df = pd.read_csv("/content/full_dataset.csv")


def _extract_relationships(frame):
    """Return (food, relationship, disease) triples from ``frame`` in row order.

    A row yields a ``"causes"`` triple when ``is_cause == 1`` and a
    ``"treats"`` triple when ``is_treat == 1``; a row with both flags set
    yields both, causes first.

    Columns are zipped inside a function instead of looping with
    ``for index, row in df.iterrows()`` at cell level: that loop overwrote the
    notebook-wide FAISS ``index`` and the ``disease`` variable.
    """
    triples = []
    columns = (frame["food_entity"], frame["disease_entity"], frame["is_cause"], frame["is_treat"])
    for food, disease_name, is_cause, is_treat in zip(*columns):
        if is_cause == 1:
            triples.append((food, "causes", disease_name))
        if is_treat == 1:
            triples.append((food, "treats", disease_name))
    return triples


# Collect the relationships and convert them into a DataFrame
relationships = _extract_relationships(df)
kg_df = pd.DataFrame(relationships, columns=["source", "relationship", "target"])

kg_df

In [None]:
# Write the (source, relationship, target) triples to CSV, omitting the row index
kg_df.to_csv("knowledge_graph_relationships.csv", index=False)

In [None]:
pip install networkx

In [None]:
import networkx as nx

# Create a directed graph
G = nx.DiGraph()

# Add one food -> disease edge per triple, in row order.
# A DiGraph holds a single edge per (source, target) pair, so when the same
# pair appears with both relations the later row's `relationship` wins.
# Columns are zipped instead of using `for index, row in kg_df.iterrows()`,
# because that loop variable overwrote the notebook-wide FAISS `index`.
for source, relationship, target in zip(kg_df["source"], kg_df["relationship"], kg_df["target"]):
    G.add_edge(source, target, relationship=relationship)

# Visualize the graph (optional)
import matplotlib.pyplot as plt
nx.draw(G, with_labels=True, font_weight="bold")
plt.show()

In [None]:
# Show the graph object's repr as the cell output
G

In [None]:
import pickle

# Serialise the graph object G to knowledge_graph.pkl (binary mode)
with open("knowledge_graph.pkl", "wb") as graph_file:
    pickle.dump(G, graph_file)

print("Knowledge graph saved as 'knowledge_graph.pkl'")

In [None]:
# Collect the source node of every "treats" edge that points at "diabetes", then print one per line
treating_foods = [
    food
    for food, disease_node, attrs in G.edges(data=True)
    if attrs["relationship"] == "treats" and disease_node == "diabetes"
]
for food in treating_foods:
    print(food)

In [None]:
# Collect the source node of every "causes" edge that points at "diabetes", then print one per line
causing_foods = [
    food
    for food, disease_node, attrs in G.edges(data=True)
    if attrs["relationship"] == "causes" and disease_node == "diabetes"
]
for food in causing_foods:
    print(food)

In [None]:
# (rows, columns) of the currently loaded dataset
df.shape

In [None]:
# Display the dataset as the cell output
df

In [None]:
# Rows where the food is not marked as a cause.
# The comparison must be applied to the column: `'is_cause' == 0` compares the
# string itself with 0, evaluates to False, and `df[False]` raises KeyError.
df[df['is_cause'] == 0]

# Extracted Features From Raw Dataset

In [None]:
# Fetch the food-disease dataset repository into the current working directory
!git clone https://github.com/gjorgjinac/food-disease-dataset.git

In [None]:
import pandas as pd
import os

# Define the path to the dataset
dataset_path = "/content/food-disease-dataset/splits"

# File stems read from every fold directory, in this order
SPLIT_NAMES = ("train", "val", "test")


def _load_fold_splits(folds_dir, n_folds=10):
    """Read ``train.csv``, ``val.csv`` and ``test.csv`` from each fold directory.

    Args:
        folds_dir (str): Directory containing ``fold0`` ... ``fold{n_folds-1}``.
        n_folds (int): Number of fold directories to read (10 is assumed).

    Returns:
        dict: ``{"train": [...], "val": [...], "test": [...]}`` with one
        DataFrame per fold in fold order.
    """
    frames = {split: [] for split in SPLIT_NAMES}
    for fold in range(n_folds):
        fold_path = os.path.join(folds_dir, f"fold{fold}")
        for split in SPLIT_NAMES:
            frames[split].append(pd.read_csv(os.path.join(fold_path, f"{split}.csv")))
    return frames


# Gather every fold of the cause task first, then every fold of the treat task
all_train_data = []
all_val_data = []
all_test_data = []
for relation_dir in ("cause_folds", "treat_folds"):
    fold_frames = _load_fold_splits(os.path.join(dataset_path, relation_dir))
    all_train_data.extend(fold_frames["train"])
    all_val_data.extend(fold_frames["val"])
    all_test_data.extend(fold_frames["test"])

# Combine all data into single DataFrames
# NOTE(review): if these directories are cross-validation folds of one dataset,
# concatenating them repeats rows and can place one fold's test rows in another
# fold's train split - confirm before trusting metrics computed on these frames.
combined_train_data = pd.concat(all_train_data, ignore_index=True)
combined_val_data = pd.concat(all_val_data, ignore_index=True)
combined_test_data = pd.concat(all_test_data, ignore_index=True)

# Save the combined datasets (optional)
combined_train_data.to_csv("combined_train_data.csv", index=False)
combined_val_data.to_csv("combined_val_data.csv", index=False)
combined_test_data.to_csv("combined_test_data.csv", index=False)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Columns used as model inputs
TRAIN_FEATURE_COLUMNS = [
    "bert_cause_cs_pairs",
    "bert_treat_cs_pairs",
    "roberta_cause_cs_pairs",
    "roberta_treat_cs_pairs",
    "biobert_cause_cs_pairs",
    "biobert_treat_cs_pairs",
]

# Inputs and target for training; swap the target for "is_treat" to model the other relation
train_features = combined_train_data[TRAIN_FEATURE_COLUMNS]
train_target = combined_train_data["is_cause"]

# Fit a Random Forest with a fixed seed; the fitted estimator is the cell output
clf = RandomForestClassifier(random_state=42)
clf.fit(train_features, train_target)


In [None]:
# Score the fitted classifier on the validation split
VAL_FEATURE_COLUMNS = [
    "bert_cause_cs_pairs",
    "bert_treat_cs_pairs",
    "roberta_cause_cs_pairs",
    "roberta_treat_cs_pairs",
    "biobert_cause_cs_pairs",
    "biobert_treat_cs_pairs",
]
val_features = combined_val_data[VAL_FEATURE_COLUMNS]
val_target = combined_val_data["is_cause"]  # or "is_treat", matching the training target

val_pred = clf.predict(val_features)
print("Validation Results:")
val_report = classification_report(val_target, val_pred)
print(val_report)


In [None]:

# Score the fitted classifier on the test split
TEST_FEATURE_COLUMNS = [
    "bert_cause_cs_pairs",
    "bert_treat_cs_pairs",
    "roberta_cause_cs_pairs",
    "roberta_treat_cs_pairs",
    "biobert_cause_cs_pairs",
    "biobert_treat_cs_pairs",
]
test_features = combined_test_data[TEST_FEATURE_COLUMNS]
test_target = combined_test_data["is_cause"]  # or "is_treat", matching the training target

test_pred = clf.predict(test_features)
print("Test Results:")
test_report = classification_report(test_target, test_pred)
print(test_report)

In [None]:
import pickle

# Serialise the fitted Random Forest to rf_model.pkl (binary mode)
with open("rf_model.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [None]:
import pickle
import pandas as pd
# Load the trained model
# NOTE: pickle.load can execute arbitrary code from the file; only load pickles
# that were produced by this notebook.
with open("rf_model.pkl", "rb") as f:
    clf = pickle.load(f)

# Example: User inputs a disease
user_disease = "asthma"

combined_train_data = pd.read_csv('combined_train_data.csv')

# Get all food-disease pairs for the user's disease.
# .copy() makes this an independent frame, so adding the prediction column
# below does not write into a slice of combined_train_data.
food_disease_pairs = combined_train_data[combined_train_data["term2"] == user_disease].copy()

# Select features for inference
inference_features = food_disease_pairs[[
    "bert_cause_cs_pairs", "bert_treat_cs_pairs",
    "roberta_cause_cs_pairs", "roberta_treat_cs_pairs",
    "biobert_cause_cs_pairs", "biobert_treat_cs_pairs"
]]

# Predict relationships; predict() is skipped when the disease has no rows
# because it rejects an empty feature matrix.
predictions = clf.predict(inference_features) if len(food_disease_pairs) else []

# Add predictions to the DataFrame
food_disease_pairs["prediction"] = predictions

# Generate recommendations (prediction 1 = predicted cause -> avoid).
# The lists are not called `recommend_foods` / `avoid_foods`: assigning a list
# to `recommend_foods` would replace the function of that name in this kernel.
foods_to_eat = food_disease_pairs[food_disease_pairs["prediction"] == 0]["term1"].tolist()
foods_to_avoid = food_disease_pairs[food_disease_pairs["prediction"] == 1]["term1"].tolist()

# Display recommendations
print(f"For {user_disease}:")
print("- Foods to Eat:", foods_to_eat)
print("- Foods to Avoid:", foods_to_avoid)

In [None]:
pip list

In [None]:
!pip install wikipedia-api


In [None]:
import wikipediaapi
# English-language Wikipedia client with an explicit user agent
wiki_wiki = wikipediaapi.Wikipedia(
    user_agent='MyProjectName (merlin@example.com)',
    language='en',
)

# Handle for the example article
page_py = wiki_wiki.page('Python_(programming_language)')

In [None]:
# Existence check for a real article
page_py = wiki_wiki.page('Asthma')
asthma_exists = page_py.exists()
print("Page - Exists: %s" % asthma_exists)
# Page - Exists: True

# Existence check for a title that should not exist
page_missing = wiki_wiki.page('NonExistingPageWithStrangeName')
missing_exists = page_missing.exists()
print("Page - Exists: %s" % missing_exists)
# Page - Exists: False

In [None]:
# Recreate the client with the WIKI extract format
wiki_options = {
    "user_agent": 'MyProjectName (merlin@example.com)',
    "language": 'en',
    "extract_format": wikipediaapi.ExtractFormat.WIKI,
}
wiki_wiki = wikipediaapi.Wikipedia(**wiki_options)

# Fetch the Asthma article and print its text
p_wiki = wiki_wiki.page("Asthma")
asthma_text = p_wiki.text
print(asthma_text)

