In [10]:
# Import libraries
import json
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

print(" Libraries imported successfully!")

 Libraries imported successfully!


[nltk_data] Downloading package punkt to /home/basar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/basar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 1. Load Processed Datasets

In [None]:
# Load all processed datasets
print("üìÇ Loading datasets...\n")

# Directory that holds the pre-processed CSV files, relative to the notebook.
PROCESSED_DIR = '../datasets/processed'


def load_processed_csv(filename, label, unit='records', missing_label=None,
                       base_dir=PROCESSED_DIR):
    """Read one processed CSV, falling back to an empty DataFrame on failure.

    Parameters
    ----------
    filename : str
        File name inside ``base_dir``.
    label : str
        Name printed in the success line.
    unit : str
        Noun printed after the row count in the success line.
    missing_label : str, optional
        Name printed in the warning line; defaults to ``f"{label} dataset"``.
    base_dir : str
        Directory containing ``filename``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, or an empty frame when it is missing or unreadable,
        so downstream cells can keep using their ``.empty`` guards.
    """
    if missing_label is None:
        missing_label = f"{label} dataset"

    try:
        frame = pd.read_csv(f"{base_dir}/{filename}")
    except FileNotFoundError:
        print(f"‚ö† {missing_label} not found")
        return pd.DataFrame()
    except Exception as exc:
        # Still best-effort, but say what actually went wrong instead of
        # claiming the file is missing. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt / SystemExit propagate.
        print(f"‚ö† {missing_label} could not be loaded ({type(exc).__name__}: {exc})")
        return pd.DataFrame()

    print(f"‚úì {label}: {len(frame)} {unit}")
    return frame


df_nutrition = load_processed_csv('nutrition_processed.csv', 'Nutrition')
df_exercise = load_processed_csv('exercise_processed.csv', 'Exercise')
df_qa = load_processed_csv('medical_qa_processed.csv', 'Medical Q&A')
df_pregnancy = load_processed_csv('pregnancy_processed.csv', 'Pregnancy')
df_womens = load_processed_csv('womens_health_processed.csv', "Women's Health")
knowledge_base = load_processed_csv(
    'knowledge_base.csv', 'Knowledge Base',
    unit='entries', missing_label='Knowledge Base',
)

print("\n‚úÖ Datasets loaded successfully!")

üìÇ Loading datasets...

‚úì Nutrition: 8 records
‚úì Exercise: 35 records
‚úì Medical Q&A: 8 records
‚úì Pregnancy: 6 records
‚úì Women's Health: 6 records
‚úì Knowledge Base: 35 entries

‚úÖ Datasets loaded successfully!


## 2. Train Q&A Model (TF-IDF + Cosine Similarity)

In [None]:
print("ü§ñ Training Q&A Model...\n")

if not knowledge_base.empty:
    # Prepare Q&A data
    questions = knowledge_base['question'].fillna('').tolist()
    answers = knowledge_base['answer'].fillna('').tolist()
    categories = knowledge_base['category'].fillna('general').tolist()
    
    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer(
        max_features=500,
        stop_words='english',
        ngram_range=(1, 2),
        min_df=1
    )
    
    # Fit on questions
    question_vectors = vectorizer.fit_transform(questions)
    
    print(f"‚úì Vectorizer trained on {len(questions)} questions")
    print(f"‚úì Vocabulary size: {len(vectorizer.vocabulary_)}")
    
    # Test the model
    test_questions = [
        "How to improve health?",
        "What is healthy blood sugar?",
        "Best foods for heart?"
    ]
    
    print("\nüß™ Testing Q&A Model:\n")
    for test_q in test_questions:
        test_vec = vectorizer.transform([test_q])
        similarities = cosine_similarity(test_vec, question_vectors)[0]
        best_match_idx = similarities.argmax()
        
        if similarities[best_match_idx] > 0.1:
            print(f"Q: {test_q}")
            print(f"A: {answers[best_match_idx][:100]}...")
            print(f"Similarity: {similarities[best_match_idx]:.2f}\n")
    
    # Save model
    joblib.dump(vectorizer, '../models/qa_vectorizer.pkl')
    
    # Save Q&A database
    qa_db = {
        'questions': questions,
        'answers': answers,
        'categories': categories
    }
    joblib.dump(qa_db, '../models/qa_database.pkl')
    
    print("‚úÖ Q&A Model saved successfully!")
else:
    print(" No knowledge base available for training")

ü§ñ Training Q&A Model...

‚úì Vectorizer trained on 35 questions
‚úì Vocabulary size: 108

üß™ Testing Q&A Model:

Q: How to improve health?
A: To improve health: eat balanced diet, exercise regularly, get 7-8 hours sleep, stay hydrated, manage...
Similarity: 1.00

Q: What is healthy blood sugar?
A: Normal fasting blood sugar: 70-100 mg/dL. Prediabetes: 100-125. Diabetes: 126+. After meals: below 1...
Similarity: 0.83

Q: Best foods for heart?
A: Heart-healthy foods include: salmon, walnuts, berries, oats, dark chocolate, leafy greens, avocado, ...
Similarity: 0.52

‚úÖ Q&A Model saved successfully!


## 3. Train Calorie Prediction Model

In [None]:
print("üçΩ Training Calorie Prediction Model...\n")

if not df_nutrition.empty and 'protein' in df_nutrition.columns:
    # Prepare features
    X = df_nutrition[['protein', 'carbs', 'fat']]
    y = df_nutrition['calories']
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train model
    calorie_model = RandomForestRegressor(n_estimators=100, random_state=42)
    calorie_model.fit(X_train, y_train)
    
    # Evaluate
    y_pred = calorie_model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    print(f"‚úì Model trained on {len(X_train)} samples")
    print(f"‚úì R¬≤ Score: {r2:.4f}")
    print(f"‚úì RMSE: {rmse:.2f} calories\n")
    
    # Test predictions
    print("üß™ Test Predictions:\n")
    for i in range(min(3, len(X_test))):
        print(f"Protein: {X_test.iloc[i]['protein']}g, Carbs: {X_test.iloc[i]['carbs']}g, Fat: {X_test.iloc[i]['fat']}g")
        print(f"Predicted: {y_pred[i]:.0f} cal, Actual: {y_test.iloc[i]:.0f} cal\n")
    
    # Save model
    joblib.dump(calorie_model, '../models/calorie_predictor.pkl')
    print("‚úÖ Calorie Prediction Model saved!")
else:
    print(" Insufficient nutrition data for training")

üçΩ Training Calorie Prediction Model...



‚úì Model trained on 6 samples
‚úì R¬≤ Score: 0.6641
‚úì RMSE: 35.64 calories

üß™ Test Predictions:

Protein: 31.0g, Carbs: 0.0g, Fat: 3.6g
Predicted: 140 cal, Actual: 165 cal

Protein: 3.4g, Carbs: 5.0g, Fat: 1.0g
Predicted: 86 cal, Actual: 42 cal

‚úÖ Calorie Prediction Model saved!


## 4. Train Exercise Recommendation Model

In [None]:
print("üèÉ Training Exercise Recommendation Model...\n")

if not df_exercise.empty and 'body_weight' in df_exercise.columns:
    # Prepare features
    X = df_exercise[['body_weight', 'calories_per_hour']]
    
    # Encode exercise names
    le_exercise = LabelEncoder()
    y = le_exercise.fit_transform(df_exercise['exercise_name'])
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train model
    exercise_model = RandomForestClassifier(n_estimators=100, random_state=42)
    exercise_model.fit(X_train, y_train)
    
    # Evaluate
    y_pred = exercise_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"‚úì Model trained on {len(X_train)} samples")
    print(f"‚úì Accuracy: {accuracy:.4f}\n")
    
    # Save model
    joblib.dump(exercise_model, '../models/exercise_recommender.pkl')
    joblib.dump(le_exercise, '../models/exercise_encoder.pkl')
    print("‚úÖ Exercise Recommendation Model saved!")
else:
    print(" Insufficient exercise data for training")

üèÉ Training Exercise Recommendation Model...

‚úì Model trained on 28 samples
‚úì Accuracy: 0.0000

‚úÖ Exercise Recommendation Model saved!


## 5. Create Health Advisor Knowledge Base

In [15]:
print("üè• Creating Health Advisor Knowledge Base...\n")

# Combine all health knowledge
health_knowledge = {
    'bmi_ranges': {
        'underweight': {'range': (0, 18.5), 'advice': 'Increase calorie intake with nutritious foods. Consult a nutritionist.'},
        'normal': {'range': (18.5, 24.9), 'advice': 'Maintain current healthy lifestyle and balanced diet.'},
        'overweight': {'range': (25, 29.9), 'advice': 'Reduce calorie intake, increase physical activity. Consider portion control.'},
        'obese': {'range': (30, 100), 'advice': 'Consult healthcare provider. Focus on gradual weight loss through diet and exercise.'}
    },
    'blood_pressure': {
        'normal': {'range': {'systolic': (90, 120), 'diastolic': (60, 80)}, 'advice': 'Maintain healthy lifestyle'},
        'elevated': {'range': {'systolic': (120, 129), 'diastolic': (60, 80)}, 'advice': 'Reduce salt, exercise regularly'},
        'high': {'range': {'systolic': (130, 200), 'diastolic': (80, 120)}, 'advice': 'Consult doctor, medication may be needed'}
    },
    'blood_glucose': {
        'normal': {'range': (70, 100), 'advice': 'Maintain balanced diet and regular exercise'},
        'prediabetes': {'range': (100, 125), 'advice': 'Reduce sugar intake, increase physical activity, monitor regularly'},
        'diabetes': {'range': (126, 500), 'advice': 'Consult doctor immediately, medication and diet control essential'}
    },
    'daily_water': {
        'formula': 'body_weight_kg * 0.033',  # liters
        'min': 2.0,
        'max': 4.0
    },
    'daily_steps': {
        'sedentary': 5000,
        'moderate': 7500,
        'active': 10000,
        'very_active': 12500
    },
    'sleep_hours': {
        'adult': (7, 9),
        'teenager': (8, 10),
        'child': (9, 12)
    }
}

# Add pregnancy knowledge
if not df_pregnancy.empty:
    pregnancy_dict = df_pregnancy.set_index('week').to_dict('index')
    health_knowledge['pregnancy'] = pregnancy_dict

# Add women's health knowledge
if not df_womens.empty:
    womens_health_dict = df_womens.set_index('symptom').to_dict('index')
    health_knowledge['womens_health'] = womens_health_dict

# Save knowledge base
with open('../models/health_knowledge.json', 'w') as f:
    json.dump(health_knowledge, f, indent=2)

print("‚úÖ Health Knowledge Base created!")
print(f"\nüìä Knowledge Base contains:")
print(f"  - BMI Guidelines: {len(health_knowledge['bmi_ranges'])} categories")
print(f"  - Blood Pressure: {len(health_knowledge['blood_pressure'])} categories")
print(f"  - Blood Glucose: {len(health_knowledge['blood_glucose'])} categories")
print(f"  - Daily Water Calculator")
print(f"  - Step Goals: {len(health_knowledge['daily_steps'])} levels")
print(f"  - Sleep Recommendations")
if 'pregnancy' in health_knowledge:
    print(f"  - Pregnancy Guidance: {len(health_knowledge['pregnancy'])} weeks")
if 'womens_health' in health_knowledge:
    print(f"  - Women's Health: {len(health_knowledge['womens_health'])} symptoms")

üè• Creating Health Advisor Knowledge Base...

‚úÖ Health Knowledge Base created!

üìä Knowledge Base contains:
  - BMI Guidelines: 4 categories
  - Blood Pressure: 3 categories
  - Blood Glucose: 3 categories
  - Daily Water Calculator
  - Step Goals: 4 levels
  - Sleep Recommendations
  - Pregnancy Guidance: 6 weeks
  - Women's Health: 6 symptoms


## 6. Model Summary & Evaluation

In [18]:

# Closing summary: what was trained, which artifacts were written, and what
# comes next. Each section is a heading followed by its pre-formatted lines.
summary_sections = (
    ("\nüì¶ Trained Models:", (
        "  1. Q&A Model (TF-IDF + Cosine Similarity)",
        "  2. Calorie Prediction Model (Random Forest Regressor)",
        "  3. Exercise Recommendation Model (Random Forest Classifier)",
        "  4. Health Knowledge Base (Rule-based System)",
    )),
    ("\nüíæ Saved Files:", (
        "  - ../models/qa_vectorizer.pkl",
        "  - ../models/qa_database.pkl",
        "  - ../models/calorie_predictor.pkl",
        "  - ../models/exercise_recommender.pkl",
        "  - ../models/exercise_encoder.pkl",
        "  - ../models/health_knowledge.json",
    )),
    ("\nüìù Next Steps:", (
        "  1. Test models using test_model.py",
        "  2. Deploy backend API using Flask",
        "  3. Build frontend chatbot UI",
        "  4. Integrate with HealthNest app",
    )),
)

print("‚úÖ HealthNest AI Model Training Complete!")
for section_heading, section_lines in summary_sections:
    print(section_heading)
    for summary_line in section_lines:
        print(summary_line)

‚úÖ HealthNest AI Model Training Complete!

üì¶ Trained Models:
  1. Q&A Model (TF-IDF + Cosine Similarity)
  2. Calorie Prediction Model (Random Forest Regressor)
  3. Exercise Recommendation Model (Random Forest Classifier)
  4. Health Knowledge Base (Rule-based System)

üíæ Saved Files:
  - ../models/qa_vectorizer.pkl
  - ../models/qa_database.pkl
  - ../models/calorie_predictor.pkl
  - ../models/exercise_recommender.pkl
  - ../models/exercise_encoder.pkl
  - ../models/health_knowledge.json

üìù Next Steps:
  1. Test models using test_model.py
  2. Deploy backend API using Flask
  3. Build frontend chatbot UI
  4. Integrate with HealthNest app


## 7. Save Model Metadata

In [19]:
# Save model metadata


def _row_count(frame):
    """Return the number of rows in ``frame``, or 0 when it is empty."""
    return 0 if frame.empty else len(frame)


# Training results are looked up by name in the notebook namespace, so each
# value reflects whichever cell assigned that name most recently; names that
# were never assigned fall back to 0 / None.
notebook_vars = locals()

qa_summary = {
    'type': 'TF-IDF + Cosine Similarity',
    'knowledge_base_size': _row_count(knowledge_base),
    'vocabulary_size': len(vectorizer.vocabulary_) if 'vectorizer' in notebook_vars else 0,
}

calorie_summary = {
    'type': 'Random Forest Regressor',
    'training_samples': len(X_train) if 'X_train' in notebook_vars else 0,
    'r2_score': notebook_vars.get('r2'),
}

exercise_summary = {
    'type': 'Random Forest Classifier',
    'accuracy': notebook_vars.get('accuracy'),
}

dataset_sizes = {
    'nutrition': _row_count(df_nutrition),
    'exercise': _row_count(df_exercise),
    'medical_qa': _row_count(df_qa),
    'pregnancy': _row_count(df_pregnancy),
    'womens_health': _row_count(df_womens),
}

metadata = {
    'project': 'HealthNest AI Health Assistant',
    'version': '1.0.0',
    'date_trained': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'models': {
        'qa_model': qa_summary,
        'calorie_predictor': calorie_summary,
        'exercise_recommender': exercise_summary,
    },
    'datasets': dataset_sizes,
}

with open('../models/model_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print("‚úÖ Model metadata saved!")
print("\nüìÑ View metadata: ../models/model_metadata.json")

‚úÖ Model metadata saved!

üìÑ View metadata: ../models/model_metadata.json
