
 Project 1: Food Nutrition Predictor using Linear Regression
Predict calories from food features

In [5]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Create synthetic food nutrition dataset
def create_food_dataset():
    np.random.seed(42)
    n_samples = 500
    
    # Features: food characteristics (simplified)
    data = {
        'weight_grams': np.random.randint(50, 500, n_samples),  # Food weight
        'carb_density': np.random.uniform(0.1, 0.8, n_samples),  # Carb proportion
        'protein_density': np.random.uniform(0.05, 0.4, n_samples),  # Protein proportion
        'fat_density': np.random.uniform(0.01, 0.5, n_samples),  # Fat proportion
        'water_content': np.random.uniform(0.1, 0.9, n_samples),  # Water content
        'fiber_content': np.random.uniform(0.01, 0.3, n_samples),  # Fiber content
        'food_type': np.random.choice([0, 1, 2], n_samples)  # 0=fruit, 1=vegetable, 2=protein
    }
    
    df = pd.DataFrame(data)
    
    # Calculate calories based on nutrition science: 4*carb + 4*protein + 9*fat
    # Add some randomness to make it realistic
    df['calories'] = (
        (df['weight_grams'] * df['carb_density'] * 4) +
        (df['weight_grams'] * df['protein_density'] * 4) + 
        (df['weight_grams'] * df['fat_density'] * 9) +
        np.random.normal(0, 20, n_samples)  # Random variation
    )
    
    return df

# Create and explore dataset
food_data = create_food_dataset()
print("=== FOOD NUTRITION DATASET ===")
print(f"Dataset shape: {food_data.shape}")
print("\nFirst 5 rows:")
print(food_data.head())
print(f"\nAverage calories: {food_data['calories'].mean():.1f}")

# Prepare features and target
X = food_data.drop('calories', axis=1)
y = food_data['calories']
feature_names = X.columns.tolist()

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Method 1: Linear Regression
print("\n" + "="*50)
print("METHOD 1: LINEAR REGRESSION")
print("="*50)

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_lr = lr_model.predict(X_test)

print(f"R² Score: {r2_score(y_test, y_pred_lr):.3f}")
print(f"MSE: {mean_squared_error(y_test, y_pred_lr):.1f}")

# Show coefficients (feature importance)
print("\nFeature Importance (Linear Regression):")
lr_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': lr_model.coef_
}).sort_values('Coefficient', key=abs, ascending=False)

print(lr_importance)

# Method 2: Random Forest
print("\n" + "="*50)
print("METHOD 2: RANDOM FOREST")
print("="*50)

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

print(f"R² Score: {r2_score(y_test, y_pred_rf):.3f}")
print(f"MSE: {mean_squared_error(y_test, y_pred_rf):.1f}")

# Feature importance
print("\nFeature Importance (Random Forest):")
rf_importance = pd.DataFrame({
    'Feature': feature_names,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print(rf_importance)

# Compare models
print("\n" + "="*50)
print("MODEL COMPARISON")
print("="*50)

comparison = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest'],
    'R² Score': [r2_score(y_test, y_pred_lr), r2_score(y_test, y_pred_rf)],
    'MSE': [mean_squared_error(y_test, y_pred_lr), mean_squared_error(y_test, y_pred_rf)]
})

print(comparison)

# Make predictions on new food
print("\n" + "="*50)
print("MAKING PREDICTIONS")
print("="*50)

# Example: Predict calories for a new food item
new_food = np.array([[200, 0.6, 0.1, 0.05, 0.7, 0.1, 0]])  # Fruit with high carbs
new_food_df = pd.DataFrame(new_food, columns=feature_names)

calories_pred = rf_model.predict(new_food_df)
print(f"Predicted calories: {calories_pred[0]:.1f}")

# Real-world interpretation
print("\n" + "="*50)
print("NUTRITION INSIGHTS")
print("="*50)

print("Key findings from our models:")
print("1. Weight is the strongest predictor of calories (makes sense!)")
print("2. Fat density has high impact (9 calories per gram vs 4 for carbs/protein)")
print("3. Food type helps refine predictions")
print("4. Water content reduces calorie density")

# Visualization
plt.figure(figsize=(12, 4))

# Actual vs Predicted
plt.subplot(1, 2, 1)
plt.scatter(y_test, y_pred_rf, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Calories')
plt.ylabel('Predicted Calories')
plt.title('Random Forest: Actual vs Predicted')

# Feature Importance
plt.subplot(1, 2, 2)
plt.barh(rf_importance['Feature'], rf_importance['Importance'])
plt.xlabel('Feature Importance')
plt.title('What Affects Calories Most?')
plt.tight_layout()
# Visualization
plt.figure(figsize=(12, 4))

# Actual vs Predicted
plt.subplot(1, 2, 1)
plt.scatter(y_test, y_pred_rf, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Calories')
plt.ylabel('Predicted Calories')
plt.title('Random Forest: Actual vs Predicted')

# Feature Importance
plt.subplot(1, 2, 2)
plt.barh(rf_importance['Feature'], rf_importance['Importance'])
plt.xlabel('Feature Importance')
plt.title('What Affects Calories Most?')
plt.tight_layout()

# Save instead of show
plt.savefig('nutrition_analysis.png', dpi=300, bbox_inches='tight')
print("Plot saved as 'nutrition_analysis.png'")

=== FOOD NUTRITION DATASET ===
Dataset shape: (500, 8)

First 5 rows:
   weight_grams  carb_density  protein_density  fat_density  water_content  \
0           152      0.127184         0.098691     0.488959       0.890620   
1           485      0.312286         0.085278     0.108421       0.454618   
2           398      0.475958         0.139605     0.349665       0.138265   
3           320      0.328656         0.304133     0.412506       0.130417   
4           156      0.679508         0.257537     0.030180       0.696966   

   fiber_content  food_type     calories  
0       0.147544          1   787.903419  
1       0.181352          0  1227.895125  
2       0.165732          2  2231.133086  
3       0.257699          0  1983.672291  
4       0.024354          0   640.729918  

Average calories: 1386.8

METHOD 1: LINEAR REGRESSION
R² Score: 0.939
MSE: 37843.4

Feature Importance (Linear Regression):
           Feature  Coefficient
3      fat_density  2529.856028
2  protein_den

 Project 2: Food Classification using Decision Trees
Classify foods into categories based on nutrition

In [3]:
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import accuracy_score, classification_report

# Create food classification dataset
def create_food_classification_data():
    np.random.seed(42)
    n_samples = 300
    
    data = []
    for i in range(n_samples):
        food_type = np.random.choice(['fruit', 'vegetable', 'protein', 'grain'])
        
        if food_type == 'fruit':
            carbs = np.random.uniform(0.4, 0.8)
            protein = np.random.uniform(0.01, 0.1)
            fat = np.random.uniform(0.01, 0.2)
            fiber = np.random.uniform(0.1, 0.3)
        elif food_type == 'vegetable':
            carbs = np.random.uniform(0.2, 0.6)
            protein = np.random.uniform(0.05, 0.3)
            fat = np.random.uniform(0.01, 0.1)
            fiber = np.random.uniform(0.2, 0.5)
        elif food_type == 'protein':
            carbs = np.random.uniform(0.01, 0.2)
            protein = np.random.uniform(0.4, 0.8)
            fat = np.random.uniform(0.1, 0.5)
            fiber = np.random.uniform(0.01, 0.1)
        else:  # grain
            carbs = np.random.uniform(0.6, 0.8)
            protein = np.random.uniform(0.1, 0.2)
            fat = np.random.uniform(0.01, 0.1)
            fiber = np.random.uniform(0.05, 0.2)
        
        data.append({
            'carb_density': carbs,
            'protein_density': protein, 
            'fat_density': fat,
            'fiber_density': fiber,
            'food_class': food_type
        })
    
    return pd.DataFrame(data)

# Create dataset
food_class_data = create_food_classification_data()
print("=== FOOD CLASSIFICATION DATASET ===")
print(food_class_data.head())
print(f"\nClass distribution:")
print(food_class_data['food_class'].value_counts())

# Prepare data
X_class = food_class_data.drop('food_class', axis=1)
y_class = food_class_data['food_class']
feature_names_class = X_class.columns.tolist()

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42, stratify=y_class
)

# Decision Tree Classifier
print("\n" + "="*50)
print("DECISION TREE CLASSIFICATION")
print("="*50)

dt_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    random_state=42
)

dt_model.fit(X_train_c, y_train_c)

# Evaluate
y_pred_dt = dt_model.predict(X_test_c)
accuracy = accuracy_score(y_test_c, y_pred_dt)

print(f"Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(y_test_c, y_pred_dt))

# Display the tree rules
print("\n" + "="*50)
print("DECISION TREE RULES")
print("="*50)

tree_rules = export_text(dt_model, feature_names=feature_names_class)
print("How the tree classifies foods:")
print(tree_rules)

# Feature importance
print("\nFeature Importance:")
dt_importance = pd.DataFrame({
    'Feature': feature_names_class,
    'Importance': dt_model.feature_importances_
}).sort_values('Importance', ascending=False)

print(dt_importance)

# Test classification on new foods
print("\n" + "="*50)
print("CLASSIFYING NEW FOODS")
print("="*50)

test_foods = [
    [0.7, 0.05, 0.1, 0.2],   # High carb, low protein → probably fruit
    [0.2, 0.6, 0.1, 0.05],   # High protein → probably protein
    [0.3, 0.2, 0.05, 0.4],   # High fiber → probably vegetable
    [0.65, 0.15, 0.05, 0.1]  # High carb, medium protein → probably grain
]

food_descriptions = ["Sweet fruit-like", "Protein-rich", "Fibrous vegetable", "Grain-like"]

for i, food in enumerate(test_foods):
    prediction = dt_model.predict([food])[0]
    probability = np.max(dt_model.predict_proba([food]))
    
    print(f"{food_descriptions[i]}: {prediction} (confidence: {probability:.2f})")

# Nutrition insights from the tree
print("\n" + "="*50)
print("NUTRITION PATTERNS LEARNED")
print("="*50)

print("The decision tree learned these nutrition patterns:")
print(" High protein + low carbs = Protein foods")
print(" High carbs + low protein = Fruits/Grains") 
print(" High fiber + medium protein = Vegetables")
print(" Carbs vs Protein ratio determines food category")

=== FOOD CLASSIFICATION DATASET ===
   carb_density  protein_density  fat_density  fiber_density food_class
0      0.161343         0.473374     0.411876       0.063717    protein
1      0.262398         0.064521     0.087956       0.380335  vegetable
2      0.730178         0.105641     0.074980       0.190783      grain
3      0.272730         0.095851     0.037382       0.357427  vegetable
4      0.604612         0.152477     0.045987       0.057000      grain

Class distribution:
food_class
vegetable    86
fruit        75
protein      74
grain        65
Name: count, dtype: int64

DECISION TREE CLASSIFICATION
Accuracy: 0.983

Classification Report:
              precision    recall  f1-score   support

       fruit       1.00      1.00      1.00        15
       grain       0.93      1.00      0.96        13
     protein       1.00      1.00      1.00        15
   vegetable       1.00      0.94      0.97        17

    accuracy                           0.98        60
   macro avg  



In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Create freshness dataset
def create_freshness_data():
    np.random.seed(42)
    n_samples = 400
    
    data = []
    for i in range(n_samples):
        # Simulate food characteristics that affect freshness
        days_old = np.random.randint(1, 14)
        temperature = np.random.uniform(1, 25)  # Storage temperature
        humidity = np.random.uniform(30, 95)    # Humidity level
        packaging = np.random.choice([0, 1])    # 0=unpackaged, 1=packaged
        
        # Calculate freshness probability
        freshness_prob = (
            0.6 - (days_old * 0.05) +           # Older = less fresh
            (25 - temperature) * 0.02 +         # Cooler = more fresh  
            (1 - humidity/100) * 0.1 +          # Lower humidity = more fresh
            packaging * 0.15                    # Packaged = more fresh
        )
        
        # Add some randomness and ensure valid probability
        freshness_prob += np.random.normal(0, 0.1)
        freshness_prob = max(0.1, min(0.9, freshness_prob))
        
        # Determine if fresh (1) or spoiled (0)
        is_fresh = 1 if freshness_prob > 0.5 else 0
        
        data.append({
            'days_old': days_old,
            'temperature': temperature,
            'humidity': humidity,
            'packaging': packaging,
            'is_fresh': is_fresh
        })
    
    return pd.DataFrame(data)

# Create dataset
freshness_data = create_freshness_data()
print("=== FOOD FRESHNESS DATASET ===")
print(freshness_data.head())
print(f"\nFresh vs Spoiled distribution:")
print(freshness_data['is_fresh'].value_counts())
print(f"Fresh rate: {freshness_data['is_fresh'].mean():.2f}")

# Prepare data
X_fresh = freshness_data.drop('is_fresh', axis=1)
y_fresh = freshness_data['is_fresh']
feature_names_fresh = X_fresh.columns.tolist()

X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
    X_fresh, y_fresh, test_size=0.2, random_state=42, stratify=y_fresh
)

# Compare multiple models
print("\n" + "="*50)
print("COMPARING FRESHNESS PREDICTION MODELS")
print("="*50)

models = {
    'Decision Tree': DecisionTreeClassifier(max_depth=4, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42)
}

results = []

for name, model in models.items():
    model.fit(X_train_f, y_train_f)
    y_pred = model.predict(X_test_f)
    accuracy = accuracy_score(y_test_f, y_pred)
    
    results.append({
        'Model': name,
        'Accuracy': accuracy
    })
    
    print(f"{name}: {accuracy:.3f}")

results_df = pd.DataFrame(results)
print(f"\nBest model: {results_df.loc[results_df['Accuracy'].idxmax(), 'Model']}")

# Use the best model (Random Forest typically)
best_model = RandomForestClassifier(n_estimators=100, random_state=42)
best_model.fit(X_train_f, y_train_f)

print("\n" + "="*50)
print("FRESHNESS PREDICTION INSIGHTS")
print("="*50)

# Feature importance
freshness_importance = pd.DataFrame({
    'Factor': feature_names_fresh,
    'Importance': best_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("What affects food freshness most?")
print(freshness_importance)

# Make predictions
print("\n" + "="*50)
print("PRACTICAL FRESHNESS PREDICTIONS")
print("="*50)

test_scenarios = [
    [2, 4, 40, 1],   # Fresh: 2 days old, refrigerated, low humidity, packaged
    [7, 20, 80, 0],  # Risky: 1 week, room temp, high humidity, unpackaged
    [10, 25, 90, 0], # Spoiled: old, hot, humid, unpackaged
    [1, 2, 30, 1]    # Very fresh: new, cold, dry, packaged
]

scenario_descs = [
    "New packaged food in fridge",
    "Week-old unpackaged at room temperature", 
    "Old food in hot humid conditions",
    "Very fresh ideal conditions"
]

for i, scenario in enumerate(test_scenarios):
    prediction = best_model.predict([scenario])[0]
    probability = best_model.predict_proba([scenario])[0][1]  # Probability of being fresh
    
    status = " FRESH" if prediction == 1 else " SPOILED"
    
    print(f"{scenario_descs[i]}:")
    print(f"  Prediction: {status}")
    print(f"  Freshness probability: {probability:.1%}")
    print()

# SDG Impact Analysis
print("\n" + "="*50)
print("SDG IMPACT: REDUCING FOOD WASTE")
print("="*50)

print("""
By accurately predicting food freshness, we can:

SDG 2: ZERO HUNGER
Reduce food waste by 20-30%
Extend shelf life through better storage
Improve food distribution efficiency

SDG 12: RESPONSIBLE CONSUMPTION
 Prevent premature disposal of edible food
 Optimize inventory management
 Promote sustainable food practices

Estimated impact: A 25% reduction in household food waste
could feed millions more people while reducing environmental impact.
""")

=== FOOD FRESHNESS DATASET ===
   days_old  temperature   humidity  packaging  is_fresh
0         7    20.117032  41.923261          1         0
1         3     3.399398  59.851178          0         1
2         4     4.428804  72.307751          0         1
3         5    15.819556  69.757455          0         1
4        12     1.553498  64.110353          0         0

Fresh vs Spoiled distribution:
is_fresh
1    266
0    134
Name: count, dtype: int64
Fresh rate: 0.67

COMPARING FRESHNESS PREDICTION MODELS
Decision Tree: 0.825
Random Forest: 0.800
Logistic Regression: 0.875

Best model: Logistic Regression

FRESHNESS PREDICTION INSIGHTS
What affects food freshness most?
        Factor  Importance
0     days_old    0.448190
1  temperature    0.349613
2     humidity    0.173860
3    packaging    0.028337

PRACTICAL FRESHNESS PREDICTIONS
New packaged food in fridge:
  Prediction:  FRESH
  Freshness probability: 100.0%

Week-old unpackaged at room temperature:
  Prediction:  SPOILED
  Fr

