In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


In [3]:
# Read the water-intake dataset from the CSV sitting next to the notebook.
df = pd.read_csv(filepath_or_buffer="correlated_water_intake.csv")
# Preview the first five rows (the default for head) as the cell output.
df.head(n=5)


Unnamed: 0,Age,Gender,Weight_kg,Height_cm,Temperature_C,Humidity,Activity_Level,Exercise_Duration_min,Health_Condition,Water_Intake_L
0,56,0,69,151,30,69,Active,30,Healthy,2.21
1,69,0,118,192,31,83,Active,78,Healthy,3.18
2,46,1,90,150,33,34,Active,57,Chronic Issues,2.48
3,32,1,102,200,32,49,Sedentary,29,Healthy,2.39
4,60,0,91,185,25,69,Active,110,Healthy,3.29


In [5]:
# Report the number of missing values per column before modelling.
print(df.isna().sum())

# Target is the water-intake column; every remaining column is a feature.
y = df["Water_Intake_L"]
X = df.drop(columns=["Water_Intake_L"])


Age                      0
Gender                   0
Weight_kg                0
Height_cm                0
Temperature_C            0
Humidity                 0
Activity_Level           0
Exercise_Duration_min    0
Health_Condition         0
Water_Intake_L           0
dtype: int64


In [7]:
# Columns holding string labels; these are one-hot encoded later on.
categorical_cols = ["Activity_Level", "Health_Condition"]
# Every other feature column, kept in the frame's original column order.
numerical_cols = X.columns[~X.columns.isin(categorical_cols)].tolist()


In [9]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones, dropping the first level of each categorical feature.
# NOTE(review): combining drop='first' with handle_unknown='ignore' appears to
# make an unseen category indistinguishable from the dropped baseline level --
# confirm against the installed scikit-learn version that this is intended.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
)

# Each entry is (step name, transformer, columns it is applied to).
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])


In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Chain the preprocessing step and the linear regressor into one estimator.
lr_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit on the training split; being the cell's last expression, the returned
# pipeline is also what the cell displays.
lr_model.fit(X_train, y_train)


In [13]:
# Predict on the test split and score the predictions against the true targets.
y_pred = lr_model.predict(X_test)

r2 = r2_score(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

# One metric per line, each rounded to three decimals.
print(
    f"R² Score: {r2:.3f}",
    f"MAE: {mae:.3f}",
    f"RMSE: {rmse:.3f}",
    sep="\n",
)


R² Score: 0.661
MAE: 0.603
RMSE: 0.894


In [15]:
import joblib

# Persist the fitted pipeline (preprocessing + regressor) to disk; the call's
# return value is shown as the cell output.
joblib.dump(value=lr_model, filename="lr_intake.pkl")


['lr_intake.pkl']

In [19]:
from IPython.display import FileLink

# Render a link to the saved model file as the cell output.
FileLink(path='lr_intake.pkl')


In [17]:
# Single-row example: one record carrying the same feature columns the model
# was fitted on.
sample = pd.DataFrame([
    {
        'Age': 28,
        'Gender': 0,
        'Weight_kg': 70,
        'Height_cm': 175,
        'Temperature_C': 33,
        'Humidity': 60,
        'Activity_Level': 'Active',
        'Exercise_Duration_min': 60,
        'Health_Condition': 'Healthy',
    }
])

# predict returns one value per input row; report the only one, to two decimals.
predicted_intake = lr_model.predict(sample)
print(f"Predicted Daily Water Intake: {predicted_intake[0]:.2f} L")


Predicted Daily Water Intake: 2.63 L
