In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [None]:
# Read the water-intake records into a DataFrame and preview the first rows.
# NOTE(review): the path looks like a Colab Google Drive mount — it must be
# mounted before this cell runs; confirm for other environments.
df = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/water_intake.csv")
df.head(n=5)

Unnamed: 0,Age,Gender,Weight_kg,Height_cm,Temperature_C,Humidity,Activity_Level,Exercise_Duration_min,Health_Condition,Water_Intake_L
0,56,0,69,151,30,69,Active,30,Healthy,3.0
1,69,0,118,192,31,83,Active,78,Healthy,4.0
2,46,1,90,150,33,34,Active,57,Chronic Issues,5.0
3,32,1,102,200,32,49,Sedentary,29,Healthy,3.0
4,60,0,91,185,25,69,Active,110,Healthy,5.0


In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    500 non-null    int64  
 1   Gender                 500 non-null    int64  
 2   Weight_kg              500 non-null    int64  
 3   Height_cm              500 non-null    int64  
 4   Temperature_C          500 non-null    int64  
 5   Humidity               500 non-null    int64  
 6   Activity_Level         500 non-null    object 
 7   Exercise_Duration_min  500 non-null    int64  
 8   Health_Condition       500 non-null    object 
 9   Water_Intake_L         500 non-null    float64
dtypes: float64(1), int64(7), object(2)
memory usage: 39.2+ KB
None


In [None]:
# Body-mass index: weight (kg) divided by squared height, with Height_cm
# divided by 100 first to convert centimetres to metres.
df["BMI"] = df["Weight_kg"].div(df["Height_cm"].div(100).pow(2))

# Interaction terms: element-wise products of two existing columns.
df["BMI_Temp"] = df["BMI"].mul(df["Temperature_C"])
df["Exercise_Humidity"] = df["Exercise_Duration_min"].mul(df["Humidity"])


In [None]:
# Columns imputed with the median (raw numeric columns plus the engineered ones).
num_cols = [
    "Age",
    "Weight_kg",
    "Height_cm",
    "Temperature_C",
    "Humidity",
    "Exercise_Duration_min",
    "BMI",
    "BMI_Temp",
    "Exercise_Humidity",
]
# Columns imputed with their most frequent value; these are one-hot encoded later.
cat_cols = ["Gender", "Activity_Level", "Health_Condition"]

# NOTE(review): both imputers are fit on the full frame, i.e. before the
# train/test split performed further down — confirm this is acceptable.
median_imputer = SimpleImputer(strategy="median")
df[num_cols] = median_imputer.fit_transform(df[num_cols])

mode_imputer = SimpleImputer(strategy="most_frequent")
df[cat_cols] = mode_imputer.fit_transform(df[cat_cols])


In [None]:
# Show the column names as a plain Python list.
print(list(df.columns))


['Age', 'Gender', 'Weight_kg', 'Height_cm', 'Temperature_C', 'Humidity', 'Activity_Level', 'Exercise_Duration_min', 'Health_Condition', 'Water_Intake_L', 'BMI', 'BMI_Temp', 'Exercise_Humidity']


In [None]:
# One-hot encode the categorical columns; drop_first=True keeps k-1 indicator
# columns for a variable with k levels (the first level is dropped).
df = pd.get_dummies(data=df, columns=cat_cols, drop_first=True)


In [None]:
# Predictor columns used by the model. "Activity_Level_Very Active" is
# presumably one of the indicator columns created by the one-hot encoding step.
selected_features = [
    "Exercise_Humidity", "Age", "Exercise_Duration_min",
    "Humidity", "Activity_Level_Very Active",
]

# Feature matrix and regression target (daily water intake, per the column name).
X = df.loc[:, selected_features]
y = df.loc[:, "Water_Intake_L"]


In [None]:
# Split BEFORE scaling: fitting the scaler on every row would let the test
# set's mean/variance leak into the training features. The same random_state
# and sample count give the same train/test row assignment as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix under the train-fitted scaling, kept so any cell that still
# refers to X_scaled continues to work.
X_scaled = scaler.transform(X)


In [None]:
# Candidate SVR hyper-parameters: 3 kernels x 3 values of C x 3 values of epsilon.
param_grid = dict(
    kernel=["rbf", "linear", "poly"],
    C=[0.1, 1, 10],
    epsilon=[0.1, 0.5, 1],
)

# Exhaustive search over the grid with 5-fold cross-validation, scored by R².
grid = GridSearchCV(estimator=SVR(), param_grid=param_grid, scoring="r2", cv=5)
grid.fit(X_train, y_train)

# Cross-validated score of the best combination and the combination itself.
print("Best R²:", grid.best_score_)
print("Best Params:", grid.best_params_)


Best R²: -0.11366108769725294
Best Params: {'C': 1, 'epsilon': 1, 'kernel': 'rbf'}


In [None]:
# Score the best estimator found by the grid search on the held-out split.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

# Error metrics: RMSE is the square root of the mean squared error.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# One metric per line, rounded to two decimals.
print(
    f"RMSE: {rmse:.2f}",
    f"MAE: {mae:.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)

RMSE: 4.39
MAE: 2.88
R² Score: -0.17


In [None]:
import joblib

# Persist the tuned SVR.
joblib.dump(best_model, "svr_model.pkl")
# The SVR was trained on StandardScaler-transformed features, so the fitted
# scaler is saved next to it; without it the pickled model cannot be applied
# to new, unscaled inputs.
joblib.dump(scaler, "svr_scaler.pkl")

# Colab-only helper; imported after the dumps so the artifacts are still
# written to disk when this import is unavailable.
from google.colab import files

# Trigger a browser download for each saved artifact.
for artifact in ("svr_model.pkl", "svr_scaler.pkl"):
    files.download(artifact)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>