In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from feature_engine.outliers import OutlierTrimmer

In [2]:
df = pd.read_excel('data.xlsx')
df

Unnamed: 0,CGM_Data,avgCGM,carbs,calories,fiber,fat,protein,Hour
0,118.8,146.25,14.83,79.80,0.81,1.00,2.66,18
1,154.8,150.30,31.90,520.68,7.73,28.68,37.52,18
2,154.8,225.90,77.15,581.48,7.74,19.97,24.31,8
3,109.8,145.35,46.56,456.75,7.36,20.04,25.46,12
4,156.6,171.90,58.57,496.23,6.01,13.53,32.29,16
...,...,...,...,...,...,...,...,...
944,144.0,151.65,20.92,123.30,2.21,1.65,6.60,15
945,165.6,167.85,68.32,587.59,15.25,19.27,36.81,17
946,124.2,158.85,98.60,655.59,6.72,15.23,29.19,7
947,122.4,134.55,15.69,65.27,3.21,0.56,1.22,9


- Define independant and dependant features
- Split training and testing data

In [3]:
x = df[['carbs', 'calories', 'fiber', 'fat', 'protein', 'CGM_Data', 'Hour']]
y = df['avgCGM']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

Scale the features using standard scaling

In [4]:
# scaler = StandardScaler()
# x_train = scaler.fit_transform(x_train)
# x_test = scaler.transform(x_test)

In [26]:
# df = df.dropna()

In [27]:
# df.to_excel('data.xlsx', index=False)

In [4]:
regression = LinearRegression()

In [5]:
regression.fit(x_train, y_train)

In [6]:
y_pred = regression.predict(x_test)

In [7]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r_squared = regression.score(x_test, y_test)

In [8]:
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
print('R-squared:', r_squared)

Mean Squared Error: 1511.9340378022098
Root Mean Squared Error: 38.883595998855476
R-squared: 0.39700563843758563


## Identify and remove any outliers in the data

In [13]:
# residuals = y_train - regression.predict(x_train)

# mean_residuals = np.mean(residuals)
# std_residuals = np.std(residuals)

# outliers = np.abs(residuals - mean_residuals) > 2 * std_residuals

# x_train = x_train[~outliers]
# y_train = y_train[~outliers]

In [14]:
# regression.fit(x_train, y_train)
# y_pred = regression.predict(x_test)

In [None]:
trimmer = OutlierTrimmer(capping_method="iqr", tail="both", fold=1.5, variables=["my_variable"], missing_values="raise", outlier_percentage=0.05)

In [15]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r_squared = regression.score(x_test, y_test)

In [16]:
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
print('R-squared:', r_squared)

Mean Squared Error: 1550.4965743936311
Root Mean Squared Error: 39.37634536614122
R-squared: 0.38162600443849093


## Predicting the CGM value using an array input

In [22]:
input_row = [[21, 477, 4.7, 26, 40, 163.8, 17]]

In [23]:
predicted_avgCGM = regression.predict(input_row)



In [24]:
print("Predicted avgCGM:", predicted_avgCGM)

Predicted avgCGM: [171.86063513]


## Export the model to a file

In [28]:
joblib.dump(regression, 'linear_regression_model.joblib')

['linear_regression_model.joblib']

In [17]:
! pip3 install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.5.2-py2.py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.0/290.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting statsmodels>=0.11.1
  Downloading statsmodels-0.13.5-cp310-cp310-macosx_11_0_arm64.whl (9.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting patsy>=0.5.2
  Downloading patsy-0.5.3-py2.py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.8/233.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: patsy, statsmodels, feature-engine
Successfully installed feature-engine-1.5.2 patsy-0.5.3 statsmodels-0.13.5
