In [None]:
import pandas as pd
import datetime
import numpy as np
import seaborn as sns
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import warnings
# Suppresses every warning for the rest of the session, including deprecation
# notices from pandas / scikit-learn.
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Display settings: no row/column truncation and up to 800 characters per cell.
pd.set_option('display.max_rows',None,'display.max_columns',None,'display.max_colwidth',800)

## Upload Data

In [None]:
# Load the raw activity table (presumably a Garmin activity export, judging by
# the file name) and print its column/dtype overview.
# NOTE(review): hard-coded absolute path; confirm it exists in the target environment.
pdfile = '/content/garmin.csv'
df = pd.read_csv(pdfile)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Activity Type             60 non-null     object 
 1   Date                      60 non-null     object 
 2   Favorite                  60 non-null     bool   
 3   Title                     60 non-null     object 
 4   Distance                  60 non-null     float64
 5   Calories                  60 non-null     object 
 6   Time                      60 non-null     object 
 7   Avg HR                    60 non-null     int64  
 8   Max HR                    60 non-null     int64  
 9   Aerobic TE                60 non-null     float64
 10  Avg Run Cadence           60 non-null     int64  
 11  Max Run Cadence           60 non-null     int64  
 12  Avg Pace                  60 non-null     object 
 13  Best Pace                 60 non-null     object 
 14  Total Ascent

In [None]:
# Convert the 'Date' column to datetime. Because of errors='coerce', any value
# that does not match the format becomes NaT instead of raising.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y %H:%M', errors='coerce')

# NaT fails both date comparisons below, so unparseable rows would disappear
# from the result without any error. Report them so a format mismatch is visible.
unparsed_dates = int(df['Date'].isna().sum())
if unparsed_dates:
    print(f"Warning: {unparsed_dates} of {len(df)} 'Date' values could not be parsed "
          "with format '%d/%m/%Y %H:%M' and will be excluded by the date filter.")

# Analysis window: the 6 months before end_date (start inclusive, end exclusive).
end_date = pd.to_datetime('2025-03-16')
start_date = end_date - pd.DateOffset(months=6)

# Keep only running / treadmill activities that fall inside the window.
filtered_df = df[
    (df['Activity Type'].isin(['Running', 'Treadmill'])) &
    ((df['Date'] >= start_date) & (df['Date'] < end_date))
]

print(filtered_df.head())
filtered_df.info()

Empty DataFrame
Columns: [Activity Type, Date, Favorite, Title, Distance, Calories, Time, Avg HR, Max HR, Aerobic TE, Avg Run Cadence, Max Run Cadence, Avg Pace, Best Pace, Total Ascent, Total Descent, Avg Stride Length, Avg Vertical Ratio, Avg Vertical Oscillation, Avg Ground Contact Time, Avg GAP, Normalized Power® (NP®), Training Stress Score®, Avg Power, Max Power, Steps, Min Temp, Decompression, Best Lap Time, Number of Laps, Max Temp, Moving Time, Elapsed Time, Min Elevation, Max Elevation]
Index: []
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Activity Type             0 non-null      object        
 1   Date                      0 non-null      datetime64[ns]
 2   Favorite                  0 non-null      bool          
 3   Title                     0 non-null      object        
 4   Distance               

In [None]:
# Summary statistics for every column (numeric and non-numeric) of the filtered activities.
print(filtered_df.describe(include='all'))

       Activity Type                           Date Favorite  \
count            107                            107      107   
unique             1                            NaN        1   
top          Running                            NaN    False   
freq             107                            NaN      107   
mean             NaN  2024-12-14 06:24:22.990654208      NaN   
min              NaN            2024-09-16 06:04:00      NaN   
25%              NaN            2024-10-28 12:33:00      NaN   
50%              NaN            2024-12-18 06:27:00      NaN   
75%              NaN            2025-01-24 12:20:00      NaN   
max              NaN            2025-03-14 08:34:00      NaN   
std              NaN                            NaN      NaN   

                    Title Distance Calories      Time      Avg HR      Max HR  \
count                 107      107      107       107  107.000000  107.000000   
unique                  8       87      103       106         NaN    

In [None]:
# Inspect the distinct values of the two pace columns before converting them.
print(filtered_df['Avg Pace'].unique())
print(filtered_df['Best Pace'].unique())

['05:15' '05:42' '05:04' '05:31' '04:58' '05:14' '05:28' '04:40' '05:33'
 '04:37' '04:57' '05:56' '05:06' '04:59' '05:54' '05:00' '05:13' '05:19'
 '05:02' '06:05' '04:55' '05:18' '05:08' '05:47' '05:21' '05:50' '04:51'
 '05:46' '05:27' '05:25' '06:07' '05:57' '05:39' '05:24' '06:01' '06:15'
 '04:24' '05:30' '05:48' '06:12' '05:59' '05:16' '05:43' '04:42' '05:53'
 '05:41' '05:26' '06:14' '06:22' '06:33' '06:06' '06:46' '06:50' '05:51'
 '06:30' '05:29' '07:02' '05:10' '05:55' '05:12' '06:00' '06:24' '05:36'
 '05:44' '06:47' '06:32' '08:45' '06:45' '06:10' '05:45' '04:10' '03:56'
 '03:53' '06:25' '06:26' '07:19' '06:36' '03:31' '06:29']
['04:20' '04:02' '03:44' '04:11' '04:09' '04:19' '04:10' '03:53' '03:30'
 '03:29' '04:00' '04:15' '04:29' '04:47' '04:13' '04:04' '03:38' '04:49'
 '03:56' '03:45' '03:50' '04:35' '03:52' '03:16' '04:06' '02:30' '05:05'
 '04:01' '02:39' '05:19' '04:08' '03:04' '03:55' '02:50' '03:49' '02:20'
 '04:33' '05:16' '03:09' '03:28' '05:08' '04:38' '03:32' '03:51' '

In [None]:
# Rows whose average or best pace is the '--' placeholder cannot be converted
# to a speed, so remove them and renumber the remaining rows from 0.
has_missing_pace = filtered_df['Avg Pace'].eq('--') | filtered_df['Best Pace'].eq('--')
rows_to_drop = filtered_df.index[has_missing_pace]

filtered_df = filtered_df.drop(rows_to_drop).reset_index(drop=True)

def speed_km(pace_str):
  """Convert a Series of "mm:ss" pace strings into speeds.

  Args:
    pace_str: pandas Series of strings (despite the name, not a single
      string; the `.str` accessor is used). Each value is split on ':' and
      the first two fields are read as minutes and seconds.
      Presumably the pace is minutes per kilometre -- confirm with the data source.

  Returns:
    Series of floats equal to 60 divided by the pace expressed in minutes
    (km/h if the pace is min/km).
  """
  components = pace_str.str.split(':', expand=True).astype(float)
  whole_minutes = components[0]
  fractional_minutes = components[1] / 60
  return 60 / (whole_minutes + fractional_minutes)

# Derive speed columns from the pace strings, rounded to two decimals.
filtered_df['Average Speed'] = speed_km(filtered_df['Avg Pace']).round(2)
filtered_df['Max Speed'] = speed_km(filtered_df['Best Pace']).round(2)
filtered_df[['Avg Pace', 'Average Speed']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Avg Pace       107 non-null    object 
 1   Average Speed  107 non-null    float64
dtypes: float64(1), object(1)
memory usage: 1.8+ KB


## Feature selection

In [None]:
# Columns kept for the rest of the analysis: the activity date, the raw pace
# strings, the derived speeds, and the heart-rate, training-effect, cadence,
# elevation, duration and distance fields.
feature_selected = ['Date',
                    'Avg Pace',
                    'Best Pace',
                    'Average Speed',
                    'Max Speed',
                    'Avg HR',
                    'Max HR',
                    'Aerobic TE',
                    'Avg Run Cadence',
                    'Total Ascent',
                    'Total Descent',
                    'Moving Time',
                    'Elapsed Time',
                    'Distance']

# Restrict the frame to those columns (in this order) and renumber the rows.
filtered_df = filtered_df[feature_selected].reset_index(drop = True)
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Date             107 non-null    datetime64[ns]
 1   Avg Pace         107 non-null    object        
 2   Best Pace        107 non-null    object        
 3   Average Speed    107 non-null    float64       
 4   Max Speed        107 non-null    float64       
 5   Avg HR           107 non-null    int64         
 6   Max HR           107 non-null    int64         
 7   Aerobic TE       107 non-null    float64       
 8   Avg Run Cadence  107 non-null    object        
 9   Total Ascent     107 non-null    object        
 10  Total Descent    107 non-null    object        
 11  Moving Time      107 non-null    object        
 12  Elapsed Time     107 non-null    object        
 13  Distance         107 non-null    object        
dtypes: datetime64[ns](1), float64(3), int64(2)

In [None]:
# Replace the '--' placeholder (presumably the export's marker for a missing
# value) with 0 and store each column as float.
for placeholder_column in ('Total Descent', 'Total Ascent', 'Aerobic TE'):
    without_placeholder = filtered_df[placeholder_column].replace('--', 0)
    filtered_df[placeholder_column] = without_placeholder.astype(float)

In [None]:
def parse_time_to_seconds(time_str):
  """Convert a "hh:mm:ss" or "mm:ss[.fraction]" string to whole seconds.

  Args:
    time_str: value to convert. Anything that is not a ``str`` yields ``None``.

  Returns:
    The duration as an ``int`` number of seconds (any fractional part is
    truncated by ``int``), or ``None`` when the input is not a string, does
    not consist of two or three ':'-separated fields, or a field cannot be
    parsed as a number.
  """
  if not isinstance(time_str, str):
    return None

  fields = time_str.split(':')
  try:
    if len(fields) == 3:
      # hh:mm:ss
      hh, mm, ss = (float(field) for field in fields)
      return int(hh * 3600 + mm * 60 + ss)
    if len(fields) == 2:
      # mm:ss, where the seconds field may carry a decimal fraction
      mm_text, ss_text = fields
      if '.' in ss_text:
        pieces = ss_text.split('.')
        ss = float(pieces[0]) + float('0.' + pieces[1])
      else:
        ss = float(ss_text)
      mm = float(mm_text)
      return int(mm * 60 + ss)
  except ValueError:
    # A field was not numeric (e.g. a placeholder value).
    return None
  return None

In [None]:
# Turn both duration columns from time strings into whole seconds, stored with
# the nullable Int64 dtype so entries the parser returned None for stay missing.
for duration_column in ('Elapsed Time', 'Moving Time'):
    parsed_seconds = filtered_df[duration_column].apply(parse_time_to_seconds)
    filtered_df[duration_column] = parsed_seconds.astype('Int64')

In [None]:
# Coerce every column used downstream to a numeric dtype; values that cannot
# be parsed become NaN (errors='coerce').
numeric_columns = [
    'Average Speed', 'Max Speed', 'Avg HR', 'Max HR', 'Avg Run Cadence',
    'Total Ascent', 'Total Descent', 'Moving Time', 'Elapsed Time',
    'Distance', 'Aerobic TE',
]
for numeric_column in numeric_columns:
    filtered_df[numeric_column] = pd.to_numeric(filtered_df[numeric_column], errors='coerce')

In [None]:
def filter_feature_aerobic(filtered_df):
    """Map a row's 'Aerobic TE' score to a training-effect category.

    Args:
        filtered_df: a single row, i.e. anything indexable by 'Aerobic TE'
            (such as the row passed by ``DataFrame.apply(..., axis=1)``).
            Despite the name, this is not a whole frame.

    Returns:
        'Low Aerobic' for scores below 4, 'High Aerobic' for scores from 4 up
        to (but not including) 5, 'Anaerobic' for exactly 5, and ``None`` for
        anything else (scores above 5 or NaN).
    """
    aerobic_te = filtered_df['Aerobic TE']
    # Thresholds are contiguous: a score such as 3.95 must not fall into a gap
    # between "<= 3.9" and ">= 4".
    if aerobic_te < 4:
        return 'Low Aerobic'
    if aerobic_te < 5:
        return 'High Aerobic'
    if aerobic_te == 5:
        return 'Anaerobic'
    return None

In [None]:
# Work on a copy so filtered_df keeps its numeric 'Aerobic TE' scores, then
# replace the score in the copy with its category label, row by row.
smy_new = filtered_df.copy()
aerobic_labels = smy_new.apply(filter_feature_aerobic, axis=1)
smy_new['Aerobic TE'] = aerobic_labels
smy_new.head()

Unnamed: 0,Date,Avg Pace,Best Pace,Average Speed,Max Speed,Avg HR,Max HR,Aerobic TE,Avg Run Cadence,Total Ascent,Total Descent,Moving Time,Elapsed Time,Distance
0,2025-02-15 05:27:00,05:15,04:20,11.43,13.85,168,189,Anaerobic,176,78.0,120.0,10695,11015,34.33
1,2025-02-08 05:38:00,05:42,04:02,10.53,14.88,158,179,High Aerobic,174,38.0,84.0,10652,10853,31.56
2,2025-02-01 05:19:00,05:04,03:44,11.84,16.07,165,184,Anaerobic,177,81.0,75.0,9397,10306,31.08
3,2025-01-25 05:11:00,05:31,04:11,10.88,14.34,163,181,Anaerobic,176,38.0,82.0,10202,10275,31.01
4,2025-02-22 05:33:00,04:58,04:09,12.08,14.46,167,183,Anaerobic,181,116.0,112.0,8994,9000,30.25


In [None]:
# Collapse all activities into one summary record: one aggregate per column,
# returned as a Series indexed by column name.
aggregated_data = filtered_df.agg({
    'Date' : 'count',  # number of activities (non-null dates)
    'Average Speed' : 'mean',  # mean of the per-activity average speed
    'Max Speed': 'max',   # highest per-activity max speed
    'Avg HR': 'mean',    # mean of the per-activity average heart rate
    'Max HR': 'max',     # highest heart rate
    'Avg Run Cadence': 'mean',  # mean of the per-activity average cadence
    'Total Ascent': 'sum',   # total ascent
    'Total Descent': 'sum',   # total descent
    'Moving Time': 'sum',    # total moving time
    'Elapsed Time': 'sum',    # total elapsed time
    'Distance': 'sum',     # total distance
   })

# Share of activities per aerobic category, taken from the labelled copy.
# NOTE(review): value_counts() skips missing labels by default, so the
# percentages are relative to labelled activities only -- confirm this is intended.
aerobic_counts = smy_new['Aerobic TE'].value_counts()
total_activities = aerobic_counts.sum()  # number of activities that have a category label

aggregated_data['Low Aerobic (%)'] = (aerobic_counts.get('Low Aerobic', 0) / total_activities) * 100
aggregated_data['High Aerobic (%)'] = (aerobic_counts.get('High Aerobic', 0) / total_activities) * 100
aggregated_data['Anaerobic (%)'] = (aerobic_counts.get('Anaerobic', 0) / total_activities) * 100

# Round while still a Series, then turn it into a one-row DataFrame.
aggregated_data = aggregated_data.round(2)
aggregated_data = pd.DataFrame(aggregated_data).T
# Longest single activity; added after rounding, so it is not rounded here.
aggregated_data['Max Distance'] = filtered_df['Distance'].max()
aggregated_data

Unnamed: 0,Date,Average Speed,Max Speed,Avg HR,Max HR,Avg Run Cadence,Total Ascent,Total Descent,Moving Time,Elapsed Time,Distance,Low Aerobic (%),High Aerobic (%),Anaerobic (%),Max Distance
0,107.0,10.87,25.71,158.9,201.0,174.01,2829.0,3438.0,405800.0,413571.0,1239.29,63.55,24.3,12.15,34.33


## Masuk data baru setelah proses agregat

In [None]:
# Load the aggregated table used for modelling and print its column/dtype overview.
# NOTE(review): this reuses the name `pdfile` from the raw-data cell and reads a
# separate file; how that file was produced is not shown here.
pdfile = '/content/smy_garmin.csv'
agregat = pd.read_csv(pdfile)
agregat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Activity              100 non-null    int64  
 1   Average Speed (km/h)  100 non-null    float64
 2   Max Speed (km/h)      100 non-null    float64
 3   Avg HR                100 non-null    float64
 4   Max HR                100 non-null    int64  
 5   Avg Run Cadence       100 non-null    float64
 6   Total Ascent          100 non-null    int64  
 7   Total Descent         100 non-null    int64  
 8   Moving Time           100 non-null    int64  
 9   Elapsed Time          100 non-null    int64  
 10  Distance              100 non-null    float64
 11  Low Aerobic (%)       100 non-null    float64
 12  High Aerobic (%)      100 non-null    float64
 13  Anaerobic (%)         100 non-null    float64
 14  Max Distance          100 non-null    float64
 15  Marathon Time (s)     10

In [None]:
# Feature matrix: the aggregated training metrics plus the 'Age', 'Weight'
# and 'Gender' columns.
X = agregat[["Activity",
             "Average Speed (km/h)",
             "Max Speed (km/h)",
             "Avg HR",
             "Max HR",
             "Avg Run Cadence",
             "Total Ascent",
             "Total Descent",
             "Moving Time",
             "Elapsed Time",
             "Distance",
             "Max Distance",
             "Low Aerobic (%)",
             "High Aerobic (%)",
             "Anaerobic (%)",
             "Age",
             "Weight",
             "Gender"
              ]]
y = agregat['Marathon Time (s)'] # target: actual marathon time ("FM" presumably = full marathon; seconds per the column name)

In [None]:
# from sklearn.preprocessing import MinMaxScaler

# # Initialize MinMaxScaler with a feature range of 0 to 1
# scaler = MinMaxScaler(feature_range=(0, 1))

# X_scaled = scaler.fit_transform(X)
# X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# # Show summary statistics of the scaled data
# print(X_scaled.describe())

In [None]:
# Hold out 20% of the rows as a test set; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,Activity,Average Speed (km/h),Max Speed (km/h),Avg HR,Max HR,Avg Run Cadence,Total Ascent,Total Descent,Moving Time,Elapsed Time,Distance,Max Distance,Low Aerobic (%),High Aerobic (%),Anaerobic (%),Age,Weight,Gender
55,115,10.28,14.4,152.0,175,180.13,4619,4683,490108,489122,1750.0,29.0,75.0,23.0,2.0,29,49.0,0
88,110,11.18,33.03,139.66,196,179.98,3890,4178,393766,398436,1266.38,42.54,79.09,10.0,10.91,43,60.0,1
26,144,11.09,16.66,154.0,181,177.17,5706,5774,612033,612294,2197.0,38.0,77.0,15.0,8.0,33,61.5,1
42,144,11.48,16.13,151.0,180,174.85,5839,5889,612653,612540,2199.0,34.0,80.0,13.0,7.0,29,63.0,1
69,115,10.04,13.82,152.0,175,180.78,4526,4627,490075,490358,1764.0,26.0,80.0,14.0,6.0,29,48.0,0


In [None]:
# Fit a linear regression on the training split (features are used unscaled).
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score, KFold
# 10-fold cross-validation with shuffling and a fixed seed. Note that it runs
# on the full data set (X, y), not only on the training split.
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# The "neg_*" scorers return negated errors, so the sign is flipped to get
# positive RMSE / MAE values.
cv_r2_scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
cv_rmse_scores = -cross_val_score(model, X, y, cv=kf, scoring='neg_root_mean_squared_error')
cv_mae_scores = -cross_val_score(model, X, y, cv=kf, scoring='neg_mean_absolute_error')


# Report the mean and standard deviation across folds ("detik" = seconds).
print(f"Cross-Validation Results ({n_splits}-Fold):")
print(f"R2: {cv_r2_scores.mean():.4f} (+/- {cv_r2_scores.std():.4f})")
print(f"RMSE: {cv_rmse_scores.mean():.4f} detik (+/- {cv_rmse_scores.std():.4f})")
print(f"MAE: {cv_mae_scores.mean():.4f} detik (+/- {cv_mae_scores.std():.4f})")

Cross-Validation Results (10-Fold):
R2: 0.8255 (+/- 0.0811)
RMSE: 538.7749 detik (+/- 154.4320)
MAE: 435.1682 detik (+/- 124.6905)


In [None]:
from sklearn.feature_selection import RFE

# Initialize RFE with the model and the number of features to keep (4 here).
# NOTE(review): the features are unscaled (the scaling cell is commented out);
# confirm the ranking is not driven by the columns' different units.
rfe = RFE(model, n_features_to_select=4)

# Fit RFE on the training split
rfe.fit(X_train, y_train)

# rfe.support_ is used as a mask over the columns to get the selected feature names.
selected_features = X.columns[rfe.support_]
print("Fitur terbaik:", selected_features)

Fitur terbaik: Index(['Low Aerobic (%)', 'High Aerobic (%)', 'Anaerobic (%)', 'Gender'], dtype='object')


In [None]:
import pickle

# Persist the fitted LinearRegression to disk. The estimator fitted in this
# notebook is bound to `model`; no `linreg` variable is defined in the visible
# cells, so that name would raise NameError on a fresh kernel.
pdfile = '/content/linreg_new.pkl'
with open(pdfile, 'wb') as file:
    pickle.dump(model, file)