In [31]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [32]:
train.insert(4, 'Height(Inches)', train.apply(lambda x: int(x['Height(Feet)']) * 12 + int(x['Height(Remainder_Inches)']), axis=1))
test.insert(4, 'Height(Inches)', test.apply(lambda x: int(x['Height(Feet)']) * 12 + int(x['Height(Remainder_Inches)']), axis=1))
train = train.drop(['ID', 'Height(Feet)', 'Height(Remainder_Inches)'], axis=1)
test = test.drop(['ID', 'Height(Feet)', 'Height(Remainder_Inches)'], axis=1)

In [33]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Exercise_Duration    7500 non-null   int64  
 1   Body_Temperature(F)  7500 non-null   float64
 2   BPM                  7500 non-null   int64  
 3   Height(Inches)       7500 non-null   int64  
 4   Weight(lb)           7500 non-null   float64
 5   Weight_Status        7500 non-null   object 
 6   Gender               7500 non-null   object 
 7   Age                  7500 non-null   int64  
 8   Calories_Burned      7500 non-null   int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 527.5+ KB


In [34]:
def remove_outliers(data, threshold=3):
    z_scores = np.abs((data - np.mean(data)) / np.std(data))
    return data[z_scores < threshold]

numeric_features = train.columns[:5]
for feature in numeric_features:
    train[feature] = remove_outliers(train[feature])


In [35]:
train = train.dropna()
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7449 entries, 0 to 7499
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Exercise_Duration    7449 non-null   int64  
 1   Body_Temperature(F)  7449 non-null   float64
 2   BPM                  7449 non-null   float64
 3   Height(Inches)       7449 non-null   float64
 4   Weight(lb)           7449 non-null   float64
 5   Weight_Status        7449 non-null   object 
 6   Gender               7449 non-null   object 
 7   Age                  7449 non-null   int64  
 8   Calories_Burned      7449 non-null   int64  
dtypes: float64(4), int64(3), object(2)
memory usage: 582.0+ KB


In [36]:
ordinal_features = ['Weight_Status', 'Gender']

for feature in ordinal_features:
    le = LabelEncoder()
    le = le.fit(train[feature])
    train[feature] = le.transform(train[feature])

    # train데이터에서 존재하지 않았던 값이 test 데이터에 존재할 수도 있습니다.
    # 따라서 test 데이터를 바로 변형시키지 않고 고윳값을 확인후 test 데이터를 변환합니다.
    for label in np.unique(test[feature]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test[feature] = le.transform(test[feature])

In [37]:
# train_0 = train.loc[train['Weight_Status'] == 0]
# train_1 = train.loc[train['Weight_Status'] == 1]
# train_2 = train.loc[train['Weight_Status'] == 2]
# counts = [len(train_0), len(train_1), len(train_2)]
# train_0 = train_0.sample(n=max(counts), replace=True)
# train_1 = train_1.sample(n=max(counts), replace=True)
# train_2 = train_2.sample(n=max(counts), replace=True)
# train_filled = pd.concat([train_0, train_1, train_2], axis=0)
# train_filled = train_filled.sample(frac=1)

In [38]:
# train_filled['Weight_Status'].value_counts()

In [39]:
# 독립변수로 설정할 train_x에서는 종속변수를 제거합니다. 또한 분석에 활용하지 않는 ID 데이터를 제거합니다.
train_x = train.drop(['Calories_Burned'], axis = 1)
# train_y 변수를 종속변수로 사용하기 위해 Calories_Burned 데이터를 지정하였습니다.
train_y = train['Calories_Burned']

# train_x 데이터와 마찬가지로 분석에 활용하지 않는 ID 데이터를 제거합니다.
test_x = test

In [40]:
train_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7449 entries, 0 to 7499
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Exercise_Duration    7449 non-null   int64  
 1   Body_Temperature(F)  7449 non-null   float64
 2   BPM                  7449 non-null   float64
 3   Height(Inches)       7449 non-null   float64
 4   Weight(lb)           7449 non-null   float64
 5   Weight_Status        7449 non-null   int32  
 6   Gender               7449 non-null   int32  
 7   Age                  7449 non-null   int64  
dtypes: float64(4), int32(2), int64(2)
memory usage: 465.6 KB


In [41]:
from flaml import AutoML
automl = AutoML()
automl.fit(train_x, train_y, task="regression", ensemble=True, early_stop=True, time_budget=60*30, seed=42)

[flaml.automl.logger: 04-11 17:04:45] {1768} INFO - task = regression
[flaml.automl.logger: 04-11 17:04:45] {1775} INFO - Data split method: uniform
[flaml.automl.logger: 04-11 17:04:45] {1778} INFO - Evaluation method: cv
[flaml.automl.logger: 04-11 17:04:45] {1891} INFO - Minimizing error metric: 1-r2
[flaml.automl.logger: 04-11 17:04:45] {2011} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl.logger: 04-11 17:04:45] {2341} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 04-11 17:04:45] {2480} INFO - Estimated sufficient time budget=1200s. Estimated necessary time budget=10s.
[flaml.automl.logger: 04-11 17:04:45] {2532} INFO -  at 0.3s,	estimator lgbm's best error=0.4959,	best estimator lgbm's best error=0.4959
[flaml.automl.logger: 04-11 17:04:45] {2341} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 04-11 17:04:45] {2532} INFO -  at 0.4s,	estimator lgbm's best error=0.4959,	b

In [42]:
preds = automl.predict(test_x)

In [43]:
submission['Calories_Burned'] = preds
submission.to_csv('./submit.csv', index = False)