In [1]:
import sys
# NOTE(review): absolute, machine-specific path put at the front of sys.path;
# presumably this is where the local `work_with_db` and `data_preprocessing`
# modules imported below live — confirm, and prefer a configurable/relative path.
sys.path.insert(0, r'P:\pet ML')
import pandas as pd
import numpy as np
import sqlite3
from work_with_db import work_with_db
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt
from data_preprocessing import preprocessing

In [2]:
# Instantiate the two project helpers imported above.
db = work_with_db()  # used further down to issue SELECT queries via action_with_db()
prep = preprocessing()  # used in the next cell for train_test_data()

In [3]:
# Run the project helper's train/test step; the output below shows a
# 750000-row table that includes the Calories column.
# NOTE(review): the return value (if any) is not captured — presumably the call
# prepares the data that is read back from the database later; confirm.
prep.train_test_data()

           Sex  Age  Height  Weight  Duration  Heart_Rate  Body_Temp  Calories
0         male   36   189.0    82.0      26.0       101.0       41.0     150.0
1       female   64   163.0    60.0       8.0        85.0       39.7      34.0
2       female   51   161.0    64.0       7.0        84.0       39.8      29.0
3         male   20   192.0    90.0      25.0       105.0       40.7     140.0
4       female   38   166.0    61.0      25.0       102.0       40.6     146.0
...        ...  ...     ...     ...       ...         ...        ...       ...
749995    male   28   193.0    97.0      30.0       114.0       40.9     230.0
749996  female   64   165.0    63.0      18.0        92.0       40.5      96.0
749997    male   60   162.0    67.0      29.0       113.0       40.9     221.0
749998    male   45   182.0    91.0      17.0       102.0       40.3     109.0
749999  female   39   171.0    65.0      19.0        97.0       40.6     103.0

[750000 rows x 8 columns]


In [4]:
# Column lists are defined once so the SELECT and the DataFrame header can
# never drift apart (previously each list was written out twice).
feature_columns = ['id', 'Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
target_columns = ['id', 'Calories']

# Feature table: one row per id. A fresh list is passed to the helper so that
# the header list is unaffected even if the helper mutates its argument.
feature_rows = db.action_with_db(action='SELECT', columns=list(feature_columns))
df = pd.DataFrame(feature_rows, columns=feature_columns)

# Target table: id plus Calories. This is the last query in the cell and is
# issued with close=True.
# NOTE(review): presumably close=True closes the DB connection — confirm in work_with_db.
target_rows = db.action_with_db(action='SELECT', columns=list(target_columns), close=True)
target = pd.DataFrame(target_rows, columns=target_columns)

In [5]:
# Split the target table by whether Calories is known: rows with a value go to
# the train split, rows with a missing value go to the test split.
# A single boolean mask (and its negation) replaces the `== True` / `== False`
# comparisons and guarantees the two splits are exact complements.
has_calories = target['Calories'].notna()
train_target = target[has_calories]
test_target = target[~has_calories]

# Show both shapes as a quick sanity check on the split sizes.
display(train_target.shape)
display(test_target.shape)

(750000, 2)

(250000, 2)

In [6]:
# Attach each target split to the feature table by row id.
# The join is an inner join (pandas' default), so only ids present in the
# respective split survive; both results carry a trailing Calories column.
train_data = pd.merge(df, train_target, on='id', how='inner')
test_data = pd.merge(df, test_target, on='id', how='inner')

In [7]:
# Pull the regression target out of the merged training frame by NAME rather
# than by position: with `iloc[:, -1]`, re-running this cell after Calories had
# already been dropped would silently bind the last feature column as the target.
# Selecting by name fails loudly with a KeyError instead.
# NOTE: this rebinds `target` from the id/Calories frame loaded earlier to a
# Series; re-run the loading cell before re-running any cell that needs the frame.
target = train_data['Calories']

# Keep only model features: drop the target and the row identifier.
train_data = train_data.drop(columns=['Calories', 'id'])

In [8]:
# Show the remaining feature names as a plain Python list.
list(train_data.columns)

['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

In [9]:
# Partition the feature names by dtype: string-like/categorical columns are
# one-hot encoded later, int64/float64 columns are standardised.
cat_cols = list(train_data.select_dtypes(include=['object', 'category']).columns)
num_cols = list(train_data.select_dtypes(include=['int64', 'float64']).columns)

# Display both lists, categorical first.
display(cat_cols, num_cols)

['Sex']

['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

In [10]:
# Per-column preprocessing steps, as (name, transformer, columns) triples:
#   - categorical columns are one-hot encoded; categories unseen during fit
#     are ignored at transform time instead of raising,
#   - numeric columns are standardised.
feature_transformers = [
    ('cat_cols', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_cols', StandardScaler(), num_cols),
]

# NOTE: this rebinds `prep`, which an earlier cell bound to the project's
# preprocessing() helper; that helper is no longer reachable under this name.
prep = ColumnTransformer(transformers=feature_transformers)

In [11]:
# Fit the column transformer on the training features and transform them in one step.
# NOTE: `train_data` is rebound from a DataFrame to the transformed array shown
# below, so this cell is not idempotent — re-create `train_data` before re-running it.
train_data = prep.fit_transform(train_data)
display(train_data)

array([[ 0.        ,  1.        , -0.3571921 , ...,  1.2663241 ,
         0.58371421,  1.23577241],
       [ 1.        ,  0.        ,  1.48794322, ..., -0.88830917,
        -1.10943632, -0.43116345],
       [ 1.        ,  0.        ,  0.63127325, ..., -1.00801102,
        -1.21525823, -0.30293761],
       ...,
       [ 0.        ,  1.        ,  1.22435246, ...,  1.62542965,
         1.85357711,  1.10754658],
       [ 0.        ,  1.        ,  0.23588711, ...,  0.18900746,
         0.68953612,  0.33819156],
       [ 1.        ,  0.        , -0.15949903, ...,  0.42841116,
         0.16042658,  0.72286907]], shape=(750000, 8))

In [12]:
# Apply the transformer fitted on the training features to the test rows
# (transform only — nothing is re-fitted here).
# `test_data` still carries the `id` and `Calories` columns from the merge; only
# the columns listed in cat_cols/num_cols are used, and the 8-column output
# below shows the extra columns are not passed through.
# NOTE: `test_data` is rebound from a DataFrame to an array, so the test ids are
# no longer available from this variable afterwards.
test_data = prep.transform(test_data)
display(test_data)

array([[ 0.        ,  1.        ,  0.23588711, ..., -1.00801102,
        -0.8977925 , -0.30293761],
       [ 0.        ,  1.        , -1.016169  , ...,  0.54811301,
         0.58371421,  0.59464323],
       [ 1.        ,  0.        , -0.81847593, ...,  0.06930561,
         0.68953612,  0.4664174 ],
       ...,
       [ 1.        ,  0.        , -0.68668055, ..., -0.17009808,
         0.26624849,  0.08173989],
       [ 1.        ,  0.        ,  1.35614784, ...,  1.14662225,
         1.11282375,  0.8510949 ],
       [ 0.        ,  1.        , -0.68668055, ..., -0.05039623,
         0.26624849, -0.17471178]], shape=(250000, 8))