# Описание данных:

Данные содержат информацию о показателях здоровья, собранных с умных часов.

### Структура данных:

| Название столбца            | Описание                                        |
|-----------------------------|-------------------------------------------------|
| **User ID**                 | Уникальный идентификатор пользователя           |
| **Heart Rate (BPM)**        | Частота сердечных сокращений (ударов в минуту)  |
| **Blood Oxygen Level (%)**  | Уровень насыщения крови кислородом              |
| **Step Count**              | Количество шагов за период                      |
| **Sleep Duration (hours)**  | Продолжительность сна в часах                   |
| **Activity Level**          | Уровень активности пользователя                 |
| **Stress Level**            | Уровень стресса (шкала от 1 до 10)              |

# Подключение библиотек

In [1]:
import pandas as pd  

# Чтение и вывод данных

In [2]:
df = pd.read_csv("data/unclean_smartwatch_health_data.csv")
df

Unnamed: 0,User ID,Heart Rate (BPM),Blood Oxygen Level (%),Step Count,Sleep Duration (hours),Activity Level,Stress Level
0,4174.0,58.939776,98.809650,5450.390578,7.167235622316564,Highly Active,1
1,,,98.532195,727.601610,6.538239375570314,Highly_Active,5
2,1860.0,247.803052,97.052954,2826.521994,ERROR,Highly Active,5
3,2294.0,40.000000,96.894213,13797.338044,7.367789630207228,Actve,3
4,2130.0,61.950165,98.583797,15679.067648,,Highly_Active,6
...,...,...,...,...,...,...,...
9995,1524.0,78.819386,98.931927,2948.491953,7.402748595032027,Active,7
9996,4879.0,48.632659,95.773035,4725.623070,6.3821659358529015,Sedentary,2
9997,2624.0,73.834442,97.945874,2571.492060,6.91654920303435,Sedentary,4
9998,4907.0,,98.401058,3364.788855,5.691233932149209,Active,8


# Группировка данных

In [3]:
df_copy = df.copy()

### Подсчет количества отсутствующих значений

In [4]:
df.isnull().sum()

User ID                   201
Heart Rate (BPM)          400
Blood Oxygen Level (%)    300
Step Count                100
Sleep Duration (hours)    150
Activity Level            200
Stress Level              200
dtype: int64

### Группировка данных

In [5]:
df_copy['Activity Level'] = df_copy['Activity Level'].str.replace('Highly_Active', 'Highly Active')
df_copy['Activity Level'] = df_copy['Activity Level'].str.replace('Seddentary', 'Sedentary')
df_copy['Activity Level'] = df_copy['Activity Level'].str.replace('Actve', 'Active')

### Разбиение числовых данных

### Столбец Heart Rate (BPM)

In [6]:
df_copy['Heart Rate (BPM)'] = pd.to_numeric(df_copy['Heart Rate (BPM)'], errors='coerce')
df_copy['Heart Rate (BPM)'] = pd.cut(
    df['Heart Rate (BPM)'], 
    bins=[0, 50, 60, 70, 80 , 90, 100, 200, 300],
    labels=['0-50', '50-60', '60-70', '70-80', '80-90', '90-100', '100-200', '200+'],
    right=False
)

### Столбец Blood Oxygen Level (%)

In [7]:
df_copy['Blood Oxygen Level (%)'] = pd.to_numeric(df['Blood Oxygen Level (%)'], errors='coerce')
df_copy['Blood Oxygen Level (%)'] = pd.cut(
    df['Blood Oxygen Level (%)'],
    bins = [90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100],
    labels = ['90-91', '91-92', '92-93', '93-94', '94-95', '95-96', '96-97', '97-98', '98-99', '99-100'],
    right = False
)

### Столбец Step Count

In [8]:
df_copy['Step Count'] = pd.to_numeric(df['Step Count'], errors='coerce')
df_copy['Step Count'] = pd.cut(
    df['Step Count'],
    bins = [0, 2000, 4000, 6000, 8000, 10000, 1000000],
    labels = ['0-2000', '2000-4000', '4000-6000', '6000-8000', '8000-10000', '10000+'],
    right = False
)

### Столбец Sleep Duration (hours)

In [9]:
df_copy['Sleep Duration (hours)'] = pd.to_numeric(df_copy['Sleep Duration (hours)'], errors='coerce')
mean_sleep = df_copy['Sleep Duration (hours)'].mean()
df_copy['Sleep Duration (hours)'] = df_copy['Sleep Duration (hours)'].fillna(mean_sleep)
df_copy['Sleep Duration (hours)'] = pd.cut(
    df_copy['Sleep Duration (hours)'],  
    bins=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100],
    labels=['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10', '10+'],
    right=False
)

### Просмотр уникальные значений в каждом столбце

In [10]:
for column in df_copy.columns:
    value_counts = df_copy[column].value_counts(dropna = False) 
    print(value_counts)
    print("-" * 50)

User ID
NaN       201
3773.0     10
3708.0     10
4854.0      9
1684.0      9
         ... 
2025.0      1
4518.0      1
2108.0      1
2420.0      1
1466.0      1
Name: count, Length: 3635, dtype: int64
--------------------------------------------------
Heart Rate (BPM)
70-80      2481
60-70      2045
80-90      2013
50-60      1080
90-100     1044
100-200     469
0-50        418
NaN         400
200+         50
Name: count, dtype: int64
--------------------------------------------------
Blood Oxygen Level (%)
97-98     1830
NaN       1826
98-99     1825
99-100    1519
96-97     1459
95-96      895
94-95      434
93-94      163
92-93       37
91-92       11
90-91        1
Name: count, dtype: int64
--------------------------------------------------
Step Count
0-2000        2455
10000+        2391
2000-4000     1793
4000-6000     1422
6000-8000     1044
8000-10000     795
NaN            100
Name: count, dtype: int64
--------------------------------------------------
Sleep Duration (hours)
