
---
---

# 1. Setup

---
---

### Import Packages

##### Import the general packages

In [None]:
import os
from collections import Counter
from io import StringIO

import numpy as np
import pandas as pd
from google.colab import drive
from tqdm.auto import tqdm

##### Install & import the tsfresh libraries

In [None]:
!pip install tsfresh

Collecting tsfresh
[?25l  Downloading https://files.pythonhosted.org/packages/35/b7/cbbfb02d50a93dbb710a730f168711eb343829e1cdea9f0d001d91aeefd6/tsfresh-0.17.0-py2.py3-none-any.whl (91kB)
[K     |███▋                            | 10kB 15.9MB/s eta 0:00:01[K     |███████▏                        | 20kB 21.0MB/s eta 0:00:01[K     |██████████▉                     | 30kB 22.9MB/s eta 0:00:01[K     |██████████████▍                 | 40kB 12.1MB/s eta 0:00:01[K     |██████████████████              | 51kB 7.7MB/s eta 0:00:01[K     |█████████████████████▋          | 61kB 7.8MB/s eta 0:00:01[K     |█████████████████████████▏      | 71kB 7.8MB/s eta 0:00:01[K     |████████████████████████████▉   | 81kB 8.6MB/s eta 0:00:01[K     |████████████████████████████████| 92kB 3.9MB/s 
Collecting distributed>=2.11.0
[?25l  Downloading https://files.pythonhosted.org/packages/88/38/d9f0e31c15de18cb124d1ed33cf9c99c84f05f251ff6767e7573c217725b/distributed-2.30.1-py3-none-any.whl (656kB)
[K

In [None]:
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import extract_features

  import pandas.util.testing as tm


##### Import and setup the Json package for loading in and storing data

In [None]:
import json

class JSONEncoder(json.JSONEncoder):
    """JSON encoder that can also serialise objects exposing a ``to_json`` method.

    Such objects (e.g. pandas DataFrames) are stored as the string returned by
    ``obj.to_json(orient='records')``; everything else is left to the standard encoder.

    NOTE(review): for a DataFrame, ``orient='records'`` writes the rows only, so the
    frame's index is not part of the stored string - confirm this is intended.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        Raises:
            TypeError: from the base encoder, when ``obj`` has no ``to_json`` method.
        """
        # guard clause: anything without to_json goes to the stock implementation
        if not hasattr(obj, 'to_json'):
            return super().default(obj)
        return obj.to_json(orient='records')

### Set the maximum number of rows shown in the outputs

In [None]:
# Limit how many rows pandas shows when a DataFrame is displayed.
# The fully-qualified key is used because the short pattern 'max_rows' matches more than one
# option in recent pandas releases, which makes set_option raise an OptionError.
pd.set_option('display.max_rows', 25)

### Mount the drive

In [None]:
# Mount Google Drive at /content/drive so that the os.chdir call further down can reach the data folder.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Set the directory for reading the data

In [None]:
# Make the data folder the working directory: every file opened later in this notebook
# is given as a bare file name. Exactly one of the two lines below should be active.
os.chdir('/content/drive/My Drive/4th Year/CA4015/assignment2/Submission/data')    # Ivan's directory
# os.chdir('/content/drive/My Drive/assignment2/Submission/data')                     # Nathan's directory

---
---

# 2. Read in the data

---
---

### Read in the dictionary data from the Json file

In [None]:
# Load the mapping of subject_id -> JSON-encoded sensor data.
# A context manager is used so that the file handle is closed once the content has been parsed.
with open('map_of_subject_id_to_its_cleaned_sensor_data.json') as cleaned_sensor_file:
    json_df_data = json.load(cleaned_sensor_file)
# for the no step data
# json_df_data = json.load(open('no_step_map_of_subject_id_to_its_cleaned_sensor_data.json'))

In [None]:
# Rebuild one DataFrame per subject from its JSON-encoded string.
# The string is wrapped in StringIO because passing literal JSON text directly to
# pd.read_json is deprecated in recent pandas releases; a file-like object is accepted
# by old and new versions alike.
id_to_df_map = {}
for subject_id, sensor_json in tqdm(json_df_data.items()):
    id_to_df_map[subject_id] = pd.read_json(StringIO(sensor_json))

HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))




### Manually inspect the read in data

In [None]:
# Sanity check: print the sensor DataFrame of the first three subjects only.
for student_id, df in list(id_to_df_map.items())[:3]:
    print("--------------------", student_id, "--------------------")
    print(df)

-------------------- 46343 --------------------
       second    x_move    y_move    z_move  heart_rate  step_count  psg_status
0         390 -0.540527  0.680496 -0.271774        90.0   79.820000           0
1         391 -0.426881  0.920799 -0.093921        90.0  109.617059           0
2         392 -0.448256  0.816442 -0.228871        90.0  125.870000           0
3         393 -0.464479  0.765035 -0.432477        90.0  125.870000           0
4         394 -0.494360  0.798880 -0.076882        90.0  125.870000           0
...       ...       ...       ...       ...         ...         ...         ...
16556   16946 -0.441763 -0.525671  0.723509        73.0    0.000000           0
16557   16947 -0.441267 -0.525272  0.724310        73.0    0.000000           0
16558   16948 -0.441876 -0.525352  0.724083        73.0    0.000000           0
16559   16949 -0.442227 -0.525543  0.723603        73.0    0.000000           0
16560   16950 -0.441905 -0.525534  0.723882        73.0    0.000000     

---
---

# 3. Get the data into the correct format for extracting the features

---
---

### Add a 'session_id' column to each row specifying which bin of seconds that row falls in

In [None]:
# Width of one session bin, in the units of the 'second' column; rows whose 'second'
# falls in the same bin_size-wide window are given the same session_id below.
bin_size = 30

In [None]:
# For every subject, label each row with the session (bin_size-wide window of 'second')
# it belongs to, and keep the labelled frame in map_subject_to_df_with_id.
#
# The labelling is vectorised. The earlier version grew the frame with pd.concat inside a
# loop (one boolean scan of the whole frame per window), seeded it with `columns=None`
# because list.extend() returns None, and raised on a subject with no rows.
# The resulting session_id values are the same: non-empty windows are numbered 0, 1, 2, ...
# in increasing time order, and rows with a negative 'second' are left out.
map_subject_to_df_with_id = {}
for subject_id, fixed_sensor_df in tqdm(id_to_df_map.items()):

    print("---------------", subject_id, "-----------------")

    print(fixed_sensor_df.shape)
    # drop every row that contains a NaN
    no_nans_fixed_sensor_df = fixed_sensor_df.dropna()
    print(no_nans_fixed_sensor_df.shape)

    # windows start at second 0, so earlier rows do not belong to any session
    rows_from_zero_df = no_nans_fixed_sensor_df.loc[no_nans_fixed_sensor_df.second >= 0]

    # index of the window each row falls in: [0, bin_size) -> 0, [bin_size, 2 * bin_size) -> 1, ...
    window_index = rows_from_zero_df.second // bin_size

    # number only the windows that actually contain rows, consecutively from 0
    session_codes = pd.factorize(window_index, sort=True)[0]

    # stable sort: sessions in increasing order, original row order kept inside each session
    session_df = (
        rows_from_zero_df
        .assign(session_id=session_codes)
        .sort_values("session_id", kind="mergesort")
    )

    map_subject_to_df_with_id[subject_id] = session_df

HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

--------------- 46343 -----------------
(16561, 7)
(16561, 7)
--------------- 759667 -----------------
(14184, 7)
(14184, 7)
--------------- 781756 -----------------
(29369, 7)
(29369, 7)
--------------- 844359 -----------------
(26881, 7)
(26881, 7)
--------------- 1066528 -----------------
(28389, 7)
(28389, 7)
--------------- 1360686 -----------------
(27695, 7)
(27695, 7)
--------------- 1449548 -----------------
(28561, 7)
(28561, 7)
--------------- 1455390 -----------------
(28621, 7)
(28621, 7)
--------------- 1818471 -----------------
(28711, 7)
(28711, 7)
--------------- 2598705 -----------------
(28591, 7)
(28591, 7)
--------------- 2638030 -----------------
(28411, 7)
(28411, 7)
--------------- 3509524 -----------------
(12448, 7)
(12448, 7)
--------------- 3997827 -----------------
(28711, 7)
(28711, 7)
--------------- 4018081 -----------------
(14940, 7)
(14940, 7)
--------------- 4314139 -----------------
(28801, 7)
(28801, 7)
--------------- 4426783 -----------------
(29

##### Have a look at the values in the 'session_id' column

In [None]:
# For each subject, print a dictionary of session_id -> number of rows carrying that id
# (keys appear in the order in which the ids are first met in the column).
for sub_id, df in map_subject_to_df_with_id.items():
    print(dict(Counter(df.session_id)))

{0: 30, 1: 30, 2: 30, 3: 30, 4: 30, 5: 30, 6: 30, 7: 30, 8: 30, 9: 30, 10: 30, 11: 30, 12: 30, 13: 30, 14: 30, 15: 30, 16: 30, 17: 30, 18: 30, 19: 30, 20: 30, 21: 30, 22: 30, 23: 30, 24: 30, 25: 30, 26: 30, 27: 30, 28: 30, 29: 30, 30: 30, 31: 30, 32: 30, 33: 30, 34: 30, 35: 30, 36: 30, 37: 30, 38: 30, 39: 30, 40: 30, 41: 30, 42: 30, 43: 30, 44: 30, 45: 30, 46: 30, 47: 30, 48: 30, 49: 30, 50: 30, 51: 30, 52: 30, 53: 30, 54: 30, 55: 30, 56: 30, 57: 30, 58: 30, 59: 30, 60: 30, 61: 30, 62: 30, 63: 30, 64: 30, 65: 30, 66: 30, 67: 30, 68: 30, 69: 30, 70: 30, 71: 30, 72: 30, 73: 30, 74: 30, 75: 30, 76: 30, 77: 30, 78: 30, 79: 30, 80: 30, 81: 30, 82: 30, 83: 30, 84: 30, 85: 30, 86: 30, 87: 30, 88: 30, 89: 30, 90: 30, 91: 30, 92: 30, 93: 30, 94: 30, 95: 30, 96: 30, 97: 30, 98: 30, 99: 30, 100: 30, 101: 30, 102: 30, 103: 30, 104: 30, 105: 30, 106: 30, 107: 30, 108: 30, 109: 30, 110: 30, 111: 30, 112: 30, 113: 30, 114: 30, 115: 30, 116: 30, 117: 30, 118: 30, 119: 30, 120: 30, 121: 30, 122: 30, 12

---
---

# 4. Create a map of 'session_id' to 'psg_status' for each student_id

---
---

### For each subject, create a map between each of their session_ids and that session's psg_status

In [None]:
# Build, for every subject, a dictionary of session_id -> the psg_status that occurs most
# often among the rows of that session.
map_subject_id_to_a_map_of_the_session_id_to_psg_status = {}

for subject_id, sensor_df in tqdm(map_subject_to_df_with_id.items()):

    # for this subject, the dictionary mapping each session to its psg_status
    subjects_session_to_psg_map = {}

    # sorted(): the iteration order of a set is unspecified, but the order of this
    # dictionary's entries is relied upon later, so it is made deterministic here
    for session_id in sorted(set(sensor_df.session_id)):

        # the psg_status of every row that belongs to this session
        all_psg_status = sensor_df.loc[sensor_df['session_id'] == session_id, 'psg_status']

        # most frequent status; on a tie Counter keeps the value it encountered first
        most_common_psg_status = Counter(all_psg_status).most_common(1)[0][0]

        subjects_session_to_psg_map[session_id] = most_common_psg_status

    # file this subject's dictionary under its subject_id
    map_subject_id_to_a_map_of_the_session_id_to_psg_status[subject_id] = subjects_session_to_psg_map

HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))




##### Manually inspect this dictionary

In [None]:
# Print every session_id -> psg_status pair, for the first subject only.
for sub_id, id_to_psg_dict in list(map_subject_id_to_a_map_of_the_session_id_to_psg_status.items())[:1]:

    # header line carrying the subject_id
    print("----------------", sub_id, "---------------------")

    for session_id, psg_status in id_to_psg_dict.items():
        print(session_id, "->", psg_status)

---------------- 46343 ---------------------
0 -> 0
1 -> 0
2 -> 0
3 -> 0
4 -> 0
5 -> 0
6 -> 0
7 -> 0
8 -> 0
9 -> 0
10 -> 0
11 -> 0
12 -> 0
13 -> 0
14 -> 0
15 -> 0
16 -> 0
17 -> 0
18 -> 0
19 -> 0
20 -> 0
21 -> 0
22 -> 0
23 -> 0
24 -> 0
25 -> 0
26 -> 0
27 -> 0
28 -> 0
29 -> 0
30 -> 0
31 -> 0
32 -> 0
33 -> 0
34 -> 1
35 -> 1
36 -> 1
37 -> 1
38 -> 2
39 -> 2
40 -> 2
41 -> 2
42 -> 2
43 -> 2
44 -> 2
45 -> 2
46 -> 2
47 -> 2
48 -> 2
49 -> 2
50 -> 2
51 -> 2
52 -> 2
53 -> 2
54 -> 2
55 -> 2
56 -> 3
57 -> 3
58 -> 3
59 -> 3
60 -> 3
61 -> 3
62 -> 3
63 -> 3
64 -> 3
65 -> 3
66 -> 3
67 -> 3
68 -> 3
69 -> 3
70 -> 3
71 -> 3
72 -> 3
73 -> 3
74 -> 3
75 -> 3
76 -> 3
77 -> 3
78 -> 3
79 -> 3
80 -> 3
81 -> 3
82 -> 3
83 -> 3
84 -> 3
85 -> 3
86 -> 3
87 -> 3
88 -> 3
89 -> 3
90 -> 3
91 -> 3
92 -> 3
93 -> 3
94 -> 3
95 -> 3
96 -> 3
97 -> 3
98 -> 3
99 -> 3
100 -> 3
101 -> 3
102 -> 3
103 -> 3
104 -> 3
105 -> 3
106 -> 3
107 -> 3
108 -> 3
109 -> 3
110 -> 0
111 -> 0
112 -> 0
113 -> 0
114 -> 1
115 -> 2
116 -> 2
117 -> 2
118

---
---

# 5. Extract the features 

---
---

### Extract the features from each subject's sensor data

In [None]:
# Run tsfresh feature extraction once per subject and keep, for each subject, only the
# feature columns that contain no NaN at all.
map_id_to_extracted_features = {}
for subject_id, cleaned_sensor_df in tqdm(map_subject_to_df_with_id.items()):

    separator_line = "---------------------------------------------------"
    print(separator_line)
    print("======================", subject_id, "======================")
    print(separator_line)

    # extraction input: the label column removed, incomplete rows removed,
    # and session_id turned into strings (it is passed as column_id below)
    tsfresh_input_df = (
        cleaned_sensor_df
        .drop(columns=["psg_status"])
        .dropna()
        .assign(session_id=lambda frame: frame["session_id"].astype(str))
    )

    # one output row per session_id; 'second' is the sort column
    all_features_df = extract_features(
        tsfresh_input_df,
        column_id="session_id",
        column_sort="second",
        column_value=None,
    )
    print(all_features_df.shape)

    # discard every feature column holding at least one NaN
    complete_features_df = all_features_df.dropna(axis='columns')
    print(complete_features_df.shape)

    map_id_to_extracted_features[subject_id] = complete_features_df

HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [02:36<00:00, 31.23s/it]


(553, 3895)
(553, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [02:12<00:00, 26.44s/it]


(474, 3895)
(474, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:36<00:00, 55.28s/it]


(980, 3895)
(980, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:12<00:00, 50.43s/it]


(897, 3895)
(897, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:13<00:00, 50.61s/it]


(947, 3895)
(947, 1755)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:17<00:00, 51.46s/it]


(925, 3895)
(925, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:18<00:00, 51.80s/it]


(953, 3895)
(953, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:27<00:00, 53.42s/it]


(955, 3895)
(955, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:28<00:00, 53.66s/it]


(958, 3895)
(958, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:14<00:00, 50.85s/it]


(954, 3895)
(954, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:24<00:00, 52.95s/it]


(948, 3895)
(948, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [01:57<00:00, 23.41s/it]


(416, 3895)
(416, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:27<00:00, 53.55s/it]


(958, 3895)
(958, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [02:18<00:00, 27.70s/it]


(499, 3895)
(499, 1138)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:27<00:00, 53.45s/it]


(961, 3895)
(961, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:33<00:00, 54.66s/it]


(979, 3895)
(979, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [02:09<00:00, 25.94s/it]


(464, 3895)
(464, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:32<00:00, 54.59s/it]


(977, 3895)
(977, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [03:27<00:00, 41.57s/it]


(744, 3895)
(744, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:23<00:00, 52.72s/it]


(939, 3895)
(939, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:25<00:00, 53.16s/it]


(954, 3895)
(954, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [00:33<00:00,  6.75s/it]


(123, 3895)
(123, 1395)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:26<00:00, 53.32s/it]


(958, 3895)
(958, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:25<00:00, 53.05s/it]


(955, 3895)
(955, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:30<00:00, 54.09s/it]


(971, 3895)
(971, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:20<00:00, 52.17s/it]


(949, 3895)
(949, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:25<00:00, 53.18s/it]


(955, 3895)
(955, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:21<00:00, 52.28s/it]


(935, 3895)
(935, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:28<00:00, 53.75s/it]


(960, 3895)
(960, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [04:23<00:00, 52.64s/it]


(944, 3895)
(944, 1058)
---------------------------------------------------
---------------------------------------------------


Feature Extraction: 100%|██████████| 5/5 [03:21<00:00, 40.22s/it]


(719, 3895)
(719, 1058)



### Store the dictionary of unfiltered extracted features in a Json file

In [None]:
# Write the subject_id -> unfiltered feature DataFrame dictionary to disk;
# JSONEncoder turns each DataFrame into its to_json string.
with open('map_subject_id_to_its_unfiltered_extracted_features_df.json', 'w') as unfiltered_features_file:
    json.dump(map_id_to_extracted_features, unfiltered_features_file, cls=JSONEncoder)

In [None]:
# for the no step data
# with open('no_step_map_subject_id_to_its_unfiltered_extracted_features_df.json', 'w') as fp:
#    json.dump(map_id_to_extracted_features, fp, cls=JSONEncoder)

---
---

# 6. Select the most relevant features from all of these extracted features

---
---

In [None]:
# For every subject, keep only the extracted features that tsfresh's select_features
# judges relevant for predicting the session's psg_status.
map_id_to_filtered_extracted_features = {}
for subject_id, extracted_features_df in tqdm(map_id_to_extracted_features.items()):

    print("---------------------------------------------------")
    print("======================", subject_id, "======================")
    print("---------------------------------------------------")

    map_of_session_id_to_psg_status = map_subject_id_to_a_map_of_the_session_id_to_psg_status[subject_id]

    # The feature table is indexed by session_id cast to str (see the extraction step),
    # so its rows are not guaranteed to be in the numeric session order in which the
    # session -> psg_status dictionary was filled (string ids sort as "0", "1", "10", ...).
    # The target is therefore looked up by index label instead of being matched by
    # position, which would attach labels to the wrong sessions.
    psg_status_by_session_label = {
        str(session_id): psg_status
        for session_id, psg_status in map_of_session_id_to_psg_status.items()
    }
    # a KeyError here means a feature row has no psg_status recorded for its session
    target_series = pd.Series(
        [psg_status_by_session_label[str(session_label)] for session_label in extracted_features_df.index],
        index=extracted_features_df.index,
    )

    print(extracted_features_df.shape)
    features_filtered = select_features(extracted_features_df, target_series)
    print(features_filtered.shape)

    map_id_to_filtered_extracted_features[subject_id] = features_filtered

HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

---------------------------------------------------
---------------------------------------------------
(553, 1058)
(553, 357)
---------------------------------------------------
---------------------------------------------------
(474, 1058)
(474, 385)
---------------------------------------------------
---------------------------------------------------
(980, 1058)
(980, 270)
---------------------------------------------------
---------------------------------------------------
(897, 1058)
(897, 362)
---------------------------------------------------
---------------------------------------------------
(947, 1755)
(947, 653)
---------------------------------------------------
---------------------------------------------------
(925, 1058)
(925, 263)
---------------------------------------------------
---------------------------------------------------
(953, 1058)
(953, 395)
---------------------------------------------------
---------------------------------------------------
(955, 1

---
---

# 7. Store this data in a json file

---
---

### Store the extracted features dictionary in a Json file

In [None]:
# Write the subject_id -> filtered feature DataFrame dictionary to disk;
# JSONEncoder turns each DataFrame into its to_json string.
with open('map_subject_id_to_its_filtered_extracted_features_df.json', 'w') as filtered_features_file:
    json.dump(map_id_to_filtered_extracted_features, filtered_features_file, cls=JSONEncoder)
# for the no step data
# with open('no_step_map_subject_id_to_its_filtered_extracted_features_df.json', 'w') as fp:
#     json.dump(map_id_to_filtered_extracted_features, fp, cls=JSONEncoder)

### Store the map of each subject's session_ids to their psg_status in a Json file

In [None]:
# Write the nested dictionary subject_id -> {session_id -> psg_status} to disk.
with open('map_subject_id_to_a_map_of_the_session_id_to_psg_status.json', 'w') as session_status_file:
    json.dump(map_subject_id_to_a_map_of_the_session_id_to_psg_status, session_status_file, cls=JSONEncoder)