# Minimum reproducible example: Kernel drop

The purpose of this script is to provide a minimal reproducible example of the issue I keep facing. The issue is that the Python 3 kernel in my Jupyter notebook inconsistently needs restarting.

### Imports.

In [1]:
try:
    import EntropyHub
except:
    !pip install EntropyHub
    import EntropyHub
import itertools
import numpy
import pandas
try:
    import pyinform
except:
    !pip install pyinform
    import pyinform
import random
import time
import tqdm
import warnings

### Define functions.

In [2]:
def chaoticlifeentropyfs(pt_timeline):
    '''
    There are two categories of entropy-based feature sets for both appointments and did-not-attends:
    Sequential
    1.	activeInformation
    2.	entropyRate
    Summative
    3.	spectralEntropy
    4.	sampleEntropy
    5.	eoe (entropy of entropy)
    6.	averageEntropy
    7.	bubbleEntropy
    Use the following parameters for all summative entropy statistics other than spectral entropy, which doesn't require them:
    -	obs = three-monthly count, enough to amass a period of use.
    -	window breath ("embedding dimension") = 4, to indicate a year's worth of appointments.
    -	window shift ("embedding time delay") = 1, to be sensitive to quarterly changes in behaviour.
    '''
    
    
    
    # Set parameters.
    # ## Set warnings parameter to handle divide-by-zero issues with spectral entropy.
    warnings.filterwarnings("error")
    # ## Window breath ("embedding dimension") = 4, to indicate a year's worth of appointments.
    embeddingDimension = 4
    # ## Window shift ("embedding time delay") = 1, to be sensitive to quarterly changes in behaviour.
    embeddingTimeDelay = 1
    # ## Length of the patient's timeline.
    len_timeline = len(pt_timeline)
    # Convert pt_timeline into a numpy.array.
    pt_timeline = numpy.array(pt_timeline)
    
    # activeInformation
    # ...
    if len_timeline <= embeddingDimension:
        activeInformation = None
    else:
        try:
            activeInformation = \
                pyinform.activeinfo.active_info(pt_timeline, k = embeddingDimension)
        except:
            activeInformation = None
    
    # entropyRate
    # ...
    if len_timeline <= embeddingDimension:
        entropyRate = None
    else:
        try:
            entropyRate = \
                pyinform.entropyrate.entropy_rate(pt_timeline, k = embeddingDimension)
        except:
            entropyRate = None
    
    # spectralEntropy
    # ...
    if len_timeline <= 10:
        spectralEntropy = None
    else:
        try:
            spectralEntropy, _ = EntropyHub.SpecEn(pt_timeline)
        except RuntimeWarning:
            spectralEntropy = None
    
    # sampleEntropy
    # ...
    if len_timeline <= 10:
        sampleEntropy = None
    else:
        try:
            sampleEntropy, _, _ = \
                EntropyHub.SampEn(pt_timeline, m = embeddingDimension, tau = embeddingTimeDelay)
            sampleEntropy = sampleEntropy[-1]
        except:
            sampleEntropy = None

    # eoe and averageEntropy
    # ...
    if len_timeline <= 10:
        eoe = None
        averageEntropy = None
    else:
        try:
            eoe, averageEntropy, _ = \
                EntropyHub.EnofEn(pt_timeline, tau = embeddingDimension, S = math.floor(len_timeline / 4) )
        except:
            eoe = None
            averageEntropy = None

    # bubbleEntropy
    # ...
    if len_timeline <= 10:
        bubbleEntropy = None
    else:
        try:
            bubbleEntropy, _ = EntropyHub.BubbEn(pt_timeline, m = embeddingDimension, tau = embeddingTimeDelay)
            bubbleEntropy = bubbleEntropy[-1]
        except:
            bubbleEntropy = None
    
    # Package the output.
    ls_entropyBasedFeatureSets = \
        [
        activeInformation
        ,entropyRate
        ,spectralEntropy
        ,sampleEntropy
        ,eoe
        ,averageEntropy
        ,bubbleEntropy
        ]
    
    return ls_entropyBasedFeatureSets

### Manufacture data.

In [3]:
# Set count of patients.
n_patient = 200000

# Set earliest and latest permitted years.
year_min = 2013
year_max = 2023

# Set maximum permitted count of appointments per quarter.
max_appts_per_qtr = 10

# Set `quarters` constant.
quarters = list(range(1,4+1))

# Make data.
for i_patient in range(n_patient):
    # Randomly set patient's earliest and latest year.
    pt_year_min = random.randint(year_min, year_max)
    pt_year_max = random.randint(pt_year_min, year_max)
    
    # Create patient's timeline of years and quarters.
    pt_yearline = list(range(pt_year_min, pt_year_max+1))
    pt_timeline = []
    for i_year in range(len(pt_yearline)):
        for i_quarter in range(len(quarters)):
            pt_timeline.append( ( i_patient+1, pt_yearline[i_year],
                                 quarters[i_quarter], random.randint(0, max_appts_per_qtr+1) ) )

    # Make or append to `bq_countAppointmentsPerQuarter` and `bq_countDNAsPerQuarter` pandas.DataFrames.
    # Note: countDNAsPerQuarter must be less than countAppointmentsPerQuarter.
    if i_patient == 0:
        bq_countAppointmentsPerQuarter = \
            pandas.DataFrame(pt_timeline, columns = ['person_id', 'year_appointment',
                                                     'quarter_appointment', 'countAppointmentsPerQuarter'])

        bq_countDNAsPerQuarter = \
            pandas.DataFrame(pt_timeline, columns = ['person_id', 'year_DNA', 'quarter_DNA', 'countDNAsPerQuarter'])
        for i_row in range(len(bq_countDNAsPerQuarter)):
            bq_countDNAsPerQuarter.countDNAsPerQuarter[i_row] = random.randint(0, bq_countDNAsPerQuarter.countDNAsPerQuarter[i_row])
    else:
        bq_countAppointmentsPerQuarter = \
            bq_countAppointmentsPerQuarter.append(
                pandas.DataFrame(pt_timeline, columns = ['person_id', 'year_appointment',
                                                         'quarter_appointment', 'countAppointmentsPerQuarter'])
            )
        
        temp_bq_countDNAsPerQuarter = \
            pandas.DataFrame(pt_timeline, columns = ['person_id', 'year_DNA', 'quarter_DNA', 'countDNAsPerQuarter'])
        for i_row in range(len(temp_bq_countDNAsPerQuarter)):
            temp_bq_countDNAsPerQuarter.countDNAsPerQuarter[i_row] = random.randint(0, temp_bq_countDNAsPerQuarter.countDNAsPerQuarter[i_row])
        bq_countDNAsPerQuarter = bq_countDNAsPerQuarter.append(temp_bq_countDNAsPerQuarter)

### Run problem code.

The code cell below is lifted directly from UNSEEN_create_feature_sets.ipynb.

In [4]:
########################
# The FOR loop version.#
########################

# Set iterator.
ls_pids = list(set(numpy.concatenate((bq_countAppointmentsPerQuarter.person_id.unique(), bq_countDNAsPerQuarter.person_id.unique()))))
ls_pids.sort()

# Set storage.
ls_entropyBased_fs_appts = [['person_id', 'activeInformation', 'entropyRate', 'spectralEntropy', 'sampleEntropy', 'eoe', 'averageEntropy', 'bubbleEntropy']]
ls_entropyBased_fs_DNAs = [['person_id', 'activeInformation', 'entropyRate', 'spectralEntropy', 'sampleEntropy', 'eoe', 'averageEntropy', 'bubbleEntropy']]

# Set batch size.
batch_size = 20000
# ****************************
# I have to add the following block of code to cope with the Jupyter kernel intermittently crashing.
# The code block below checks if `pid_processed` already exists in store, and removes the person IDs
# that have already been processed from the `ls_pids` list.
#
# Check if `pid_processed` exists.
try:
    if len(pid_processed) > 0:
        # The following code only runs if `pid_processed` exists and is loaded.
        ls_pids = list(set(ls_pids).difference(set(pid_processed)))
        print("\nSome person_id values have already been processed. The `ls_pid` iterator will be shortened accordingly.\n")
    else:
        # If `pid_processed` does not exist, then I am starting from scratch
        pid_processed = []
        print("\nNo person_id values have already been processed. All person_id value will be processed.\n")
except:
    # If `pid_processed` does not exist, then I am starting from scratch
    pid_processed = []
    print("\nNo person_id values have already been processed. All person_id value will be processed.\n")
# ****************************

# Check if there are any patient records that still need to be processed.
if len(ls_pids) != 0:
    # Set timer.
    t1 = time.time()
    # Set counter for interim storage.
    store_counter = 1
    # Set counter for storage batch.
    try:
        if len(entropyBasedFS) >= batch_size:
            store_batch_num = 1
        else:
            store_batch_num = 0
    except:
        store_batch_num = 0
    
    # Do the loop.
    for pid in tqdm.tqdm(ls_pids, unit = ' patients'):
        # Extract this particular patient's range of active years.
        pt_years = \
            bq_countAppointmentsPerQuarter.loc[bq_countAppointmentsPerQuarter.person_id == pid, 'year_appointment'].append(
             bq_countDNAsPerQuarter.loc[bq_countDNAsPerQuarter.person_id == pid, 'year_DNA'])

        pt_years_lsrange =  pandas.DataFrame(
            data = { 'year' : list( range( min(pt_years), max(pt_years) ) ) }
            )
        # Create a timeline of years and quarters for this particular patient.
        pt_quarters = pandas.DataFrame( data = {'qtr': [1,2,3,4]} )
        pt_timeline = pt_years_lsrange.merge(pt_quarters, how = 'cross')

        # Join the patient's actual count of appointments-per-quarter-per-year to their timeline.
        pt_appts = bq_countAppointmentsPerQuarter.loc[bq_countAppointmentsPerQuarter.person_id == pid, :]
        pt_timeline_appts = \
            pandas.merge(pt_timeline, pt_appts, how = 'left',
                         left_on = ['year', 'qtr'],
                         right_on = ['year_appointment',
                                     'quarter_appointment']).loc[:,'countAppointmentsPerQuarter'].fillna(0).astype(int)

        # Repeat for did-not-attend events.
        pt_DNAs = bq_countDNAsPerQuarter.loc[bq_countDNAsPerQuarter.person_id == pid, :]
        pt_timeline_DNAs = \
            pandas.merge(pt_timeline, pt_DNAs, how = 'left',
                         left_on = ['year', 'qtr'],
                         right_on = ['year_DNA',
                                     'quarter_DNA']).loc[:,'countDNAsPerQuarter'].fillna(0).astype(int)

        # Create the entropy-based feature sets.
        pt_entropyStats_appts = chaoticlifeentropyfs(pt_timeline_appts)
        ls_entropyBased_fs_appts.append([pid] + pt_entropyStats_appts)
        pt_entropyStats_DNAs = chaoticlifeentropyfs(pt_timeline_DNAs)
        ls_entropyBased_fs_DNAs.append([pid] + pt_entropyStats_DNAs)
        pid_processed.append(pid)
        
        # Check if store threshold has been reached.
        if store_counter == batch_size:
            # If it has been reached, then add to the interim store.
            #
            if store_batch_num == 0:
                entropyBasedFS = pandas.DataFrame(ls_entropyBased_fs_appts[1:], columns = ls_entropyBased_fs_appts[0])
                entropyBasedFS_DNAs = pandas.DataFrame(ls_entropyBased_fs_DNAs[1:], columns = ls_entropyBased_fs_DNAs[0])
                entropyBasedFS = entropyBasedFS.merge(entropyBasedFS_DNAs
                                                      ,how = 'outer'
                                                      ,on = 'person_id')
                entropyBasedFS.set_axis(
                    ['person_id', 'activeInformationAppts', 'entropyRateAppts', 'spectralEntropyAppts', 'sampleEntroptAppts'
                     , 'eoeAppts', 'averageEntropyAppts', 'bubbleEntropyAppts', 'activeInformationDNAs', 'entropyRateDNAs'
                     ,'spectralEntropyDNAs', 'sampleEntropyDNAs', 'eoeDNAs', 'averageEntropyDNAs', 'bubbleEntropyDNAs']
                    ,axis = 1
                    ,inplace = True
                )
            else:
                # Define appendages
                appendage = pandas.DataFrame(ls_entropyBased_fs_appts[1:], columns = ls_entropyBased_fs_appts[0])
                appendage_DNAs = pandas.DataFrame(ls_entropyBased_fs_DNAs[1:], columns = ls_entropyBased_fs_DNAs[0])
                appendage = appendage.merge(appendage_DNAs, how = 'outer', on = 'person_id')

                # Append.
                entropyBasedFS = entropyBasedFS.append(appendage)
            
            # Store the storage dataframe.
            %store entropyBasedFS pid_processed
            
            # Reset the temporary storage.
            ls_entropyBased_fs_appts = [['person_id', 'activeInformation', 'entropyRate', 'spectralEntropy', 'sampleEntropy', 'eoe', 'averageEntropy', 'bubbleEntropy']]
            ls_entropyBased_fs_DNAs = [['person_id', 'activeInformation', 'entropyRate', 'spectralEntropy', 'sampleEntropy', 'eoe', 'averageEntropy', 'bubbleEntropy']]
            
            # Reset `store_counter`.
            store_counter = 1
            
            # Update `store_batch_num`.
            store_batch_num += 1
        else:
            # Update `store_counter`.
            store_counter += 1
    
    print(f'It took {time.time() - t1} to process.')
else:
    print("No person_id values remain in `ls_pids`\n")

# Finall update to storage.
#
# Define appendages
appendage = pandas.DataFrame(ls_entropyBased_fs_appts[1:], columns = ls_entropyBased_fs_appts[0])
appendage_DNAs = pandas.DataFrame(ls_entropyBased_fs_DNAs[1:], columns = ls_entropyBased_fs_DNAs[0])
appendage = appendage.merge(appendage_DNAs, how = 'outer', on = 'person_id')

# Append.
try:
    entropyBasedFS.append(appendage)
except:
    entropyBasedFS = pandas.DataFrame(ls_entropyBased_fs_appts[1:], columns = ls_entropyBased_fs_appts[0])
    entropyBasedFS_DNAs = pandas.DataFrame(ls_entropyBased_fs_DNAs[1:], columns = ls_entropyBased_fs_DNAs[0])
    entropyBasedFS = entropyBasedFS.merge(entropyBasedFS_DNAs
                                          ,how = 'outer'
                                          ,on = 'person_id')
    entropyBasedFS.set_axis(
        ['person_id', 'activeInformationAppts', 'entropyRateAppts', 'spectralEntropyAppts', 'sampleEntroptAppts'
         , 'eoeAppts', 'averageEntropyAppts', 'bubbleEntropyAppts', 'activeInformationDNAs', 'entropyRateDNAs'
         ,'spectralEntropyDNAs', 'sampleEntropyDNAs', 'eoeDNAs', 'averageEntropyDNAs', 'bubbleEntropyDNAs']
        ,axis = 1
        ,inplace = True
    )
%store entropyBasedFS pid_processed


No person_id values have already been processed. All person_id value will be processed.



 10%|█         | 20001/200000 [08:24<2:15:39, 22.12 patients/s]

Stored 'entropyBasedFS' (DataFrame)
Stored 'pid_processed' (list)


 20%|█▉        | 39998/200000 [16:52<1:04:32, 41.32 patients/s]

Stored 'entropyBasedFS' (DataFrame)


 20%|██        | 40008/200000 [16:52<2:24:11, 18.49 patients/s]

Stored 'pid_processed' (list)


 30%|██▉       | 59997/200000 [25:16<59:52, 38.97 patients/s]  

Stored 'entropyBasedFS' (DataFrame)


 30%|███       | 60005/200000 [25:17<2:58:09, 13.10 patients/s]

Stored 'pid_processed' (list)


 40%|███▉      | 79999/200000 [33:44<50:30, 39.60 patients/s]  

Stored 'entropyBasedFS' (DataFrame)


 40%|████      | 80007/200000 [33:45<3:01:52, 11.00 patients/s]

Stored 'pid_processed' (list)


 50%|████▉     | 99996/200000 [42:12<39:21, 42.34 patients/s]  

Stored 'entropyBasedFS' (DataFrame)


 50%|█████     | 100005/200000 [42:14<2:41:00, 10.35 patients/s]

Stored 'pid_processed' (list)


 60%|█████▉    | 119999/200000 [50:39<34:01, 39.18 patients/s]  

Stored 'entropyBasedFS' (DataFrame)


 60%|██████    | 120007/200000 [50:41<2:38:41,  8.40 patients/s]

Stored 'pid_processed' (list)


 70%|██████▉   | 139997/200000 [59:07<25:10, 39.73 patients/s]  

Stored 'entropyBasedFS' (DataFrame)


 70%|███████   | 140006/200000 [59:09<2:12:43,  7.53 patients/s]

Stored 'pid_processed' (list)


 80%|███████▉  | 159998/200000 [1:07:36<17:12, 38.74 patients/s]

Stored 'entropyBasedFS' (DataFrame)


 80%|████████  | 160006/200000 [1:07:39<1:44:46,  6.36 patients/s]

Stored 'pid_processed' (list)


 90%|████████▉ | 179998/200000 [1:16:06<09:10, 36.32 patients/s]  

Stored 'entropyBasedFS' (DataFrame)


 90%|█████████ | 180007/200000 [1:16:10<53:51,  6.19 patients/s]  

Stored 'pid_processed' (list)


100%|█████████▉| 199998/200000 [1:24:35<00:00, 40.18 patients/s]

Stored 'entropyBasedFS' (DataFrame)


100%|██████████| 200000/200000 [1:24:38<00:00, 39.38 patients/s]

Stored 'pid_processed' (list)
It took 5078.947779417038 to process.





Stored 'entropyBasedFS' (DataFrame)
Stored 'pid_processed' (list)


In [5]:
entropyBasedFS

Unnamed: 0,person_id,activeInformationAppts,entropyRateAppts,spectralEntropyAppts,sampleEntroptAppts,eoeAppts,averageEntropyAppts,bubbleEntropyAppts,activeInformationDNAs,entropyRateDNAs,...,eoe_x,averageEntropy_x,bubbleEntropy_x,activeInformation_y,entropyRate_y,spectralEntropy_y,sampleEntropy_y,eoe_y,averageEntropy_y,bubbleEntropy_y
0,1,,,,,,,,,,...,,,,,,,,,,
1,2,1.500000,0.0,,,,,,0.811278,0.0,...,,,,,,,,,,
2,3,,,,,,,,,,...,,,,,,,,,,
3,4,,,,,,,,,,...,,,,,,,,,,
4,5,2.155639,0.0,0.646113,,,,0.220703,1.750000,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,199996,,,,,,,,,,...,,,,2.000000,0.0,,,,,
19996,199997,,,,,,,,,,...,,,0.423972,2.606465,0.0,0.603324,,,,0.234901
19997,199998,,,,,,,,,,...,,,,,,,,,,
19998,199999,,,,,,,,,,...,,,,,,,,,,
