In [1]:
import pandas as pd

literature = pd.read_csv('./literature_cleaned.csv')
columns = literature.columns
print(columns)

Index(['Unnamed: 0', 'Link', 'Code Available?', 'Public Dataset',
       'Private Dataset', 'Multiple Datasets?',
       'Dataset Distribution included?', 'Sample size for Evaluation ',
       'Hyperparameters Reported?',
       'Method for tuning hyperparameters reported?',
       'Evaluation metric reported (RMSE / MAPE, MAE)',
       'Include all reported prediction horizon (like, future 30, future 60 ...)',
       'Sampling horizon', 'Baseline included?', 'Variance Reported?',
       'other lifestyle factors applied to train model',
       'Best reported result (prediction horizon) 30 is the primary',
       'Best reported metric', 'Best reported prediction horizon',
       'model type ', 'Link.1', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23',
       'Unnamed: 24', 'Unnamed: 25'],
      dtype='object')


In [2]:
# Specify the column name
column_name = 'Code Available?'

# Print the count of different types of values in the specified column
value_counts = literature[column_name].value_counts()
print(value_counts)

Code Available?
No     42
Yes    11
Name: count, dtype: int64


In [3]:
11/53

0.20754716981132076

In [4]:
# Count the number of rows with "OhioT1DM" in the "Public Dataset" column
ohio_count = literature[literature['Public Dataset'].str.contains('OhioT1DM', na=False)].shape[0]

# Count the number of rows without "OhioT1DM" in the "Public Dataset" column
not_ohio_count = literature[~literature['Public Dataset'].str.contains('OhioT1DM', na=False)].shape[0]

print(f"Number of rows with 'OhioT1DM': {ohio_count}")
print(f"Number of rows without 'OhioT1DM': {not_ohio_count}")

Number of rows with 'OhioT1DM': 29
Number of rows without 'OhioT1DM': 31


In [5]:
# Specify the column name
column_name = 'Public Dataset'

# Print the count of different types of values in the specified column
value_counts = literature[column_name].value_counts()
print(value_counts)

Public Dataset
OhioT1DM                                18
OhioT1DM                                 7
DirecNet                                 4
OhioT1DM, ShanghaiT1DM, ShanghaiT2DM     2
RT-CGM                                   2
UCI ML Repository                        1
OhioT1DM, Maastricht Study               1
ShanghaiT1DM                             1
Yes, also SUCH, but earlier data         1
DirectNet, AI4PG                         1
MIMIC-III                                1
ShanghaiT1DM, ShanghaiT2DM               1
Custom Open-Source                       1
OhioT1DM, DCLP3, DCLP5, RT-CGM           1
D1NAMO                                   1
GEM-GDM                                  1
Name: count, dtype: int64


In [6]:
# Assuming your DataFrame is named df
na_count = literature["Public Dataset"].isna().sum()
print(f"Number of NaN rows in 'Public Dataset': {na_count}")

Number of NaN rows in 'Public Dataset': 16


In [7]:
# public dataset
44/53

0.8301886792452831

In [8]:
# Specify the column name
column_name = 'Multiple Datasets?'

# Print the count of different types of values in the specified column
value_counts = literature[column_name].value_counts()
print(value_counts)

Multiple Datasets?
No     35
Yes    18
Name: count, dtype: int64


In [13]:
# Specify the column name
column_name = 'Hyperparameters Reported?'

# Print the count of different types of values in the specified column
value_counts = literature[column_name].value_counts()
print(value_counts)

Hyperparameters Reported?
Yes       35
Partly    10
No         7
Yes        1
Name: count, dtype: int64


In [14]:
# Specify the column name
column_name = 'Method for tuning hyperparameters reported?'

# Print the count of different types of values in the specified column
value_counts = literature[column_name].value_counts()
print(value_counts)

Method for tuning hyperparameters reported?
No                                         22
Yes                                        15
Grid Search                                 8
Grid search                                 4
Hyperband Tuner                             1
Lipschitz order index, OBS, Grid Search     1
Tabu search                                 1
Trial and Error                             1
IPSO Algorithm                              1
Name: count, dtype: int64


In [15]:
# Parameter tuning reported ratio
(53-22)/53

0.5849056603773585

In [9]:
# Specify the column name
column_name = 'Best reported prediction horizon'

# Print the count of different types of values in the specified column
value_counts = literature[column_name].value_counts()
print(value_counts)

Best reported prediction horizon
30          44
60           2
120          1
15           1
30 (APE)     1
NAN          1
120 (R)      1
Name: count, dtype: int64


In [10]:
# 30 prediction horizon reported ratio
44/53

0.8301886792452831

In [18]:
49/53 # Only 4 paper didn't report the RMSE

0.9245283018867925

In [19]:
literature["Best reported prediction horizon"]

0           30
1           30
2           30
3           30
4           30
5          NaN
6           30
7           30
8           30
9           30
10         NaN
11          30
12         NaN
13          30
14          30
15          30
16          30
17          30
18          30
19          30
20          30
21         NaN
22          30
23          30
24          30
25          60
26          30
27          30
28         120
29          15
30          30
31          30
32         NaN
33          30
34          30
35         NaN
36          30
37          30
38          30
39          60
40          30
41         NaN
42          30
43          30
44    30 (APE)
45          30
46          30
47          30
48          30
49          30
50          30
51         NAN
52         NaN
53          30
54          30
55          30
56         NaN
57          30
58          30
59     120 (R)
Name: Best reported prediction horizon, dtype: object

In [20]:
import pandas as pd

# Assuming your DataFrame is named df
# Filter rows where "Best reported prediction horizon" equals 30
# Convert "Best reported prediction horizon" to integer format
# Convert "Best reported prediction horizon" to numeric format, coercing errors to NaN
literature["Best reported prediction horizon"] = pd.to_numeric(
    literature["Best reported prediction horizon"], errors="coerce"
)

# Handle NaN values (e.g., drop rows with NaN in this column)
literature = literature.dropna(subset=["Best reported prediction horizon"])

# Convert the column to integer format
literature["Best reported prediction horizon"] = literature["Best reported prediction horizon"].astype(int)

filtered_df = literature[literature["Best reported prediction horizon"] == 30]

# Define bins and labels for the ranges
bins = [0, 5, 10, 15, 20, 25, 30]
labels = ["0-5", "5-10", "10-15", "15-20", "20-25", "25-30"]

# Categorize "Best reported metric" into ranges
filtered_df["Metric Range"] = pd.cut(filtered_df["Best reported metric"], bins=bins, labels=labels, right=False)

# Calculate count and ratio for each range
range_counts = filtered_df["Metric Range"].value_counts(sort=False)
range_ratios = range_counts / range_counts.sum()

# Display the results
print("Counts:")
print(range_counts)
print("\nRatios:")
print(range_ratios)

Counts:
Metric Range
0-5       1
5-10      6
10-15     2
15-20    26
20-25     8
25-30     1
Name: count, dtype: int64

Ratios:
Metric Range
0-5      0.022727
5-10     0.136364
10-15    0.045455
15-20    0.590909
20-25    0.181818
25-30    0.022727
Name: count, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["Metric Range"] = pd.cut(filtered_df["Best reported metric"], bins=bins, labels=labels, right=False)
