In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor


In [5]:
df = pd.read_csv('Covid19_&_Controls.csv')


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 841 entries, 0 to 840
Columns: 105 entries, Study_Name to Combined_Zscores_Aguilaniu_LLN
dtypes: float64(61), int64(3), object(41)
memory usage: 690.0+ KB


In [7]:
object_cols=df.select_dtypes('object')
for cols in object_cols:
    print(df[cols].value_counts())
    print()

Study_Name
Nunez-Fernandez et al. (2021) J Clin Med https://doi.org/10.3390/jcm    190
Agostoni et al. (2024) Respir Res 25:82                                 140
Lytzen et al. (2024) Exp Physiol                                        129
Barisione & Brusasco (2021) Physiol Rep. 2021 Feb;9(4):e14748.          121
Magini et al. (2015) Eur J Prev Cardiol                                  50
Dal Negro et al. (2022) MultiDisc. Resp. Med 17:875                      48
Barisione & Brusasco (2023) ERJ Open Res. 2023 Apr 17;9(2):00363-202     44
Sese et al. (2022) Revue des Maladies Respiratoires Actualités           42
Imeri G. et al. (2024) MultiDisc. Respir Medicine 19: 938                32
Seccombe et al. (2023) Physiol Rep 11(7) e15660                          21
Sese et al. (2022) ERJ 60 supplement 66. 2662 [abstract]                 14
Thomas Gille Unpublished                                                 10
Name: count, dtype: int64

Study
Nunez-Fernandez M, et al.  J Clin Med 2021;1

In [8]:
#label encoding two variable obj feats
two_val_obj_cols = [
   col for col in object_cols
   if df[col].nunique()<=2
]

In [9]:
lb = LabelEncoder()
for col in two_val_obj_cols:
    df[col]=lb.fit_transform(df[col])

In [10]:
#still need to encode objects
rem_obj_cols=df.select_dtypes('object')
for col in rem_obj_cols:
    print(df[col].value_counts())
    print()


Study_Name
Nunez-Fernandez et al. (2021) J Clin Med https://doi.org/10.3390/jcm    190
Agostoni et al. (2024) Respir Res 25:82                                 140
Lytzen et al. (2024) Exp Physiol                                        129
Barisione & Brusasco (2021) Physiol Rep. 2021 Feb;9(4):e14748.          121
Magini et al. (2015) Eur J Prev Cardiol                                  50
Dal Negro et al. (2022) MultiDisc. Resp. Med 17:875                      48
Barisione & Brusasco (2023) ERJ Open Res. 2023 Apr 17;9(2):00363-202     44
Sese et al. (2022) Revue des Maladies Respiratoires Actualités           42
Imeri G. et al. (2024) MultiDisc. Respir Medicine 19: 938                32
Seccombe et al. (2023) Physiol Rep 11(7) e15660                          21
Sese et al. (2022) ERJ 60 supplement 66. 2662 [abstract]                 14
Thomas Gille Unpublished                                                 10
Name: count, dtype: int64

Study
Nunez-Fernandez M, et al.  J Clin Med 2021;1

In [11]:
df = df.drop('Study_Name', axis=1)
df = df.drop(['Study','Study_Location'], axis=1)


In [12]:



bmi_order = [
    "Underweight (BMI < 18.5)",
    "Normal (BMI (18.5-24-9)",
    "Overweight (BMI 24.9-29.9)",
    "Obese 1 (BMI 30-34.9)",
    "Obese 2 (BMI 35-39.9)",
    "Obese 3 (BMI 40.0+)"
]

zscore_order = [
    "Z-scores more < -5.00",
    "Z-scores between -5.00 and -3.51",
    "Z-scores between -3.50 and -1.646",
    "Z-scores between -1.645 and +1.645",
    "Z-scores > +1.645"
]

days_order = [
    "no covid",
    "17 to 90 days post covid",
    "91 to 180 days post covid",
    "181 to 360 days post covid",
    "370 to 579 days post covid"
]

age_order = [
    "10-19",
    "20-29",
    "30-39",
    "40-49",
    "50-59",
    "60-69",
    "70-79",
    "80-90"
]


encoder = OrdinalEncoder(categories=[
    bmi_order,
    zscore_order,
    zscore_order,
    days_order,
    age_order
])


df[[
    'BMI_Categories_encoded',
    'DLNO_Zscore_Classification_encoded',
    'DLCO_Zscore_Classification_encoded',
    'Days_BTW_Group_encoded',
    'Age_Group_encoded'
]] = encoder.fit_transform(
    df[[
        'BMI_Categories',
        'DLNO_Zscore_Classification',
        'DLCO_Zscore_Classification',
        'Days_BTW_Group',
        'Age_Group'
    ]]
)







In [13]:
mat = df.select_dtypes(include=['number']).corr()


In [14]:
df.dtypes.value_counts()

float64    66
int64      36
object      5
Name: count, dtype: int64

In [15]:
df.select_dtypes('object')

Unnamed: 0,Days_BTW_Group,Age_Group,BMI_Categories,DLNO_Zscore_Classification,DLCO_Zscore_Classification
0,no covid,20-29,Normal (BMI (18.5-24-9),Z-scores between -1.645 and +1.645,Z-scores between -1.645 and +1.645
1,91 to 180 days post covid,20-29,Underweight (BMI < 18.5),Z-scores between -3.50 and -1.646,Z-scores between -3.50 and -1.646
2,91 to 180 days post covid,20-29,Underweight (BMI < 18.5),Z-scores between -3.50 and -1.646,Z-scores between -3.50 and -1.646
3,91 to 180 days post covid,20-29,Normal (BMI (18.5-24-9),Z-scores between -5.00 and -3.51,Z-scores between -5.00 and -3.51
4,91 to 180 days post covid,20-29,Normal (BMI (18.5-24-9),Z-scores between -3.50 and -1.646,Z-scores between -5.00 and -3.51
...,...,...,...,...,...
836,91 to 180 days post covid,70-79,Overweight (BMI 24.9-29.9),Z-scores between -3.50 and -1.646,Z-scores between -3.50 and -1.646
837,91 to 180 days post covid,70-79,Overweight (BMI 24.9-29.9),Z-scores between -3.50 and -1.646,Z-scores between -3.50 and -1.646
838,17 to 90 days post covid,70-79,Overweight (BMI 24.9-29.9),Z-scores between -1.645 and +1.645,Z-scores between -1.645 and +1.645
839,91 to 180 days post covid,80-90,Overweight (BMI 24.9-29.9),Z-scores between -3.50 and -1.646,Z-scores between -3.50 and -1.646


In [16]:
df = df.drop(df[[
        'BMI_Categories',
        'DLNO_Zscore_Classification',
        'DLCO_Zscore_Classification',
        'Days_BTW_Group',
        'Age_Group'
    ]], axis=1)

In [17]:
#pd.set_option('display.max_columns', None)
X = df.drop(['DLNODLCOratio_Hb', 'DLNODLCOratio_Sealevel', 'DLNODLCOratio_Sealevel_Hb', 'DLNO', 'DLCO', 'DLNODLCOratio', 'SubjectID'], axis=1)
y = df['DLNODLCOratio']

In [18]:

pd.set_option('display.max_rows', None)
df.dtypes

SubjectID                                int64
Age                                    float64
Weight                                 float64
Height                                 float64
BMI                                    float64
Days_BTW                                 int64
Disease                                  int64
Sex                                      int64
Machine                                  int64
Ethnicity                                int64
TLC_LLN                                  int64
Obstruction                              int64
Spirometry_Restriction                   int64
Mixed                                    int64
Obstruction_or_Restriction_or_Mixed      int64
Any_issue                                int64
BHT                                    float64
Altitude                                 int64
pB                                     float64
Hb                                     float64
FEV1                                   float64
FVC          

In [19]:



scaler = StandardScaler()
df[['Days_BTW', 'Altitude']] = scaler.fit_transform(df[['Days_BTW', 'Altitude']])

float_cols = df.select_dtypes(include=['float64']).columns
float_cols = float_cols.drop(['DLNODLCOratio_Hb', 'DLNODLCOratio_Sealevel', 
                              'DLNODLCOratio_Sealevel_Hb', 'DLNO', 'DLCO', 
                              'DLNODLCOratio', 'SubjectID'], errors='ignore')

df[float_cols] = scaler.fit_transform(df[float_cols])


In [20]:
df

Unnamed: 0,SubjectID,Age,Weight,Height,BMI,Days_BTW,Disease,Sex,Machine,Ethnicity,...,Combined_Zscores_GAMLSS_LLN,Combined_Zscores_ERS_LLN,Combined_Zscores_Munkholm_LLN,Combined_Zscores_Zavorsky_LLN,Combined_Zscores_Aguilaniu_LLN,BMI_Categories_encoded,DLNO_Zscore_Classification_encoded,DLCO_Zscore_Classification_encoded,Days_BTW_Group_encoded,Age_Group_encoded
0,3,-2.485939,-0.725444,0.395837,-0.98735,-1.158842,0,0,0,0,...,0,0,0,0,0,-1.056673,0.652977,0.659076,-1.589759,-2.32159
1,4,-2.267654,-1.875878,-0.741996,-1.808271,0.196057,1,0,0,0,...,1,0,1,1,1,-2.039707,-1.047189,-1.025675,0.317498,-2.32159
2,5,-2.267654,-1.818356,-0.638556,-1.779429,-0.173461,1,0,0,0,...,1,0,1,1,1,-2.039707,-1.047189,-1.025675,0.317498,-2.32159
3,6,-2.122131,-1.415704,-0.12136,-1.518933,-0.296633,1,0,0,0,...,1,1,1,1,1,-1.056673,-2.747356,-2.710427,0.317498,-2.32159
4,7,-2.04937,-1.530747,-0.01792,-1.689478,-0.35822,1,0,0,0,...,1,1,1,1,1,-1.056673,-1.047189,-2.710427,0.317498,-2.32159
5,8,-1.758324,0.597555,0.499276,0.358443,-0.235047,1,0,0,0,...,1,1,1,1,1,-0.07364,-1.047189,-1.025675,0.317498,-1.609274
6,9,-1.612801,-1.588269,-1.56951,-1.071723,-1.158842,0,0,0,0,...,0,0,0,0,0,-1.056673,0.652977,0.659076,-1.589759,-1.609274
7,10,-1.540039,-0.437835,-1.362632,0.332643,-1.158842,0,0,0,0,...,0,0,0,0,0,-0.07364,0.652977,0.659076,-1.589759,-1.609274
8,11,-1.321755,-1.473226,-0.948874,-1.224335,-0.050288,1,0,0,0,...,0,0,0,0,0,-1.056673,0.652977,0.659076,0.317498,-1.609274
9,12,-0.957947,0.137382,-0.12136,0.235009,0.134471,1,0,0,0,...,0,0,0,0,0,-0.07364,0.652977,0.659076,0.317498,-0.896959


In [21]:
model = RandomForestRegressor()


rfe = RFE(estimator=model, n_features_to_select=25)


rfe.fit(X, y)


selected_features = X.columns[rfe.support_]  
ranking = rfe.ranking_                       

print("Selected Features:", list(selected_features))


Selected Features: ['Age', 'Weight', 'BMI', 'Days_BTW', 'Machine', 'BHT', 'Altitude', 'pB', 'Hb', 'FEV1_FVC_Ratio', 'TLC', 'FVC_minus_TLC', 'VA_minus_FVC', 'VA_minus_TLC', 'DLCO_Hb_Sealevel', 'KCO', 'KCO_Hb', 'KCO_Sealevel', 'KCO_Hb_Sealevel', 'KNO', 'FEV1_Zscores', 'DLCO_Zscores_Aguilaniu', 'DLCO_Zscores_GLI', 'DLNO_Zscores_GAMLSS', 'DLNO_Zscores_Munkholm']


In [28]:
new_df = df[selected_features]
new_df['DLNODLCOratio'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['DLNODLCOratio'] = y


In [30]:
new_df.to_csv("Covid-19vsControls_randomregressor.csv", index=False)

0      6.07
1      5.51
2      5.81
3      4.96
4      6.26
5      5.40
6      5.13
7      5.23
8      5.57
9      5.23
10     6.18
11     6.29
12     5.62
13     5.37
14     5.28
15     5.29
16     5.59
17     5.41
18     5.93
19     5.34
20     5.65
21     5.77
22     5.06
23     5.40
24     5.32
25     5.55
26     5.68
27     5.29
28     5.07
29     5.21
30     5.01
31     5.24
32     5.16
33     5.55
34     5.08
35     5.48
36     5.69
37     5.03
38     6.75
39     5.45
40     6.87
41     5.45
42     4.84
43     5.64
44     5.82
45     6.52
46     6.90
47     5.56
48     3.77
49     4.42
50     3.63
51     4.40
52     3.98
53     4.83
54     4.40
55     4.99
56     4.30
57     4.43
58     3.98
59     4.14
60     4.41
61     4.05
62     4.16
63     3.87
64     4.61
65     4.02
66     4.15
67     4.19
68     3.62
69     3.87
70     4.48
71     3.50
72     3.83
73     4.29
74     4.11
75     4.14
76     3.89
77     4.10
78     4.37
79     4.48
80     3.97
81     3.77
82     4.27
83  

Unnamed: 0,SubjectID,Age,Weight,Height,BMI,Days_BTW,Disease,Sex,Machine,Ethnicity,...,Combined_Zscores_GAMLSS_LLN,Combined_Zscores_ERS_LLN,Combined_Zscores_Munkholm_LLN,Combined_Zscores_Zavorsky_LLN,Combined_Zscores_Aguilaniu_LLN,BMI_Categories_encoded,DLNO_Zscore_Classification_encoded,DLCO_Zscore_Classification_encoded,Days_BTW_Group_encoded,Age_Group_encoded
0,3,-2.485939,-0.725444,0.395837,-0.98735,-1.158842,0,0,0,0,...,0,0,0,0,0,-1.056673,0.652977,0.659076,-1.589759,-2.32159
1,4,-2.267654,-1.875878,-0.741996,-1.808271,0.196057,1,0,0,0,...,1,0,1,1,1,-2.039707,-1.047189,-1.025675,0.317498,-2.32159
2,5,-2.267654,-1.818356,-0.638556,-1.779429,-0.173461,1,0,0,0,...,1,0,1,1,1,-2.039707,-1.047189,-1.025675,0.317498,-2.32159
3,6,-2.122131,-1.415704,-0.12136,-1.518933,-0.296633,1,0,0,0,...,1,1,1,1,1,-1.056673,-2.747356,-2.710427,0.317498,-2.32159
4,7,-2.04937,-1.530747,-0.01792,-1.689478,-0.35822,1,0,0,0,...,1,1,1,1,1,-1.056673,-1.047189,-2.710427,0.317498,-2.32159
5,8,-1.758324,0.597555,0.499276,0.358443,-0.235047,1,0,0,0,...,1,1,1,1,1,-0.07364,-1.047189,-1.025675,0.317498,-1.609274
6,9,-1.612801,-1.588269,-1.56951,-1.071723,-1.158842,0,0,0,0,...,0,0,0,0,0,-1.056673,0.652977,0.659076,-1.589759,-1.609274
7,10,-1.540039,-0.437835,-1.362632,0.332643,-1.158842,0,0,0,0,...,0,0,0,0,0,-0.07364,0.652977,0.659076,-1.589759,-1.609274
8,11,-1.321755,-1.473226,-0.948874,-1.224335,-0.050288,1,0,0,0,...,0,0,0,0,0,-1.056673,0.652977,0.659076,0.317498,-1.609274
9,12,-0.957947,0.137382,-0.12136,0.235009,0.134471,1,0,0,0,...,0,0,0,0,0,-0.07364,0.652977,0.659076,0.317498,-0.896959
