In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor


In [5]:
df = pd.read_csv('Covid19_&_Controls.csv')


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 841 entries, 0 to 840
Columns: 105 entries, Study_Name to Combined_Zscores_Aguilaniu_LLN
dtypes: float64(61), int64(3), object(41)
memory usage: 690.0+ KB


In [7]:
object_cols=df.select_dtypes('object')
for cols in object_cols:
    print(df[cols].value_counts())
    print()

Study_Name
Nunez-Fernandez et al. (2021) J Clin Med https://doi.org/10.3390/jcm    190
Agostoni et al. (2024) Respir Res 25:82                                 140
Lytzen et al. (2024) Exp Physiol                                        129
Barisione & Brusasco (2021) Physiol Rep. 2021 Feb;9(4):e14748.          121
Magini et al. (2015) Eur J Prev Cardiol                                  50
Dal Negro et al. (2022) MultiDisc. Resp. Med 17:875                      48
Barisione & Brusasco (2023) ERJ Open Res. 2023 Apr 17;9(2):00363-202     44
Sese et al. (2022) Revue des Maladies Respiratoires Actualités           42
Imeri G. et al. (2024) MultiDisc. Respir Medicine 19: 938                32
Seccombe et al. (2023) Physiol Rep 11(7) e15660                          21
Sese et al. (2022) ERJ 60 supplement 66. 2662 [abstract]                 14
Thomas Gille Unpublished                                                 10
Name: count, dtype: int64

Study
Nunez-Fernandez M, et al.  J Clin Med 2021;1

In [8]:
#label encoding two variable obj feats
two_val_obj_cols = [
   col for col in object_cols
   if df[col].nunique()<=2
]

In [9]:
lb = LabelEncoder()
for col in two_val_obj_cols:
    df[col]=lb.fit_transform(df[col])

In [10]:
#still need to encode objects
rem_obj_cols=df.select_dtypes('object')
for col in rem_obj_cols:
    print(df[col].value_counts())
    print()


Study_Name
Nunez-Fernandez et al. (2021) J Clin Med https://doi.org/10.3390/jcm    190
Agostoni et al. (2024) Respir Res 25:82                                 140
Lytzen et al. (2024) Exp Physiol                                        129
Barisione & Brusasco (2021) Physiol Rep. 2021 Feb;9(4):e14748.          121
Magini et al. (2015) Eur J Prev Cardiol                                  50
Dal Negro et al. (2022) MultiDisc. Resp. Med 17:875                      48
Barisione & Brusasco (2023) ERJ Open Res. 2023 Apr 17;9(2):00363-202     44
Sese et al. (2022) Revue des Maladies Respiratoires Actualités           42
Imeri G. et al. (2024) MultiDisc. Respir Medicine 19: 938                32
Seccombe et al. (2023) Physiol Rep 11(7) e15660                          21
Sese et al. (2022) ERJ 60 supplement 66. 2662 [abstract]                 14
Thomas Gille Unpublished                                                 10
Name: count, dtype: int64

Study
Nunez-Fernandez M, et al.  J Clin Med 2021;1

In [11]:
df = df.drop('Study_Name', axis=1)
df = df.drop(['Study','Study_Location'], axis=1)


In [12]:



bmi_order = [
    "Underweight (BMI < 18.5)",
    "Normal (BMI (18.5-24-9)",
    "Overweight (BMI 24.9-29.9)",
    "Obese 1 (BMI 30-34.9)",
    "Obese 2 (BMI 35-39.9)",
    "Obese 3 (BMI 40.0+)"
]

zscore_order = [
    "Z-scores more < -5.00",
    "Z-scores between -5.00 and -3.51",
    "Z-scores between -3.50 and -1.646",
    "Z-scores between -1.645 and +1.645",
    "Z-scores > +1.645"
]

days_order = [
    "no covid",
    "17 to 90 days post covid",
    "91 to 180 days post covid",
    "181 to 360 days post covid",
    "370 to 579 days post covid"
]

age_order = [
    "10-19",
    "20-29",
    "30-39",
    "40-49",
    "50-59",
    "60-69",
    "70-79",
    "80-90"
]


encoder = OrdinalEncoder(categories=[
    bmi_order,
    zscore_order,
    zscore_order,
    days_order,
    age_order
])


df[[
    'BMI_Categories_encoded',
    'DLNO_Zscore_Classification_encoded',
    'DLCO_Zscore_Classification_encoded',
    'Days_BTW_Group_encoded',
    'Age_Group_encoded'
]] = encoder.fit_transform(
    df[[
        'BMI_Categories',
        'DLNO_Zscore_Classification',
        'DLCO_Zscore_Classification',
        'Days_BTW_Group',
        'Age_Group'
    ]]
)







In [13]:
mat = df.select_dtypes(include=['number']).corr()


In [14]:
df.dtypes.value_counts()

float64    66
int64      36
object      5
Name: count, dtype: int64

In [15]:
df.select_dtypes('object')

Unnamed: 0,Days_BTW_Group,Age_Group,BMI_Categories,DLNO_Zscore_Classification,DLCO_Zscore_Classification
0,no covid,20-29,Normal (BMI (18.5-24-9),Z-scores between -1.645 and +1.645,Z-scores between -1.645 and +1.645
1,91 to 180 days post covid,20-29,Underweight (BMI < 18.5),Z-scores between -3.50 and -1.646,Z-scores between -3.50 and -1.646
2,91 to 180 days post covid,20-29,Underweight (BMI < 18.5),Z-scores between -3.50 and -1.646,Z-scores between -3.50 and -1.646
3,91 to 180 days post covid,20-29,Normal (BMI (18.5-24-9),Z-scores between -5.00 and -3.51,Z-scores between -5.00 and -3.51
4,91 to 180 days post covid,20-29,Normal (BMI (18.5-24-9),Z-scores between -3.50 and -1.646,Z-scores between -5.00 and -3.51
...,...,...,...,...,...
836,91 to 180 days post covid,70-79,Overweight (BMI 24.9-29.9),Z-scores between -3.50 and -1.646,Z-scores between -3.50 and -1.646
837,91 to 180 days post covid,70-79,Overweight (BMI 24.9-29.9),Z-scores between -3.50 and -1.646,Z-scores between -3.50 and -1.646
838,17 to 90 days post covid,70-79,Overweight (BMI 24.9-29.9),Z-scores between -1.645 and +1.645,Z-scores between -1.645 and +1.645
839,91 to 180 days post covid,80-90,Overweight (BMI 24.9-29.9),Z-scores between -3.50 and -1.646,Z-scores between -3.50 and -1.646


In [16]:
df = df.drop(df[[
        'BMI_Categories',
        'DLNO_Zscore_Classification',
        'DLCO_Zscore_Classification',
        'Days_BTW_Group',
        'Age_Group'
    ]], axis=1)

In [17]:
#pd.set_option('display.max_columns', None)
X = df.drop(['DLNODLCOratio_Hb', 'DLNODLCOratio_Sealevel', 'DLNODLCOratio_Sealevel_Hb', 'DLNO', 'DLCO', 'DLNODLCOratio', 'SubjectID'], axis=1)
y = df['DLNODLCOratio']

In [18]:

pd.set_option('display.max_rows', None)
df.dtypes

SubjectID                                int64
Age                                    float64
Weight                                 float64
Height                                 float64
BMI                                    float64
Days_BTW                                 int64
Disease                                  int64
Sex                                      int64
Machine                                  int64
Ethnicity                                int64
TLC_LLN                                  int64
Obstruction                              int64
Spirometry_Restriction                   int64
Mixed                                    int64
Obstruction_or_Restriction_or_Mixed      int64
Any_issue                                int64
BHT                                    float64
Altitude                                 int64
pB                                     float64
Hb                                     float64
FEV1                                   float64
FVC          

In [19]:



scaler = StandardScaler()
df[['Days_BTW', 'Altitude']] = scaler.fit_transform(df[['Days_BTW', 'Altitude']])

float_cols = df.select_dtypes(include=['float64']).columns
float_cols = float_cols.drop(['DLNODLCOratio_Hb', 'DLNODLCOratio_Sealevel', 
                              'DLNODLCOratio_Sealevel_Hb', 'DLNO', 'DLCO', 
                              'DLNODLCOratio', 'SubjectID'], errors='ignore')

df[float_cols] = scaler.fit_transform(df[float_cols])


In [21]:
model = RandomForestRegressor()


rfe = RFE(estimator=model, n_features_to_select=25)


rfe.fit(X, y)


selected_features = X.columns[rfe.support_]  
ranking = rfe.ranking_                       

print("Selected Features:", list(selected_features))


Selected Features: ['Age', 'Weight', 'BMI', 'Days_BTW', 'Machine', 'BHT', 'Altitude', 'pB', 'Hb', 'FEV1_FVC_Ratio', 'TLC', 'FVC_minus_TLC', 'VA_minus_FVC', 'VA_minus_TLC', 'DLCO_Hb_Sealevel', 'KCO', 'KCO_Hb', 'KCO_Sealevel', 'KCO_Hb_Sealevel', 'KNO', 'FEV1_Zscores', 'DLCO_Zscores_Aguilaniu', 'DLCO_Zscores_GLI', 'DLNO_Zscores_GAMLSS', 'DLNO_Zscores_Munkholm']


In [28]:
new_df = df[selected_features]
new_df['DLNODLCOratio'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['DLNODLCOratio'] = y


In [30]:
new_df.to_csv("Covid-19vsControls_randomregressor.csv", index=False)