In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [11]:
# NOTE(review): this cell was executed as In [11], i.e. after `conditions` had
# been assigned by the `conditions, sex = encode(train_df)` cell further down.
# On a fresh top-to-bottom run the name is not defined yet at this point, so the
# cell should be moved below that cell.
conditions

array(['critical', 'mild', 'moderate', 'severe', 'terminal'], dtype=object)

In [2]:
# Load the training table; the relative path is resolved against the kernel's
# working directory.
train_df = pd.read_csv("train.csv")

In [3]:
# NOTE(review): `heavy_columns` is not referenced by any later cell visible
# here (downsize() builds its own column list) — confirm and remove if unused.
heavy_columns = train_df.select_dtypes(include = "float64")
# Convert the birthdate column with pd.to_datetime.
train_df["birthdate"] = pd.to_datetime(train_df["birthdate"])

In [4]:
def drop_columns(df):
    """Drop the identifier / free-text columns from ``df`` in place.

    Removes the ``id``, ``address``, ``mail``, ``username`` and ``name``
    columns. ``df`` itself is modified, and it is also returned so the helper
    can be used like the other preprocessing steps in this notebook
    (``df = drop_columns(df)``). Callers that ignore the return value keep
    working unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to strip; it must contain all five columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, without the dropped columns.

    Raises
    ------
    KeyError
        If any of the listed columns is missing from ``df``.
    """
    df.drop(columns=["id", "address", "mail", "username", "name"], inplace=True)
    return df

def downsize(df):
    """Cast every float64 column of ``df`` to float32, in place.

    Columns of any other dtype are left untouched. Returns the same ``df``
    object that was passed in.
    """
    wide_float_names = df.select_dtypes(include=["float64"]).columns
    for name in wide_float_names:
        narrowed = df[name].astype("float32")
        df[name] = narrowed
    return df

def extract(df):
    """Replace each object-dtype column with the first number found in it.

    For every object column, the first substring matching an optionally
    signed integer or decimal is captured from each value and the column is
    cast to ``float32``; values without a match become NaN. ``df`` is
    modified in place and returned.
    """
    number_re = r'(-?\d*\.?\d+)'
    for col in df.select_dtypes(include=["object"]).columns:
        # expand=False yields a Series (one capture group), ready to cast.
        captured = df[col].str.extract(number_re, expand=False)
        df[col] = captured.astype('float32')
    return df

def encode(df, train = True):
    """Label-encode the categorical columns of ``df`` in place.

    ``sex`` is always encoded; ``Condition`` is encoded only when ``train`` is
    true. A fresh ``LabelEncoder`` is fitted on every call, so the integer
    codes are derived from the values present in ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``sex`` column and, when ``train`` is true, a
        ``Condition`` column. The encoded columns are overwritten with
        integer codes.
    train : bool, default True
        Whether ``Condition`` should be encoded as well.

    Returns
    -------
    (conditions, sex) when ``train`` is true, otherwise just ``sex``
        Arrays of the original labels; position ``i`` holds the label that
        was encoded as integer ``i``.
    """
    conditions = None
    if train:
        cond_encoder = LabelEncoder()
        df["Condition"] = cond_encoder.fit_transform(df["Condition"])
        # classes_ lists every fitted label in code order. Asking
        # inverse_transform for a hard-coded list of codes would fail when
        # fewer labels are present and drop labels when more are present.
        conditions = cond_encoder.classes_

    sex_encoder = LabelEncoder()
    df["sex"] = sex_encoder.fit_transform(df["sex"])
    sex = sex_encoder.classes_

    if train:
        return conditions, sex
    return sex

def impute(df):
    """Fill missing values with the column mean, in place.

    Every column of ``df`` that contains at least one null is passed on its
    own through a mean-strategy ``SimpleImputer`` and written back. The
    imputer is fitted on ``df`` itself on each call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    imputer = SimpleImputer(strategy='mean')
    columns_with_nulls = df.columns[df.isnull().any()]

    # One column per fit so each column is written back independently.
    for column in columns_with_nulls:
        df[[column]] = imputer.fit_transform(df[[column]])
    return df

def preprocess(df, columns):
    """Standardize the given columns of ``df`` in place and return ``df``.

    The listed columns are rescaled with ``StandardScaler``; the scaler is
    fitted on ``df`` itself on every call. All other columns are untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the scaled values overwrite the originals.
    columns : list-like of column labels
        Columns to scale. An empty selection leaves ``df`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    columns = list(columns)
    if not columns:
        # Nothing to scale; StandardScaler rejects zero-feature input.
        return df

    scaled = StandardScaler().fit_transform(df[columns])
    # Carry df's own index on the scaled frame: the assignment below aligns on
    # index labels, so a default RangeIndex would misplace rows (or produce
    # NaN) whenever df is not indexed 0..n-1.
    df[columns] = pd.DataFrame(scaled, columns=columns, index=df.index)
    return df

In [5]:
# Encode the label columns and keep the class names they were encoded from.
conditions, sex = encode(train_df)
# drop_columns() mutates train_df in place, so this cell cannot simply be
# re-run: start again from the read_csv cell.
drop_columns(train_df)
train_df = impute(downsize(extract(train_df)))
train_df = preprocess(
    train_df,
    train_df.select_dtypes(include="float32").columns,
)

In [6]:
# Inspect column dtypes and non-null counts after preprocessing.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Data columns (total 53 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   sex        180000 non-null  int64  
 1   birthdate  180000 non-null  float64
 2   NuuVLWVE   180000 non-null  float32
 3   EKlEdV0k   180000 non-null  float32
 4   KQ7f9qDP   180000 non-null  float32
 5   D2rRypP1   180000 non-null  float32
 6   JC6b3NlG   180000 non-null  float32
 7   m5OuvpQO   180000 non-null  float32
 8   wboRXDiy   180000 non-null  float32
 9   8nHIrONK   180000 non-null  float32
 10  ymxc3enI   180000 non-null  float32
 11  aauhNN75   180000 non-null  float32
 12  u6eGD9z2   180000 non-null  float32
 13  zNyeK64b   180000 non-null  float32
 14  4pJNIVmd   180000 non-null  float32
 15  tTiXwwrv   180000 non-null  float32
 16  eilg7pwe   180000 non-null  float32
 17  UKcSzGxW   180000 non-null  float32
 18  9cy49cuB   180000 non-null  float32
 19  LoEvGvKF   180000 non-n

In [7]:
# NOTE(review): to_csv writes the row index by default (index=True), so the
# RangeIndex ends up as an extra unnamed first column in the file. Pass
# index=False if the readers of cleaned.csv do not expect it — confirm first.
train_df.to_csv("cleaned.csv")

In [8]:
# Same preparation steps as for the training frame.
# NOTE(review): impute() and preprocess() call fit_transform on the frame they
# are given, so the means and scaling used here come from the test rows
# themselves, not from the training data.
test_df = pd.read_csv('test.csv')
test_df["birthdate"] = pd.to_datetime(test_df["birthdate"])
drop_columns(test_df)          # in place
encode(test_df, train=False)   # in place; only `sex` is encoded, the returned class names are not kept
test_df = impute(downsize(extract(test_df)))
test_df = preprocess(
    test_df,
    test_df.select_dtypes(include="float32").columns,
)

In [9]:
# NOTE(review): to_csv writes the row index by default (index=True), so the
# RangeIndex ends up as an extra unnamed first column in the file. Pass
# index=False if the readers of cleaned_test.csv do not expect it — confirm first.
test_df.to_csv("cleaned_test.csv")

In [10]:
# Inspect column dtypes and non-null counts after preprocessing.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 52 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sex        20000 non-null  int64  
 1   birthdate  20000 non-null  float64
 2   NuuVLWVE   20000 non-null  float32
 3   EKlEdV0k   20000 non-null  float32
 4   KQ7f9qDP   20000 non-null  float32
 5   D2rRypP1   20000 non-null  float32
 6   JC6b3NlG   20000 non-null  float32
 7   m5OuvpQO   20000 non-null  float32
 8   wboRXDiy   20000 non-null  float32
 9   8nHIrONK   20000 non-null  float32
 10  ymxc3enI   20000 non-null  float32
 11  aauhNN75   20000 non-null  float32
 12  u6eGD9z2   20000 non-null  float32
 13  zNyeK64b   20000 non-null  float32
 14  4pJNIVmd   20000 non-null  float32
 15  tTiXwwrv   20000 non-null  float32
 16  eilg7pwe   20000 non-null  float32
 17  UKcSzGxW   20000 non-null  float32
 18  9cy49cuB   20000 non-null  float32
 19  LoEvGvKF   20000 non-null  float32
 20  YaBsho