# Data-preprocessing

## 1. temporarily fill the missing data using mean value
Using mean or median temporarily fills missing values, so normalization can be applied without errors.

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler

data = pd.read_csv('./TrainDataset2024.csv')
missing_mask = data.select_dtypes(include=[float, int]) == 999
temp_imputer = SimpleImputer(missing_values=999, strategy='mean')
data_temp_filled = temp_imputer.fit_transform(data.select_dtypes(include=[float, int]))

## 2. Normalize the data
Normalizing the data before using KNN is crucial because KNN relies on distance metrics (such as Euclidean distance) to identify the nearest neighbors.

And I choose to use MinMaxScaler to do normalizing to scale features to a \[0, 1\] range

In [2]:
scaler = MinMaxScaler()
data_normalized = scaler.fit_transform(data_temp_filled)

## 3. Apply KNN imputer on normalized data
The KNN imputer now works on normalized data, allowing distances to be computed meaningfully across all features.

In [3]:
data_for_knn = data_normalized.copy()
data_for_knn[missing_mask] = 999
knn_imputer = KNNImputer(missing_values=999, n_neighbors=6)
knn_imputer.fit(data_normalized)
data_imputed = knn_imputer.transform(data_for_knn)

## 4. Convert back to DataFrame and reverse normalization
Because the data should be trained at the original data, and the test input will be original scale. So I have to reverse the normalization.

In [4]:
data_imputed_df = pd.DataFrame(scaler.inverse_transform(data_imputed),
                               columns=data.select_dtypes(include=[float, int]).columns)
data_imputed_df.to_csv('./before_round.csv', index=False)
data_imputed_df

Unnamed: 0,pCR (outcome),RelapseFreeSurvival (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,1.0,144.000000,41.0,0.0,0.0,0.0,1.0,3.0,3.0,1.0,...,0.517172,0.375126,3.325332,0.002314,3.880772e+06,473.464852,0.000768,0.182615,0.030508,0.000758
1,0.0,142.000000,39.0,1.0,1.0,0.0,0.0,3.0,3.0,1.0,...,0.444391,0.444391,3.032144,0.005612,2.372010e+06,59.459710,0.004383,0.032012,0.001006,0.003685
2,1.0,135.000000,31.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,...,0.534549,0.534549,2.485848,0.006752,1.540027e+06,33.935384,0.007584,0.024062,0.000529,0.006447
3,0.0,12.000000,35.0,0.0,0.0,0.0,1.0,3.0,3.0,1.0,...,0.506185,0.506185,2.606255,0.003755,6.936741e+06,46.859265,0.005424,0.013707,0.000178,0.004543
4,0.0,109.000000,61.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,...,0.462282,0.462282,2.809279,0.006521,1.265399e+06,39.621023,0.006585,0.034148,0.001083,0.005626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,0.0,54.500000,58.5,1.0,0.0,1.0,0.0,3.0,2.0,1.0,...,0.476493,0.476493,2.453583,0.003229,2.327038e+06,18.562377,0.013766,0.018042,0.000288,0.012257
396,0.0,49.250000,34.3,0.0,0.0,0.0,1.0,3.0,3.0,1.0,...,0.418382,0.418382,2.995603,0.004243,1.005061e+06,156.627179,0.002228,0.136015,0.022148,0.002098
397,0.0,48.500000,53.3,0.0,0.0,0.0,1.0,2.0,1.0,1.0,...,0.527779,0.527778,1.500000,0.003728,2.132007e+05,0.996746,0.252582,0.007380,0.000037,0.231059
398,0.0,47.500000,68.8,1.0,0.0,0.0,0.0,3.0,3.0,1.0,...,0.313693,0.313693,3.573557,0.001112,2.008034e+07,204.864200,0.001372,0.054063,0.003697,0.001368


## 5. Round the knn result to 0 or 1 for categorical data
Because some of the data are categorical, so we have to use either 0 or 1. So I rounded the float number
### 5.1 find all the categorical data

In [5]:
categorical_cols = [col for col in data.columns if data[col].dropna().isin([0, 1, 2, 3, 4, 5, 6, 999]).all()]
print("Categorical columns:", categorical_cols)

Categorical columns: ['pCR (outcome)', 'ER', 'PgR', 'HER2', 'TrippleNegative', 'ChemoGrade', 'Proliferation', 'HistologyType', 'LNStatus', 'TumourStage', 'Gene']


### 5.2 round the data

In [6]:
for col in categorical_cols:
    data_imputed_df[col] = data_imputed_df[col].round().astype(int)
# data_imputed_df
data_imputed_df.to_csv("./output.csv", index=False)