In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

In [2]:
file_path = "project_1_train.csv"  
data = pd.read_csv('project_1_train.csv', usecols=['NR', 'Gender', 'Age', 'Race1', 'Race3', 'Education', 'MaritalStatus', 'HHIncome', 'HHIncomeMid', 'Poverty', 'HomeRooms', 'HomeOwn', 'Work', 'Weight', 'Height', 'BMI', 'BMICatUnder20yrs', 'BMI_WHO', 'Pulse', 'BPSysAve', 'BPDiaAve', 'BPSys1', 'BPDia1', 'BPSys2', 'BPSys2', 'BPSys3', 'BPDia3', 'Testosterone', 'UrineVol1', 'UrineFlow1', 'Diabetes', 'HealthGen', 'DaysPhysHlthBad', 'DaysMentHlthBad', 'Depressed', 'SleepHrsNight', 'SleepTrouble', 'PhysActive', 'PhysActiveDays', 'TVHrsDay', 'CompHrsDay', 'Alcohol12PlusYr', 'AlcoholDay', 'AlcoholYear', 'SmokeNow', 'Smoke100', 'Smoke100n', 'HardDrugs', 'SexEver', 'SexAge', 'SexNumPartnLife', 'SameSex', 'DirectChol'
])

In [3]:
missing_rows = data.isnull().any(axis=1).sum()
print(f"There are a total of {missing_rows} rows with missing data in the dataset.")

There are a total of 8000 rows with missing data in the dataset.


In [4]:
numerical_columns = data.select_dtypes(include=[np.number]).columns.to_list()
numerical_data = data[numerical_columns]


In [5]:
knn_imputer = KNNImputer(n_neighbors=5)
imputed_numerical_data = knn_imputer.fit_transform(numerical_data)
imputed_numerical_df = pd.DataFrame(imputed_numerical_data, columns=numerical_columns)

In [6]:
categorical_columns = data.select_dtypes(exclude=[np.number]).columns.to_list()
categorical_data = data[categorical_columns]

In [7]:
for col in categorical_columns:
    mode = categorical_data[col].mode()[0]  
    categorical_data[col].fillna(mode, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  categorical_data[col].fillna(mode, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categorical_data[col].fillna(mode, inplace=True)


In [8]:
imputed_data = pd.concat([imputed_numerical_df, categorical_data], axis=1)

In [9]:
print(f"Are there still missing data? {imputed_data.isnull().any().any()}")

Are there still missing data? False


In [10]:
imputed_data.to_csv("project_1_train_imputed.csv", index=False)
print("The dataset with filled missing values has been saved to the file 'project_1_train_imputed.csv'.")

The dataset with filled missing values has been saved to the file 'project_1_train_imputed.csv'.


In [16]:
desired_columns = [
    "NR", "Gender", "Age", "Race1", "Race3", 
    "Education", "MaritalStatus", "HHIncome", "HHIncomeMid", "Poverty", 
    "HomeRooms", "HomeOwn", "Work", "Weight", "Height", "BMI", 
    "BMICatUnder20yrs", "BMI_WHO", "Pulse", "BPSysAve", "BPDiaAve",
    "BPSys1", "BPDia1", "BPSys2", "BPSys2", "BPSys3", "BPDia3", 
    "Testosterone", "UrineVol1", "UrineFlow1", "Diabetes",
    "HealthGen", "DaysPhysHlthBad", "DaysMentHlthBad", "Depressed", 
    "SleepHrsNight", "SleepTrouble", "PhysActive", "PhysActiveDays", 
    "TVHrsDay", "CompHrsDay", "Alcohol12PlusYr", "AlcoholDay", 
    "AlcoholYear", "SmokeNow", "Smoke100", "Smoke100n", "HardDrugs", 
    "SexEver", "SexAge", "SexNumPartnLife", "SameSex" , "DirectChol"

]

In [17]:
if all(col in imputed_data.columns for col in desired_columns):
    imputed_data = imputed_data[desired_columns]
    print("Successfully completed!")
else:
    missing_cols = [col for col in desired_columns if col not in imputed_data.columns]
    print(f"The following columns were not found in the dataset: {missing_cols}")


Successfully completed!


In [18]:
imputed_data.to_csv("project_1_train_imputed_ordered.csv", index=False)
print("The dataset with adjusted column order has been saved to the file 'project_1_train_imputed_ordered.csv'.")

The dataset with adjusted column order has been saved to the file 'project_1_train_imputed_ordered.csv'.
