# Esophageal Cancer Research - Data Preprocessing
* By Sangwon Baek
* Samsung Medical Center
* August 3rd, 2023

### Import necessary packages and read data

In [1]:
# Import necessary modules (standard library, then third-party, then project code)
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# A notebook kernel executes cells as `__main__` with no parent package, so the
# relative form `from ..src.data_preprocessing import ...` fails with
# "ImportError: attempted relative import with no known parent package".
# Instead, put the project root on sys.path and import the module absolutely.
# NOTE(review): assumes the kernel's working directory is this notebook's folder
# and that `src/` sits next to it (one level up), which is also what the
# "../data/..." path used later in this notebook relies on - confirm layout.
PROJECT_ROOT = Path.cwd().resolve().parent
if str(PROJECT_ROOT) not in sys.path:
    # Guarded so re-running the cell does not keep prepending the same entry.
    sys.path.insert(0, str(PROJECT_ROOT))

from src.data_preprocessing import main as preprocess_data

In [2]:
# Run the preprocessing entry point (the project's data_preprocessing `main`,
# imported in the first cell under the alias `preprocess_data`).
# It is called with no arguments and its return value is not captured.
# NOTE(review): the next cell reads ../data/preprocessed/ECA_Dataset.csv, so this
# call presumably writes that file - confirm against the data_preprocessing module.
preprocess_data()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2663 entries, 0 to 2662
Columns: 136 entries, n_No to pStage7 
dtypes: datetime64[ns](1), float64(119), object(16)
memory usage: 2.8+ MB


In [3]:
# Load the dataset from the preprocessed-data folder (path is relative to the
# notebook's working directory).
df = pd.read_csv("../data/preprocessed/ECA_Dataset.csv")

# Display basic information.
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the summary.
df.info()

# Generate descriptive statistics. Left as the cell's last expression so the
# notebook renders the resulting frame with its rich table display.
df.describe()

In [4]:
# Summarise missing data per column: absolute null count and its share of all
# rows (in percent), ordered from the most to the least incomplete column.
missing_values = df.isna().sum()
missing_percent = missing_values / len(df) * 100
missing_df = pd.DataFrame(
    {
        'Missing Values': missing_values,
        'Percentage': missing_percent,
    }
).sort_values('Missing Values', ascending=False)
print(missing_df)

In [5]:
# Histogram overview of the first ten float64 columns.
# The dtype filter is applied once; `numerical_cols` keeps the selected column
# labels because the boxplot cell further down iterates over the same names.
_float_frame = df.select_dtypes(include=['float64'])
numerical_cols = _float_frame.columns[:10]

_float_frame[numerical_cols].hist(figsize=(20, 15), bins=30)
plt.tight_layout()
plt.show()

In [6]:
# Create boxplots for numerical columns, one panel per entry of `numerical_cols`.
# The grid is derived from the number of columns instead of a fixed 5x2 layout,
# so the cell keeps working if the column selection made earlier changes size
# (a fixed grid raises ValueError as soon as there are more columns than panels).
n_grid_cols = 2
n_grid_rows = max(1, (len(numerical_cols) + n_grid_cols - 1) // n_grid_cols)

# 3 inches of height per row reproduces the original 20x15 figure for ten columns.
# squeeze=False guarantees a 2-D axes array even when there is a single row.
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(20, 3 * n_grid_rows), squeeze=False
)
panels = axes.ravel()

for ax, col in zip(panels, numerical_cols):
    # Pass the target axes explicitly rather than relying on the pyplot
    # "current axes" state.
    df.boxplot(column=col, ax=ax)

# Hide panels left over when the column count does not fill the grid.
for ax in panels[len(numerical_cols):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()