[Reference](https://levelup.gitconnected.com/mastering-feature-engineering-process-in-data-science-6897ba5a2d7a)

In [1]:
import pandas as pd

# Read the patient records from the CSV file into a DataFrame
file_path = 'patient_data.csv'
dataset = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top rows to see which columns are available and what they hold
dataset.head(n=5)

In [2]:
# Descriptive summary restricted to the object-dtype (text) columns
categorical_summary = dataset.describe(include=['object'])

# Descriptive summary using describe()'s default column selection
numerical_summary = dataset.describe()

# Emit both summaries, numerical first, as the cell's output
(numerical_summary, categorical_summary)

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

# Correlation coefficient between the Weight and Height columns
correlation = dataset['Weight'].corr(dataset['Height'])

# Scatter plot of Weight against Height, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=dataset, x='Height', y='Weight', ax=ax)
ax.set_title('Relationship between Weight and Height of Patients')
ax.set_xlabel('Height (cm)')
ax.set_ylabel('Weight (kg)')
ax.grid(True)

# Annotate the bottom of the figure with the coefficient, rounded to two decimals
fig.text(
    0.5,
    0.01,
    f"Correlation Coefficient: {correlation:.2f}",
    ha="center",
    fontsize=12,
    bbox={"facecolor":"orange", "alpha":0.5, "pad":5},
)
plt.show()

# Leave the raw coefficient as the cell's output value
correlation

In [4]:
# Body Mass Index: Weight divided by the square of (Height / 100), rounded to a
# whole number. The plot labels elsewhere in this notebook give Height in cm and
# Weight in kg, so Height / 100 is presumably the height in metres.
height_squared = dataset['Height'].div(100).pow(2)
dataset['BMI'] = dataset['Weight'].div(height_squared).round(0)

# Preview the frame to confirm the BMI column was added
dataset.head()

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Histogram (with a KDE overlay) of the BMI column created in the previous cell.
#
# The BMI column is intentionally NOT recomputed here: it already exists, and
# recomputing it would raise a KeyError whenever this cell is re-run after the
# Weight and Height columns have been dropped. The former blanket
# `except Exception` only masked that failure, so it has been removed; a real
# problem (e.g. a missing BMI column) now surfaces with a full traceback.
plt.figure(figsize=(10, 6))
sns.histplot(dataset['BMI'], bins=20, kde=True)
plt.title('Distribution of Body Mass Index (BMI) of Patients')
plt.xlabel('BMI')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

In [8]:
# Weight and Height are discarded from the working frame at this point
dataset = dataset.drop(columns=['Weight', 'Height'])

# Preview the frame to confirm both columns are gone
dataset.head()

In [7]:
# Derive a code column from the leading five characters of each 'Patient_ID'
dataset['Patient_Hist_Code'] = dataset['Patient_ID'].str.slice(stop=5)

# Preview the frame to confirm the new column
dataset.head()

In [9]:
# Number of distinct values in the 'Patient_Hist_Code' column
# (nunique() returns a count, not the values themselves)
unique_codes = dataset['Patient_Hist_Code'].nunique()

# Show the count as the cell output; previously it was computed but never displayed
unique_codes

In [10]:
# Defining age groups.
# With right=False every interval is closed on the left and open on the right:
# [0, 20), [20, 30), [30, 40), [40, 50), [50, inf).
# - The third edge is 40 (it was 45), so the bins match the '30–40' / '40–50' labels.
# - The last edge is open-ended (it was 100), so ages of 100 or more fall into
#   '50+' instead of becoming NaN.
bins = [0, 20, 30, 40, 50, float('inf')]
labels = ['0–20', '20–30', '30–40', '40–50', '50+']

# Using pandas cut function to create categories
dataset['Age_Group'] = pd.cut(dataset['Age'], bins=bins, labels=labels, right=False)

# Displaying the first few lines of the dataset to check the new column
dataset.head()

In [11]:
import pandas as pd
# Cross-tabulate 'Smoker' (rows) against 'Developed_Pneumonia' (columns)
contingency_table = pd.crosstab(
    index=dataset['Smoker'],
    columns=dataset['Developed_Pneumonia'],
)

# Show the resulting table of counts
contingency_table

In [12]:
from scipy.stats import chi2_contingency

# Contingency table of 'Smoker' against 'Developed_Pneumonia', computed from the
# data rather than typed in by hand, so the test stays in sync with the dataset
# if the underlying file changes.
contingency_table = pd.crosstab(dataset['Smoker'], dataset['Developed_Pneumonia'])

# Performing the Chi-square test of independence on the observed counts
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Show the test statistic and its p-value
chi2, p

In [13]:
# Defining the ranges.
# With right=True the intervals are (-1, 2], (2, 4] and (4, inf), i.e. 0–2, 3–4
# and 5 or more children. The last edge is open-ended rather than the column's
# maximum: using max() makes pd.cut raise a ValueError whenever the maximum is
# 4 or less, because the edges are then duplicated or no longer increasing.
bins = [-1, 2, 4, float('inf')]

# Defining labels for the ranges
labels = ['0–2', '3–4', '5+']

# Creating the new categorical column
dataset['Children_Range'] = pd.cut(dataset['Number_of_Children'], bins=bins, labels=labels, right=True)

# Displaying the first few lines of the dataset to check the new column
dataset.head()

In [14]:
# Columns excluded from the cleaned frame
columns_to_remove = [
    'Patient_ID',
    'Birth_Date',
    'Age',
    'Number_of_Children',
    'Patient_Hist_Code',
]

# Build the cleaned frame without those columns; `dataset` itself is left untouched
clean_dataset = dataset.drop(labels=columns_to_remove, axis='columns')

# Preview the cleaned frame
clean_dataset.head()

In [15]:
# Chi-square test of independence between each categorical feature and the
# target column 'Developed_Pneumonia'.

# Categorical columns: every object-dtype column except the target.
# The dtype is selected by name because `pd.np` was removed in pandas 2.0 and
# `np.object` in NumPy 1.24. Filtering (instead of list.remove) also avoids a
# ValueError if the target is not an object-dtype column.
categorical_columns = [
    col
    for col in clean_dataset.select_dtypes(include=['object']).columns
    if col != 'Developed_Pneumonia'
]

# List to store results (one record per tested column)
results = []

# Loop through all categorical columns
for col in categorical_columns:
    # Creating the contingency table of the feature against the target
    contingency_table = pd.crosstab(clean_dataset[col], clean_dataset['Developed_Pneumonia'])

    # Performing the chi-square test
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)

    # Adding the results to the list
    results.append({'Variable': col, 'p-value': p_value})

# Converting the results into a DataFrame for easier visualization
results_df = pd.DataFrame(results)
results_df

In [16]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Label-encode the target and one-hot encode every column in `categorical_columns`.

# Creating the encoders.
# The one-hot encoder keeps its default sparse output and the result is
# densified explicitly below: the `sparse=` keyword was renamed to
# `sparse_output=` in scikit-learn 1.2 and removed in 1.4, so passing either
# name breaks on some installed version.
le = LabelEncoder()
ohe = OneHotEncoder(drop='first')

# Encoding the target variable
clean_dataset['Developed_Pneumonia'] = le.fit_transform(clean_dataset['Developed_Pneumonia'])

# List to store the new categorical columns
new_categorical_cols = []

# Loop through all categorical columns
for col in categorical_columns:
    # Encoding the column and converting the sparse result to a dense array
    encoded_cols = ohe.fit_transform(clean_dataset[[col]]).toarray()

    # Transforming the result into a DataFrame. drop='first' removes the first
    # category, hence categories_[0][1:]. The source frame's index is reused so
    # the concatenation below aligns row-for-row even if the index is not 0..n-1.
    encoded_cols_df = pd.DataFrame(
        encoded_cols,
        columns=[f"{col}_{category}" for category in ohe.categories_[0][1:]],
        index=clean_dataset.index,
    )

    # Adding the resulting DataFrame to the list
    new_categorical_cols.append(encoded_cols_df)

# Concatenating all DataFrames from the list
new_categorical_cols_clean_dataset = pd.concat(new_categorical_cols, axis=1)

# Removing the original categorical columns from the DataFrame
clean_dataset = clean_dataset.drop(categorical_columns, axis=1)

# Adding the new encoded categorical columns
df = pd.concat([clean_dataset, new_categorical_cols_clean_dataset], axis=1)

# First few lines
df.head()

In [17]:
# Applying One-Hot Encoding to 'Age_Group' and 'Children_Range'.
# The age column is created as 'Age_Group' (capitalised) earlier in the notebook;
# the lowercase 'age_group' used here before raised a KeyError.
df = pd.get_dummies(df, columns=['Age_Group', 'Children_Range'], drop_first=True)

# Viewing the first few lines of the updated DataFrame (rich display, like the other cells)
df.head()

In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import OneHotEncoder

# Feature selection with a Random Forest: keep the features whose importance is
# at or above the median importance.

# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Apply One-Hot Encoding to categorical variables
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Separate features and target variable
X = df_encoded.drop('Developed_Pneumonia', axis=1)
y = df_encoded['Developed_Pneumonia']

# Create and train the Random Forest model.
# random_state is fixed so the importances, and therefore the selected
# features, are the same on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Select features based on importance
selector = SelectFromModel(model, threshold='median')
X_selected = selector.fit_transform(X, y)

# Get the names of selected features
selected_features = X.columns[selector.get_support()]
print("Selected Features:", selected_features)

In [19]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Select the most important features with a Random Forest and export them,
# together with the target, to 'optimized_dataset.csv'.

# Apply One-Hot Encoding and separate features and target variable
df_encoded = pd.get_dummies(df, drop_first=True)
X = df_encoded.drop('Developed_Pneumonia', axis=1)
y = df['Developed_Pneumonia']

# Train the Random Forest model and select important features.
# random_state is fixed so the selected features, and therefore the exported
# file, are identical on every run.
model = RandomForestClassifier(random_state=42).fit(X, y)
selected_features = X.columns[SelectFromModel(model, threshold='median').fit(X, y).get_support()]

# Create a new DataFrame with selected features and the target variable
df_final = df_encoded[selected_features].join(df['Developed_Pneumonia'])

# Save the new DataFrame to a CSV file (row index is not written)
df_final.to_csv('optimized_dataset.csv', index=False)
print("Optimized dataset saved as 'optimized_dataset.csv'.")