In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset (semicolon-delimited CSV)
df = pd.read_csv('/mnt/data/bank-additional-full.csv', sep=';')

# Display basic information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("Dataset Info:")
df.info()
print("\nSummary Statistics:")
print(df.describe())

# Check for missing values (per-column count of null cells).
# NOTE(review): this only detects real nulls; if the file marks missing entries
# with a placeholder string, they will not be counted here -- confirm.
print("\nMissing Values:")
print(df.isnull().sum())

# Check class distribution of the target column 'y'
print("\nTarget Variable Distribution:")
print(df['y'].value_counts())

# Bar chart of how many rows fall into each class of the target column 'y'
target_ax = sns.countplot(data=df, x='y')
target_ax.set_title('Target Variable Distribution')
plt.show()

# Plot correlation heatmap for numerical features.
# 'number' selects every numeric dtype rather than only int64/float64, so
# columns stored at other widths (e.g. int32, float32) are not silently left out.
numeric_features = df.select_dtypes(include='number')
plt.figure(figsize=(10, 8))
# annot=True writes each pairwise correlation value into its cell
sns.heatmap(numeric_features.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Plot categorical features distribution.
# The target is removed by name rather than by position: slicing off the last
# object column only works while 'y' happens to be the final one.
# errors='ignore' keeps this working if 'y' is not an object column at all.
categorical_features = df.select_dtypes(include=['object']).columns.drop('y', errors='ignore')
for col in categorical_features:
    plt.figure(figsize=(8, 4))
    # Horizontal bars, ordered by value_counts() (most frequent category first)
    sns.countplot(y=col, data=df, order=df[col].value_counts().index)
    plt.title(f'Distribution of {col}')
    plt.tight_layout()
    plt.show()

# Encode categorical variables using one-hot encoding.
# drop_first=True drops the first level of each encoded column, leaving k-1
# indicator columns per k-level categorical. No column subset is given, so the
# target 'y' is encoded too if it is an object/categorical column.
df_encoded = pd.get_dummies(df, drop_first=True)

# Display final preprocessed dataset info.
# info() prints its own report and returns None; call it directly so no
# stray "None" line follows the summary.
print("\nPreprocessed Dataset Info:")
df_encoded.info()

# Save the preprocessed data (optional)
# df_encoded.to_csv('/mnt/data/bank_preprocessed.csv', index=False)
