# 0) Import libraries and data

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the training and test splits from their Excel workbooks (train first,
# then test) and bind them to train_df / test_df in a single unpacking step.
train_df, test_df = (
    pd.read_excel(workbook)
    for workbook in ('../data/train_set.xlsx', '../data/test_set.xlsx')
)

# 1) Histograms of the features

In [None]:
# Determine the grid size. The 4x3 layout is kept as a minimum so the figure
# is unchanged for frames with up to 12 columns, but extra rows are added when
# the frame has more columns, so every column is guaranteed a subplot.
n_cols = 3
n_rows = max(4, -(-len(train_df.columns) // n_cols))  # ceiling division

# 3 inches of height per row reproduces the original 15x12 figure for 4 rows
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows))

# flatten the axes array for easy iteration
axes = axes.flatten()

# draw one 50-bin histogram per column, pairing each column with its own subplot
for ax, column in zip(axes, train_df.columns):
    train_df[column].hist(bins=50, ax=ax)
    ax.set_title(f'Histogram of {column} in the training set')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')

# hide unused subplots; slicing by the column count (instead of reusing the
# loop index) also works when the frame has no columns at all
for unused_ax in axes[len(train_df.columns):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Start from the grid (n_rows x n_cols) set up in the training-histogram cell,
# but add rows if the test frame has more columns than that grid can hold, so
# every column is guaranteed a subplot.
test_n_rows = max(n_rows, -(-len(test_df.columns) // n_cols))  # ceiling division

# 3 inches of height per row gives the 15x12 figure for a 4-row grid
fig, axes = plt.subplots(test_n_rows, n_cols, figsize=(15, 3 * test_n_rows))

# flatten the axes array for easy iteration
axes = axes.flatten()

# draw one 50-bin histogram per column, pairing each column with its own subplot
for ax, column in zip(axes, test_df.columns):
    test_df[column].hist(bins=50, ax=ax)
    ax.set_title(f'Histogram of {column} in the testing set')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')

# hide unused subplots; slicing by the column count (instead of reusing the
# loop index) also works when the frame has no columns at all
for unused_ax in axes[len(test_df.columns):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# 2) Summary statistics

In [None]:
# Compute the describe() summary of the training set, then print it after a label.
train_summary = train_df.describe()
print("Training set: ", train_summary)

In [None]:
# Compute the describe() summary of the test set, then print it after a label.
test_summary = test_df.describe()
print("Test set: ", test_summary)

# 3) Feature correlation

In [None]:
# Pairwise correlation between the columns of the training set
correlation_matrix_train = train_df.corr()

# Create the 10x8 figure with an explicit axes and draw the annotated heatmap on it
train_heatmap_fig, train_heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix_train,
    annot=True,
    cmap='coolwarm',
    ax=train_heatmap_ax,
)
train_heatmap_ax.set_title('Correlation matrix for training set', fontsize=18)
plt.show()

In [None]:
# Pairwise correlation between the columns of the test set
correlation_matrix_test = test_df.corr()

# Create the 10x8 figure with an explicit axes and draw the annotated heatmap on it
test_heatmap_fig, test_heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix_test,
    annot=True,
    cmap='coolwarm',
    ax=test_heatmap_ax,
)
test_heatmap_ax.set_title('Correlation matrix for testing set', fontsize=18)
plt.show()

# 4) Check for missing values

In [None]:
# Count the missing entries in each column of the training set and print the counts.
train_missing = train_df.isna().sum()
print(train_missing)

In [None]:
# Count the missing entries in each column of the test set and print the counts.
test_missing = test_df.isna().sum()
print(test_missing)