## Exploratory Data Analysis
* This exploration is performed in addition to preprocessing steps such as handling missing data, scaling, and encoding.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.feature_selection import r_regression, f_regression
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib.gridspec import GridSpec
from scipy.stats import f
from sklearn import datasets, linear_model
from sklearn.metrics import r2_score
from sklearn.preprocessing import QuantileTransformer
import statsmodels.api as sm
from scipy import stats
from scipy.stats import boxcox, yeojohnson
import os

# Folder that receives the PNG figures saved later in the notebook.
directory = 'data/graphs'
# exist_ok=True creates the folder (and any missing parents) when needed and is
# a no-op when it already exists, so the cell is safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(directory, exist_ok=True)

In [None]:
# Read the neighbourhood summary CSV into the DataFrame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer="data/ModifiedSummaryByNeighborhood.csv")

In [None]:
# Show the skewness of each column as the cell output.
df.skew()

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

#### Calculate feature correlations and display a correlation heat-map

In [None]:
from IPython.display import HTML, display

# Inject CSS so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Pairwise Pearson correlations between the columns of df.
sns.set(font_scale=1)
corr = df.corr(method='pearson')

# Annotated heat-map on a fixed [-1, 1] colour scale, drawn on a new 16x12 figure.
pearson_heatmap_kwargs = dict(cmap='RdBu_r', annot=True, vmin=-1, vmax=1)
plt.figure(figsize=(16, 12))
sns_plot = sns.heatmap(corr, **pearson_heatmap_kwargs);


In [None]:
# Pairwise Spearman (rank) correlations between the columns of df.
sns.set(font_scale=1)
corr = df.corr(method='spearman')

# Same presentation as the Pearson heat-map: annotated cells, colour scale fixed to [-1, 1].
spearman_heatmap_kwargs = dict(cmap='RdBu_r', annot=True, vmin=-1, vmax=1)
plt.figure(figsize=(16, 12))
sns_plot = sns.heatmap(corr, **spearman_heatmap_kwargs);

In [None]:
# For every column of df, draw a three-panel figure and save it as a PNG:
#   top row     - scatter plot of the column against er_visits
#   bottom-left - histogram of the column (16 bins)
#   bottom-right- horizontal box plot of the column
columns = df.columns
for x_value in columns:
    t_value = x_value.title()
    fig = plt.figure(constrained_layout=True)
    gs = GridSpec(2, 2, figure=fig)

    # Each seaborn call is given its axes explicitly instead of relying on
    # whichever axes happens to be "current".
    ax1 = fig.add_subplot(gs[0, :])
    sns.scatterplot(data=df, x=x_value, y='er_visits', ax=ax1)
    ax2 = fig.add_subplot(gs[1, 0])
    sns.histplot(data=df, x=x_value, bins=16, ax=ax2)
    ax3 = fig.add_subplot(gs[1, 1])
    sns.boxplot(data=df, x=x_value, orient='h', ax=ax3)

    fig.suptitle(t_value)
    # Save into the folder created in the setup cell rather than a second
    # hard-coded copy of its path.
    fig.savefig(os.path.join(directory, f'{x_value}.png'), format='png')

    # Render the figure now, then release it. Without this, one figure per
    # column stays open until the cell finishes, which grows memory and makes
    # matplotlib warn once more than 20 figures are open.
    plt.show()
    plt.close(fig)
    print('\n')

In [None]:
# Rank the columns by the absolute value of their correlation with er_visits,
# strongest first. The er_visits entry itself is removed because it is the
# column's correlation with itself.
# Notes on the form used here:
#   * .abs() is the vectorised equivalent of apply(lambda x: abs(x));
#   * a Series has a single axis, so no axis argument is passed to drop();
#   * the chain builds a new Series instead of mutating one in place.
abs_corr = (
    df.corr()['er_visits']
    .abs()
    .sort_values(ascending=False)
    .drop(index='er_visits')
)

In [None]:
# Display the sorted absolute correlations with er_visits.
abs_corr

In [None]:
# Skewness of the two columns that are plotted and transformed in the cells below.
violations_skew, electrical_skew = (
    df[name].skew() for name in ('violations', 'electrical')
)

#### Build visualizations of features

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

# One entry per panel: (axes, column, bar colour, title, x-axis label, skew annotation).
panels = [
    (ax[0], 'violations', 'skyblue',
     'Distribution of Violations (Positive Skew)', 'Violations',
     f'Skew: {violations_skew:.2f}'),
    (ax[1], 'electrical', 'salmon',
     'Distribution of Electrical Complaints (Positive Skew)', 'Electrical Complaints',
     f'Skew: {electrical_skew:.2f}'),
]

for panel_ax, column, colour, title, xlabel, skew_label in panels:
    # Histogram with a KDE overlay for this column.
    sns.histplot(df[column], kde=True, ax=panel_ax, color=colour)
    panel_ax.set_title(title, fontsize=16)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_ylabel('Frequency')
    # The skew value is written at the centre of the panel (axes coordinates).
    panel_ax.text(0.5, 0.5, skew_label, transform=panel_ax.transAxes,
                  horizontalalignment='center', color='black', weight='bold', fontsize=14)

plt.tight_layout()
plt.show()

#### Data Transformations
- Parametric tests expect a normal distribution
- Transformations can reduce the impact of outliers on models 

In [None]:
# Natural-log transform of both columns, stored as new log_* columns so the
# original values remain available.
df['log_violations'] = np.log(df['violations'])
df['log_electrical'] = np.log(df['electrical'])

# Report the skewness of each transformed column.
print(f"Violations Skewness after Log Transformation: {df['log_violations'].skew():.5f}")
print(f"Electrical Skewness after Log Transformation: {df['log_electrical'].skew():.5f}")

* Moderate positive skew can be transformed by applying a square root to each data point

In [None]:
# Square-root transform of both columns, stored as new sqrt_* columns.
df['sqrt_violations'] = np.sqrt(df['violations'])
df['sqrt_electrical'] = np.sqrt(df['electrical'])

# Report the skewness of each transformed column.
print(f"Violations Skewness after Square Root Transformation: {df['sqrt_violations'].skew():.5f}")
print(f"Electrical Skewness after Square Root Transformation: {df['sqrt_electrical'].skew():.5f}")

In [None]:
# Applying Box-Cox Transformation
# boxcox() returns the transformed values together with the lambda it fitted.
# The lambda is kept under a descriptive name instead of being assigned to `_`:
# in IPython `_` holds the previous cell output, and overwriting it stops that
# variable from being updated. Keeping the lambda also allows the same
# transform to be inverted or re-applied later.
df['bc_violations'], bc_violations_lambda = boxcox(df['violations'])
print(f"Violations Skewness after Box-Cox Transformation: {df['bc_violations'].skew():.5f}")
df['bc_electrical'], bc_electrical_lambda = boxcox(df['electrical'])
print(f"Electrical Skewness after Box-Cox Transformation: {df['bc_electrical'].skew():.5f}")

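* Note: the transformations above have restricted domains: log and Box-Cox require strictly positive data, and square root requires non-negative data.
The Yeo-Johnson and quantile transformations can also handle zero and negative data.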

In [None]:
# Applying Yeo-Johnson Transformation
# yeojohnson() returns the transformed values together with the lambda it
# fitted. The lambda is kept under a descriptive name instead of being assigned
# to `_`: in IPython `_` holds the previous cell output, and overwriting it
# stops that variable from being updated.
df['yj_violations'], yj_violations_lambda = yeojohnson(df['violations'])
print(f"Violations Skewness after Yeo-Johnson Transformation: {df['yj_violations'].skew():.5f}")
df['yj_electrical'], yj_electrical_lambda = yeojohnson(df['electrical'])
print(f"Electrical Skewness after Yeo-Johnson Transformation: {df['yj_electrical'].skew():.5f}")

In [None]:
# Quantile transformation: map each column onto a normal distribution.
# A separate transformer is fitted per column, with one quantile per row.
n_rows = len(df)

qv_transformer = QuantileTransformer(n_quantiles=n_rows, output_distribution="normal", random_state=0)
violations_2d = df[['violations']].to_numpy()  # single-column 2-D array for fit_transform
df['q_violations'] = qv_transformer.fit_transform(violations_2d).ravel()
print(f"Violations Skewness after Quantile Transformation: {df['q_violations'].skew():.5f}")

qe_transformer = QuantileTransformer(n_quantiles=n_rows, output_distribution="normal", random_state=0)
electrical_2d = df[['electrical']].to_numpy()
df['q_electrical'] = qe_transformer.fit_transform(electrical_2d).ravel()
print(f"Electrical Skewness after Quantile Transformation: {df['q_electrical'].skew():.5f}")

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

# One entry per panel: (axes, column, bar colour, title, x-axis label, skew annotation).
panels = [
    (ax[0], 'q_violations', 'skyblue',
     'Distribution of Violations (After Quantile Transform)', 'Violations',
     f"Skew: {df['q_violations'].skew():.5f}"),
    (ax[1], 'q_electrical', 'salmon',
     'Distribution of Electrical Complaints (After Quantile Transform)', 'Electrical Complaints',
     f"Skew: {df['q_electrical'].skew():.5f}"),
]

for panel_ax, column, colour, title, xlabel, skew_label in panels:
    # Histogram with a KDE overlay for the quantile-transformed column.
    sns.histplot(df[column], kde=True, ax=panel_ax, color=colour)
    panel_ax.set_title(title, fontsize=16)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_ylabel('Frequency')
    # The skew value is written at the centre of the panel (axes coordinates).
    panel_ax.text(0.5, 0.5, skew_label, transform=panel_ax.transAxes,
                  horizontalalignment='center', color='black', weight='bold', fontsize=14)

plt.tight_layout()
plt.show()

* For negatively skewed data, try the squared (raise each data point to the power of 2), cubed, Box-Cox, Yeo-Johnson, and quantile transformations