In [None]:
from models.databank_analysis import get_fresh_baseball_df, get_numerical_df

# Load the full baseball dataframe, then derive its numerical-column subset.
# Both helpers come from models.databank_analysis; their internals are not
# shown in this notebook.
bb_df = get_fresh_baseball_df()
num_df = get_numerical_df(bb_df)

# Preview the first 25 rows of the numerical subset
num_df.head(n=25)

In [None]:
import pandas as pd
from helpers.analysis_utils import get_correlation_analysis_dict

# Analyse an independent copy of the numerical frame with the 'bb_key'
# column removed, so num_df itself is left untouched.
df = (
    num_df
    .drop(columns=['bb_key'])
    .copy()
)

# Correlation statistics of each field against the 'allstar_flag' column.
# Later cells read element [0] of every value as the coefficient and
# element [1] as the p-value.
corr_dict = get_correlation_analysis_dict(df, 'allstar_flag')

# Dump the raw result, one "field: value" line per entry
for field, stats in corr_dict.items():
    print(f'{field}: {stats}')

In [None]:
# Tabulate the correlation results: one row per field, with the coefficient
# (element 0) and the p-value (element 1) each rounded to 4 decimal places.
corr_df = pd.DataFrame({
    'Baseball Databank Field': list(corr_dict),
    'Correlation Coefficient': [round(stats[0], 4) for stats in corr_dict.values()],
    'P-Value': [round(stats[1], 4) for stats in corr_dict.values()],
})

# Preview the first 10 rows
corr_df.head(10)

In [None]:
import numpy as np

# Add derived columns to corr_df: two significance flags and a descriptive
# magnitude label. All thresholds are applied to the values as stored in
# corr_df (the earlier cell rounds them to 4 decimal places).
corr_coef = corr_df['Correlation Coefficient']
corr_magnitude = corr_coef.abs()
corr_is_positive = corr_coef > 0.0

# The comparisons already produce booleans, so they are stored directly
corr_df['Statistically Significant'] = corr_df['P-Value'] < 0.01
corr_df['Practically Significant'] = corr_magnitude >= 0.3

# (exclusive upper bound on |coefficient|, label if positive, label if negative)
magnitude_bands = [
    (0.3, 'Low (Positive)', 'Low (Negative)'),
    (0.6, 'Moderate (Positive)', 'Moderate (Negative)'),
    (0.9, 'High (Positive)', 'High (Negative)'),
    (1.0, 'Very High (Positive)', 'Very High (Negative)'),
]

# np.select takes the first matching condition, so the exact 0 and 1 cases
# are listed ahead of the bands, and the bands run from narrowest to widest.
condlist = [corr_magnitude == 0.0, corr_magnitude == 1.0]
condlist += [corr_magnitude < upper for upper, _, _ in magnitude_bands]

choicelist = ['No Correlation', 'Perfect Correlation']
choicelist += [
    np.where(corr_is_positive, positive_label, negative_label)
    for _, positive_label, negative_label in magnitude_bands
]

# Rows matching no condition (e.g. a NaN coefficient) receive pd.NA
corr_df['Magnitude'] = np.select(condlist, choicelist, default=pd.NA)

corr_df.head(10)

In [None]:
# Get the 20 fields with the largest absolute correlation coefficient.
# The ranking key is attached to a temporary copy via assign(), so corr_df
# itself is never mutated: no helper column to add and drop in place, and
# nothing left behind if the cell is interrupted part-way through.
top20 = (
    corr_df
    .assign(abs_corr=corr_df['Correlation Coefficient'].abs())
    .nlargest(20, 'abs_corr')['Baseball Databank Field']
    .tolist()
)

# The target column must be part of the selection used for the heatmap.
# Guard against listing it twice: if the correlation helper also reports
# 'allstar_flag' itself (not verifiable from this notebook), an unconditional
# append would make df[top20] return a duplicated column.
if 'allstar_flag' not in top20:
    top20.append('allstar_flag')

print(top20)

In [None]:
from helpers.analysis_utils import get_correlation_matrix

# Draw the correlation matrix / heatmap, restricted to the columns named in
# top20 (the highest-ranked fields plus the 'allstar_flag' target).
get_correlation_matrix(
    plot_title='Correlation Matrix: MLB All-Star Roster Analysis | Baseball Databank',
    dataframe=df.loc[:, top20],
)