### First steps

In [None]:
# Import needed libraries
import os
import pandas as pd
import numpy as np
import termplotlib as tpl

In [None]:
# Locations of the metadata description and of the dataset itself
meta_path, data_path = (
    '../../data/day_2/boston_housing/meta.txt',
    '../../data/day_2/boston_housing/housing_data.csv',
)

In [None]:
# Report the on-disk size of both input files (metadata first, then data)
for size_label, file_path in (
    ("Size of metadata:", meta_path),
    ("Size of data:", data_path),
):
    print(size_label, os.path.getsize(file_path), "Bytes")

In [None]:
# Read both input files into memory

# The metadata is plain text: keep the whole file as a single string
with open(meta_path, mode='r', encoding='utf-8') as meta_file:
    meta = meta_file.read()

# The dataset is parsed from CSV into a DataFrame
df = pd.read_csv(data_path)

In [None]:
# Print the metadata text exactly as it was read from the metadata file
print(meta)

In [None]:
# Show the first 5 rows of the data (head() returns 5 rows by default).
# The expression is not printed explicitly; presumably the notebook renders
# it as the cell's result.
df.head()

In [None]:
# Datatypes of the columns, one entry per column of df
print(df.dtypes)

In [None]:
# Data dimensions: df.shape is a (rows, columns) pair
row_count, column_count = df.shape
print("Number of rows:", row_count)
print("Number of columns:", column_count)

### Quality criteria

In [None]:
# Measuring completeness

# Flag every missing cell once, then aggregate the flags per column
is_missing = df.isna()

# Absolute number of missing values per column
missing_count = is_missing.sum()
print(missing_count)

# Share of missing values per column (mean of the boolean flags)
missing_rate = is_missing.mean()
print(missing_rate)

In [None]:
# Measuring uniqueness

# df.duplicated() only flags rows (True for every row that repeats an earlier
# one; the first occurrence stays False). The flags have to be aggregated to
# obtain a single figure for the whole dataset.
is_duplicate = df.duplicated()

# Absolute number of duplicated rows
duplicate_count = is_duplicate.sum()
print(duplicate_count) # Count

# Share of duplicated rows (mean of the boolean flags)
duplicate_rate = is_duplicate.mean()
print(duplicate_rate) # Fraction of all rows, between 0 and 1

### Descriptive statistics

In [None]:
# Measuring cardinality: number of distinct values per column
# (nunique() does not count missing values by default)
distinct_vals = df.nunique()
print(distinct_vals) # Count

In [None]:
# Extract columns with low cardinality and analyze their values

# Define threshold for low cardinality as a fraction of the number of rows
cardinality_threshhold = 0.66
cutoff = cardinality_threshhold * len(df)

# Use <= rather than <: the high cardinality selection keeps columns with
# strictly more distinct values than the cutoff, so a column sitting exactly
# on the cutoff would otherwise belong to neither group and never be analyzed.
low_cardinality_columns = distinct_vals[distinct_vals <= cutoff].index.tolist()
print("Selected low cardinality coluns: ", low_cardinality_columns)

# Print the value counts for each column, most frequent value first
# (value_counts() leaves out missing values by default)
for col in low_cardinality_columns:
    print("")
    print(f"Spalte: {col}")
    print(df[col].value_counts())
    print("")

In [None]:
# Extract high cardinality columns and analyze their values

# A column is high cardinality when it has more distinct values than the cutoff
exceeds_cutoff = distinct_vals > cutoff
high_cardinality_columns = distinct_vals[exceeds_cutoff].index.tolist()
print("Selected high cardinality coluns: ", high_cardinality_columns)

# Number of equal-width buckets each column's value range is divided into
num_buckets = 16

# Per column: assign every value to a bucket, then list the bucket sizes
# ordered by bucket interval
for col in high_cardinality_columns:
    print()
    print(f"Spalte: {col}")
    buckets = pd.cut(df[col], bins=num_buckets)
    bucket_counts = buckets.value_counts().sort_index()
    print(bucket_counts)
    print()

In [None]:
# Visualize the discrete distribution of RM values

# Get the RM column values as an array.
# Missing values are dropped first: np.histogram derives the bin range from
# the data and raises a ValueError when that range is not finite, which is
# the case as soon as a single NaN is present.
rm_data = df["RM"].dropna().to_numpy()

# Calculate the histogram for RM: counts per bin plus the 65 edges of the
# 64 equal-width bins
counts, bin_edges = np.histogram(rm_data, bins=64)

# Show the histogram as a text plot restricted to ASCII characters
fig = tpl.figure()
fig.hist(counts, bin_edges, grid=[15, 40], force_ascii=True)
fig.show()

In [None]:
# Show a contingency table for CHAS (rows) and RAD (columns).
# normalize="index" divides every row by its row total, so each CHAS row
# holds the share of each RAD value and sums to 1.
contingency_table = pd.crosstab(df['CHAS'], df['RAD'], normalize="index")
print(contingency_table)

In [None]:
# Show a correlation matrix of all columns except CHAS and RAD.
# columns.difference() returns the remaining labels in sorted order, so the
# matrix is ordered by column label rather than by column position in df.
# corr() is called without arguments, i.e. with the pandas default method.
corr_matrix = df[df.columns.difference(['CHAS', 'RAD'])].corr()
print(corr_matrix)

In [None]:
# Show the top five results of the correlation matrix

# Unstack correlation matrix to long format: one row per matrix cell, holding
# the column label, the row label and the correlation value
corr_long = corr_matrix.unstack().reset_index()
corr_long.columns = ['variable_1', 'variable_2', 'correlation']

# Remove self correlations (e.g., A vs A) and duplicate pairs (the matrix is
# symmetric, so A vs B also appears as B vs A) in one vectorized comparison:
# only the orientation whose first label sorts before the second is kept.
# .copy() makes the filtered result an independent DataFrame, so the column
# assignments below do not write into a slice of the unfiltered frame
# (which can trigger pandas' SettingWithCopyWarning).
is_canonical_pair = corr_long['variable_1'] < corr_long['variable_2']
corr_long = corr_long[is_canonical_pair].copy()

# Keep the sorted label pair as its own column, as before
corr_long['pairs'] = list(zip(corr_long['variable_1'], corr_long['variable_2']))

# Sort by absolute correlation value so strong negative correlations rank
# alongside strong positive ones
corr_long['abs_correlation'] = corr_long['correlation'].abs()
top5 = corr_long.sort_values(by='abs_correlation', ascending=False).head(5)

# Output (original, signed correlation values)
print(top5[['variable_1', 'variable_2', 'correlation']])