# Exploratory Data Analysis

## Configuration

In [None]:
# Run the shell command `free -m` through IPython's %system magic to see how
# much memory is available before any data is loaded.
# NOTE: this line is IPython-only syntax and will not run as plain Python.
%system free -m

In [None]:
# Import the necessary libraries

# Basic python libraries
import os
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Google cloud libraries
from google.cloud import bigquery
from google.cloud import storage

# Pandas and BigQuery
import pandas_gbq as pdg
import pandas as pd

In [None]:
# Show the current working directory; the relative "../..." paths configured
# in the next cell are resolved against it.
os.getcwd()

In [None]:
# Data and output locations, all relative to the current working directory.
# Every path ends with "/" because later cells build file names by plain
# string concatenation (e.g. figure_path + figure_name).
raw_data = "../data/raw/"
interim_data = "../data/interim/"
processed_data = "../data/processed/"  # the CSV files loaded below live here

figure_path = "../reports/figures/"  # every plot in this notebook is saved here
config = "../config/"

## Data Loading

### other_covariates

In [None]:
# Load the other_covariates table from the processed-data folder.
# Later cells read the columns person_id, ethnicity, neet_yn and
# current_activity_code from this frame.

# Set the filename for your CSV file
csv_filename = "other_covariates.csv"

# Combine the path and filename
csv_filepath = os.path.join(processed_data, csv_filename)

# Load the CSV file into a DataFrame
other_covariates_df = pd.read_csv(csv_filepath)

# Display the loaded DataFrame (last expression of the cell)
other_covariates_df

### neet_chd

In [None]:
# Load the neet_chd table from the processed-data folder.
# Later cells read the columns person_id, ever_neet_status,
# persistent_neet_status, majority_gender, snomedcode, chd_status and
# home_lsoa_in_bradford from this frame.

# Set the filename for your CSV file (re-uses the csv_filename variable)
csv_filename = "neet_chd.csv"

# Combine the path and filename
csv_filepath = os.path.join(processed_data, csv_filename)

# Load the CSV file into a DataFrame
neet_chd_df = pd.read_csv(csv_filepath)

# Display the loaded DataFrame (last expression of the cell)
neet_chd_df

## Questions on NEET

### What is the distribution of NEET in the cohort?

In [None]:
# NEET status distribution
# Share of all cohort rows that fall in each ever_neet_status value, in percent
status_counts = neet_chd_df['ever_neet_status'].value_counts()
neet_status_percentages = status_counts / len(neet_chd_df) * 100

# Draw the bars on a fresh 8x6 figure
plt.figure(figsize=(8, 6))
ax = neet_status_percentages.plot.bar(color="#66b3ff")

# Title and axis labels
ax.set_title("Distribution of NEET Status", fontsize=14)
ax.set_xlabel("NEET Status", fontsize=12)
ax.set_ylabel("Percentage", fontsize=12)

# Horizontal x tick labels; y ticks every 10 points from 0 to 100
plt.xticks(rotation=0, fontsize=10)
plt.yticks(np.arange(0, 101, 10), fontsize=10)

# Write each percentage one unit above its bar
for position, share in enumerate(neet_status_percentages):
    ax.text(position, share + 1, f'{share:.2f}%', ha="center", fontsize=8)

# Fix the y-axis to the full 0-100 percentage range
ax.set_ylim(0, 100)

# Hide the top/right spines and keep dashed grid lines on the y-axis only
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
ax.grid(axis="y", linestyle="--", alpha=0.5)

# Stop the x-axis labels from being clipped
plt.tight_layout()

# Write the figure to the figures folder
figure_name = "neet_percentage_plot.png"
full_path = f"{figure_path}{figure_name}"
plt.savefig(full_path)

# Render the figure
plt.show()

### What is the distribution of NEET in the cohort by gender?

In [None]:
# NEET status distribution by gender
#
# Bars show raw counts; the label on each bar is that bar's share of its own
# NEET status group (so the labels within one status group sum to 100%).
plt.figure(figsize=(12, 8))

# Number of rows in each NEET status group: the denominators for the labels
total_counts = neet_chd_df.groupby('ever_neet_status').size()

# Pin the x-axis order to the groupby index so that the category drawn at
# x-position i is guaranteed to be total_counts.index[i]
status_order = list(total_counts.index)

ax = sns.countplot(
    x="ever_neet_status",
    hue="majority_gender",
    data=neet_chd_df,
    order=status_order,
    palette=["#66b3ff", "#ffb366", "#cc99ff", "#ffb3e6"],
)

# Add labels and title
plt.title("Distribution of NEET Status by Gender", fontsize=16)
plt.xlabel("NEET Status", fontsize=14)
plt.ylabel("Count", fontsize=14)

# Add percentage value labels on top of each bar
for p in ax.patches:
    height = p.get_height()

    # Skip placeholder patches: depending on the seaborn version, missing
    # status/gender combinations (and legend proxies) show up in ax.patches
    # with NaN or zero size and must not be labelled.
    if not np.isfinite(height) or height <= 0 or p.get_width() == 0:
        continue

    center = p.get_x() + p.get_width() / 2

    # Dodged bars sit slightly left/right of their category's integer tick,
    # so round (not truncate) the centre to recover the category index.
    category_index = int(round(center))
    if not 0 <= category_index < len(status_order):
        continue

    total = total_counts.iloc[category_index]
    percentage = 100 * height / total

    # Place the annotation just above the bar
    ax.text(center, height + 0.5, f'{percentage:1.1f}%', ha="center", va="bottom", fontsize=10)

# Display legend
plt.legend(title="Gender", fontsize=12)

# Remove the spines (top and right) and set grid lines only on the y-axis
sns.despine()
plt.grid(axis="y", linestyle="--", alpha=0.5)

# Adjust the layout to prevent x-axis labels from being cut off
plt.tight_layout()

# Save the figure
figure_name = "neet_gender_percentage_plot.png"
full_path = os.path.join(figure_path, figure_name)
plt.savefig(full_path)

# Display the plot
plt.show()

### What is the distribution of NEET by ethnic group?

In [None]:
# Attach each person's ethnicity (taken from other_covariates_df) to the
# neet_chd_df records, then keep only the first row per person_id so that
# nobody is counted more than once
ethnicity_lookup = other_covariates_df[['person_id', 'ethnicity']]
merged_df = (
    neet_chd_df
    .merge(ethnicity_lookup, on='person_id', how='left')
    .drop_duplicates(subset='person_id')
)

# Within each ethnicity, the percentage of people in each ever_neet_status
# value (rows: ethnicity, columns: status)
ethnicity_neet_counts = (
    merged_df.groupby('ethnicity')['ever_neet_status']
    .value_counts(normalize=True)
    .unstack()
    * 100
)

# Highest NEET (True) percentage first
ethnicity_neet_proportions = ethnicity_neet_counts.sort_values(by=True, ascending=False)
neet_share = ethnicity_neet_proportions[True]

# Horizontal bar chart: one bar per ethnicity, length = NEET percentage
plt.figure(figsize=(10, 8))
ax = sns.barplot(y=ethnicity_neet_proportions.index, x=neet_share, color="#66b3ff", orient='h')
ax.set_title('Distribution of NEET Individuals by Ethnicity')
ax.set_ylabel('Ethnicity')
ax.set_xlabel('Percentage of NEET Individuals (%)')

# Write the percentage at the end of each bar
for row, share in enumerate(neet_share):
    ax.text(share, row, f'{share:.2f}%', ha='center', va='center')

# Hide the top/right spines and keep dashed grid lines on the x-axis only
sns.despine()
ax.grid(axis="x", linestyle="--", alpha=0.5)

# Fit everything inside the figure
plt.tight_layout()

# Write the figure to the figures folder
figure_name = "neet_ethnicity_percentage_plot.png"
full_path = f"{figure_path}{figure_name}"
plt.savefig(full_path)

# Render the figure
plt.show()

### What is the proportion of NEET in each academic year?

In [None]:
# Percentage of unique persons who were NEET in each academic year
plt.figure(figsize=(12, 8))

# Merge the dataframes on 'person_id' (inner join: keep only persons present
# in both tables)
merged_df = pd.merge(neet_chd_df, other_covariates_df, on='person_id', how='inner')

# Collapse to exactly one flag per (academic year, person): a person counts as
# NEET in a year if any of their records for that year is NEET. This keeps a
# person who has both NEET and non-NEET records in the same year from being
# counted twice in the denominator.
# NOTE(review): assumes neet_yn is boolean without missing values (it is used
# directly as a boolean mask elsewhere in this notebook) - confirm.
person_year_neet = merged_df.groupby(['academic_year', 'person_id'])['neet_yn'].any()

# Percentage of persons flagged NEET in each academic year
# (mean of a boolean flag = fraction of True values)
neet_percentage = person_year_neet.groupby(level='academic_year').mean() * 100

# Convert to DataFrame for plotting
neet_percentage_df = neet_percentage.reset_index()
neet_percentage_df.columns = ['academic_year', 'neet_percentage']

# Sort academic years in ascending order
neet_percentage_df.sort_values(by='academic_year', inplace=True)

ax = sns.barplot(x="academic_year", y="neet_percentage", data=neet_percentage_df, color="#66b3ff")

# Add labels and title
plt.title("Percentage of NEET in each Academic Year", fontsize=14)
plt.xlabel("Academic Year", fontsize=12)
plt.ylabel("Percentage", fontsize=12)

# Customize tick labels
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

# Add value labels on top of each bar
for p in ax.patches:
    height = p.get_height()
    plt.text(p.get_x() + p.get_width() / 2, height + 0.5, f"{height:.2f}%", ha="center", fontsize=8)

# Remove the spines (top and right) and set grid lines only on the y-axis
sns.despine()
plt.grid(axis="y", linestyle="--", alpha=0.5)

# Adjust the layout to prevent x-axis labels from being cut off
plt.tight_layout()

# Save the figure
figure_name = "neet_academic_year_percentage_plot.png"
full_path = os.path.join(figure_path, figure_name)
plt.savefig(full_path)

# Display
plt.show()

### What is the distribution of NEET reasons (activity codes)?

In [None]:
# Keep only the rows of other_covariates_df that are flagged as NEET
is_neet = other_covariates_df['neet_yn']
neet_activity_df = other_covariates_df[is_neet]

# Count plot: one bar per current_activity_code, NEET rows only
plt.figure(figsize=(12, 6))
activity_palette = ["#66b3ff", "#ffb366", "#cc99ff", "#ffb3e6", "#99ff99", "#ff6666"]
ax = sns.countplot(x="current_activity_code", data=neet_activity_df, palette=activity_palette)

# Title and axis labels
ax.set_title("Distribution of NEET activity code", fontsize=14)
ax.set_xlabel("Activity Code", fontsize=12)
ax.set_ylabel("Count", fontsize=12)

# Vertical x tick labels so the activity codes do not overlap
plt.xticks(rotation=90, fontsize=8)
plt.yticks(fontsize=8)

# Write the thousands-separated count 50 units above each bar
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.text(bar_centre, bar_height + 50, f"{int(bar_height):,}", ha="center", fontsize=8)

# Hide the top/right spines and keep dashed grid lines on the y-axis only
sns.despine()
ax.grid(axis="y", linestyle="--", alpha=0.5)

# Stop the x-axis labels from being clipped
plt.tight_layout()

# Write the figure to the figures folder
figure_name = "neet_reasons_count_plot.png"
full_path = f"{figure_path}{figure_name}"
plt.savefig(full_path)

# Render the figure
plt.show()

### What is the distribution of ever_neet_status and persistent_neet_status?

In [None]:
# Status columns to chart, with the subplot title for each
columns_to_plot = ['ever_neet_status', 'persistent_neet_status']
titles = ["Ever NEET Distribution", "Persistent NEET Distribution"]

# One subplot per status column, laid out side by side
fig, axes = plt.subplots(nrows=1, ncols=len(columns_to_plot), figsize=(12, 6))

# plt.subplots returns a bare Axes when there is a single subplot; wrap it so
# the loop below can always iterate
if len(columns_to_plot) == 1:
    axes = [axes]

# Shared by every subplot: the denominator and the bar labels/colours
total_entries = len(neet_chd_df)
bar_labels = ['False', 'True']
bar_colours = ["#66b3ff", "#ffb366"]

for ax, column, title in zip(axes, columns_to_plot, titles):
    # Percentage of rows where the status is True; the remainder is reported
    # as False
    true_percentage = neet_chd_df[column].sum() / total_entries * 100
    false_percentage = 100 - true_percentage
    percentages = [false_percentage, true_percentage]

    # Two bars: False first, True second
    sns.barplot(x=bar_labels, y=percentages, ax=ax, palette=bar_colours)
    ax.set_title(title, fontsize=16)
    ax.set_ylabel("Percentage", fontsize=14)

    # Full 0-100 percentage range on the y-axis
    ax.set_ylim(0, 100)

    # Write the percentage directly on top of each bar
    for patch, share in zip(ax.patches, percentages):
        label_x = patch.get_x() + patch.get_width() / 2
        ax.text(label_x, patch.get_height(), f'{share:.1f}%',
                ha='center', va='bottom', color='black', fontsize=8)

    # Hide the top/right spines and keep dashed grid lines on the y-axis only
    sns.despine(ax=ax)
    ax.yaxis.grid(True, linestyle='--', alpha=0.5)

# Fit both subplots inside the figure
plt.tight_layout()

# Write the figure to the figures folder
figure_name = "neet_statuses_distribution.png"
full_path = f"{figure_path}{figure_name}"
plt.savefig(full_path)

# Render the figure
plt.show()

## Questions on NEET and CHD

### What is the count of persons with CHD in the cohort?

In [None]:
# Bar chart: number of persons with / without a recorded congenital heart
# disease, where "with" means the snomedcode column is not null
plt.figure(figsize=(4, 6))

# value_counts() orders its result by frequency, not by value, so reindex to a
# fixed [False, True] order. This keeps each bar under the matching tick label
# and yields a 0-count bar (instead of a length mismatch in plt.bar) when one
# of the two values does not occur at all.
has_chd_code = neet_chd_df['snomedcode'].notnull()
counts = has_chd_code.value_counts().reindex([False, True], fill_value=0)
plt.bar([0, 1], counts, color="#66b3ff")

# Add labels and title
plt.title("Congenital Heart Diseases in the Cohort", fontsize=14)
plt.xlabel("Congenital Heart Defect", fontsize=12)
plt.ylabel("Count", fontsize=12)

# Customize tick labels (position 0 = False, position 1 = True, matching the
# reindexed order above)
plt.xticks(ticks=[0, 1], labels=['False', 'True'], fontsize=10)
plt.yticks(fontsize=10)

# Add thousands-separated value labels on top of each bar
for x, count in zip([0, 1], counts):
    count_str = format(int(count), ",")
    plt.text(x, count + 50, count_str, ha="center", fontsize=8)

# Remove the spines (top and right) and set grid lines only on the y-axis
sns.despine()
plt.grid(axis="y", linestyle="--", alpha=0.5)

# Adjust the layout to prevent x-axis labels from being cut off
plt.tight_layout()

# Save the figure
figure_name = "congenital_count_plot.png"
full_path = os.path.join(figure_path, figure_name)
plt.savefig(full_path)

# Display
plt.show()

### What is the percentage of persons with congenital heart defect in the cohort?

In [None]:
# Slice sizes: rows with chd_status set versus all remaining rows
total_count = len(neet_chd_df)
with_chd_count = neet_chd_df['chd_status'].sum()
without_chd_count = total_count - with_chd_count

# One tuple per slice: (label, size, colour, explode offset).
# Only the "With CHD" slice is pulled out of the pie.
slices = [
    (f'With CHD\n({with_chd_count})', with_chd_count, '#ffb366', 0.05),
    (f'Without CHD\n({without_chd_count})', without_chd_count, '#66b3ff', 0),
]
labels, sizes, colors, explode = zip(*slices)

# Draw the pie with one-decimal percentage labels; no title is set on this chart
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90, explode=explode)

# Write the figure to the figures folder
figure_name = "congenital_pie_chart.png"
full_path = f"{figure_path}{figure_name}"
plt.savefig(full_path)

# Render the figure
plt.show()

### What is the proportion of NEET statuses in the CHD and reference groups?

In [None]:
# Grouped bar chart: number of persons by CHD status, split by ever-NEET status

# Colour for each ever_neet_status value
custom_palette = {True: '#ffb366', False: '#66b3ff'}

# Bar Chart for the distribution of NEET and CHD
plt.figure(figsize=(4, 6))
ax = sns.countplot(x='chd_status', hue='ever_neet_status', data=neet_chd_df, palette=custom_palette, dodge=True)

# Add labels and title
plt.title('Ever NEET and CHD Status', fontsize=14)
plt.xlabel('CHD Status', fontsize=12)
plt.ylabel('Count', fontsize=12)

# Customize tick labels
# NOTE(review): the labels assume chd_status is plotted in False, True order.
plt.xticks(ticks=[0, 1], labels=['Without CHD', 'With CHD'], fontsize=10)
plt.yticks(np.arange(0, 90001, 10000), fontsize=10)

# Add thousands-separated value labels on top of each bar
for p in ax.patches:
    height = p.get_height()

    # Skip placeholder patches: depending on the seaborn version, empty
    # CHD/NEET combinations (and legend proxies) appear in ax.patches with NaN
    # or zero height; int(NaN) would raise and a stray "0" label is not wanted.
    if not np.isfinite(height) or height <= 0:
        continue

    count = format(int(height), ",")
    plt.text(p.get_x() + p.get_width() / 2, height + 50, count, ha="center", fontsize=8)

# Add legend with custom labels and colors.
# Handles and labels are both derived from legend_labels so they stay aligned;
# the label *texts* (dict values) are passed explicitly, because passing the
# dict itself would make matplotlib use its keys (True/False) as the labels.
legend_labels = {True: 'Ever NEET', False: 'Never NEET'}
legend_handles = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=custom_palette[key], markersize=10)
    for key in legend_labels
]
plt.legend(
    title='NEET Status',
    loc='upper right',
    title_fontsize=10,
    fontsize=8,
    handles=legend_handles,
    labels=list(legend_labels.values()),
)

# Remove the spines (top and right) and set grid lines only on the y-axis
sns.despine()
plt.grid(axis="y", linestyle="--", alpha=0.5)

# Set the y-axis range from 0 to 90,000 (fixed range; bars taller than this
# would be clipped)
plt.ylim(0, 90000)

# Adjust the layout to prevent x-axis labels from being cut off
plt.tight_layout()

# Save the figure
figure_name = "neet_ever_chd_plot.png"
full_path = os.path.join(figure_path, figure_name)
plt.savefig(full_path)

# Display
plt.show()

In [None]:
# Grouped bar chart: number of persons by CHD status, split by persistent-NEET
# status

# Colour for each persistent_neet_status value
custom_palette = {True: "#cc99ff", False: '#66b3ff'}

# Bar Chart for the distribution of NEET and CHD
plt.figure(figsize=(4, 6))
ax = sns.countplot(x='chd_status', hue='persistent_neet_status', data=neet_chd_df, palette=custom_palette, dodge=True)

# Add labels and title
plt.title('Persistent NEET and CHD Status', fontsize=14)
plt.xlabel('CHD Status', fontsize=12)
plt.ylabel('Count', fontsize=12)

# Customize tick labels
# NOTE(review): the labels assume chd_status is plotted in False, True order.
plt.xticks(ticks=[0, 1], labels=['Without CHD', 'With CHD'], fontsize=10)
plt.yticks(fontsize=10)

# Add thousands-separated value labels on top of each bar
for p in ax.patches:
    height = p.get_height()

    # Skip placeholder patches: depending on the seaborn version, empty
    # CHD/NEET combinations (and legend proxies) appear in ax.patches with NaN
    # or zero height; int(NaN) would raise and a stray "0" label is not wanted.
    if not np.isfinite(height) or height <= 0:
        continue

    count = format(int(height), ",")
    plt.text(p.get_x() + p.get_width() / 2, height + 50, count, ha="center", fontsize=8)

# Add legend with custom labels and colors.
# Handles and labels are both derived from legend_labels so they stay aligned;
# the label *texts* (dict values) are passed explicitly, because passing the
# dict itself would make matplotlib use its keys (True/False) as the labels.
# NOTE(review): 'Never NEET' is shown for persistent_neet_status == False,
# which may also cover people who were NEET but not persistently - confirm
# the intended wording.
legend_labels = {True: 'Persistent NEET', False: 'Never NEET'}
legend_handles = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=custom_palette[key], markersize=10)
    for key in legend_labels
]
plt.legend(
    title='NEET Status',
    loc='upper right',
    title_fontsize=10,
    fontsize=8,
    handles=legend_handles,
    labels=list(legend_labels.values()),
)

# Remove the spines (top and right) and set grid lines only on the y-axis
sns.despine()
plt.grid(axis="y", linestyle="--", alpha=0.5)

# Set the y-axis range from 0 to 90,000 (fixed range; bars taller than this
# would be clipped)
plt.ylim(0, 90000)

# Adjust the layout to prevent x-axis labels from being cut off
plt.tight_layout()

# Save the figure
figure_name = "neet_persistent_chd_plot.png"
full_path = os.path.join(figure_path, figure_name)
plt.savefig(full_path)

# Display
plt.show()

## Questions on Home LSOA in Bradford

### What is the count/percentage of individuals in Bradford?

In [None]:
# Rows whose 'home_lsoa_in_bradford' flag is True
in_bradford = neet_chd_df['home_lsoa_in_bradford'].eq(True)
filtered_df = neet_chd_df[in_bradford]

# Row counts used for the slices and the label
cohort_size = len(neet_chd_df)
bradford_size = len(filtered_df)

# Percentage of rows inside Bradford (kept for inspection; the pie computes
# its own percentages through autopct)
true_percentage = (bradford_size / cohort_size) * 100

# Pie chart with two slices: outside Bradford first, inside Bradford second
slice_sizes = [cohort_size - bradford_size, bradford_size]
slice_labels = ['False', f'True\n({bradford_size})']
plt.figure(figsize=(6, 6))
plt.pie(slice_sizes, labels=slice_labels, autopct='%1.2f%%', startangle=90, colors=["#66b3ff", "#ffb366"], explode=(0.05, 0))
plt.title("Percentage and Count of People in Bradford")

# Write the figure to the figures folder
figure_name = "neet_bradford_pie_chart.png"
full_path = f"{figure_path}{figure_name}"
plt.savefig(full_path)

# Render the figure
plt.show()