In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from scipy import stats


In [6]:
pip install celluloid

Collecting celluloid
  Downloading celluloid-0.2.0-py3-none-any.whl.metadata (4.8 kB)
Downloading celluloid-0.2.0-py3-none-any.whl (5.4 kB)
Installing collected packages: celluloid
Successfully installed celluloid-0.2.0


In [3]:
# Import the necessary libraries
import pandas as pd
from google.colab import files

# Open the Colab upload dialog; the keys of the result are the uploaded file names
uploaded = files.upload()

# A cancelled dialog leaves nothing to load. Stop with a clear message instead of
# failing later with a NameError on `file_name`.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose the CSV file.")

# Report every uploaded file
for file_name in uploaded.keys():
    print(f"Uploaded file: {file_name}")

# Load the CSV file into a DataFrame.
# After the loop `file_name` holds the last uploaded name, so upload a single CSV.
df = pd.read_csv(file_name)


Saving Mental_Health_Care_in_the_Last_4_Weeks (2).csv to Mental_Health_Care_in_the_Last_4_Weeks (2).csv
Uploaded file: Mental_Health_Care_in_the_Last_4_Weeks (2).csv


In [None]:
# (rows, columns) of the loaded DataFrame
df.shape

(10404, 15)

In [None]:
# Display the first few rows of the DataFrame
print(df.head())

# Display the last few rows of the DataFrame
print(df.tail())


                                           Indicator              Group  \
0  Took Prescription Medication for Mental Health...  National Estimate   
1  Took Prescription Medication for Mental Health...             By Age   
2  Took Prescription Medication for Mental Health...             By Age   
3  Took Prescription Medication for Mental Health...             By Age   
4  Took Prescription Medication for Mental Health...             By Age   

           State       Subgroup Phase  Time Period      Time Period Label  \
0  United States  United States     2           13  Aug 19 - Aug 31, 2020   
1  United States  18 - 29 years     2           13  Aug 19 - Aug 31, 2020   
2  United States  30 - 39 years     2           13  Aug 19 - Aug 31, 2020   
3  United States  40 - 49 years     2           13  Aug 19 - Aug 31, 2020   
4  United States  50 - 59 years     2           13  Aug 19 - Aug 31, 2020   

  Time Period Start Date Time Period End Date  Value  LowCI  HighCI  \
0             0

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Time Period,Value,LowCI,HighCI,Suppression Flag
count,10404.0,9914.0,9914.0,9914.0,22.0
mean,28.134948,17.450736,14.771565,20.475661,1.0
std,11.04021,8.270565,7.659396,9.052521,0.0
min,1.0,1.4,0.8,2.0,1.0
25%,20.0,10.3,8.0,12.9,1.0
50%,29.0,16.2,13.9,19.2,1.0
75%,37.0,24.0,20.8,27.4,1.0
max,45.0,62.9,53.2,71.9,1.0


In [None]:
# Number of distinct values in each column
df.nunique()

Unnamed: 0,0
Indicator,4
Group,10
State,52
Subgroup,80
Phase,8
Time Period,34
Time Period Label,38
Time Period Start Date,38
Time Period End Date,38
Value,439


PREPROCESSING OF DATA

In [None]:
import pandas as pd

def preprocess_mental_health_data(df):
    """Return a cleaned copy of the mental health survey data.

    Steps, in order:
      1. drop 'Suppression Flag' and 'Time Period' (ignored when absent);
      2. parse the two period date columns as ``%m/%d/%Y`` (unparseable -> NaT);
      3. rename 'Value' / 'LowCI' / 'HighCI' to descriptive names;
      4. fill gaps in the numeric and date columns with the column mean;
      5. fill gaps in 'State', 'Subgroup' and 'Group' with the column mode.

    The frame passed in is not modified; a new frame is returned.
    """
    date_columns = ['Time Period Start Date', 'Time Period End Date']
    descriptive_names = {
        'Value': 'Mental_Health_Value',
        'LowCI': 'Lower_Confidence_Interval',
        'HighCI': 'Upper_Confidence_Interval',
    }

    # `drop` hands back a new frame, so every assignment below stays off the caller's data
    cleaned = df.drop(columns=['Suppression Flag', 'Time Period'], errors='ignore')

    # Text dates -> datetime64
    for date_col in date_columns:
        cleaned[date_col] = pd.to_datetime(cleaned[date_col], format='%m/%d/%Y', errors='coerce')

    cleaned = cleaned.rename(columns=descriptive_names)

    # Mean imputation: the three measurements first, then the two dates
    for mean_col in [*descriptive_names.values(), *date_columns]:
        cleaned[mean_col] = cleaned[mean_col].fillna(cleaned[mean_col].mean())

    # Mode imputation for the categorical descriptors
    for mode_col in ('State', 'Subgroup', 'Group'):
        cleaned[mode_col] = cleaned[mode_col].fillna(cleaned[mode_col].mode()[0])

    return cleaned



# Apply the preprocessing function.
# The function works on the frame returned by `drop`, so `df` keeps its original
# column names ('Value', 'LowCI', 'HighCI', 'Time Period') for the later cells.
preprocessed_data = preprocess_mental_health_data(df)

# Show the first few rows of the preprocessed data
print(preprocessed_data.head())

# Show the last few rows of the preprocessed data
print(preprocessed_data.tail())

                                           Indicator              Group  \
0  Took Prescription Medication for Mental Health...  National Estimate   
1  Took Prescription Medication for Mental Health...             By Age   
2  Took Prescription Medication for Mental Health...             By Age   
3  Took Prescription Medication for Mental Health...             By Age   
4  Took Prescription Medication for Mental Health...             By Age   

           State       Subgroup Phase      Time Period Label  \
0  United States  United States     2  Aug 19 - Aug 31, 2020   
1  United States  18 - 29 years     2  Aug 19 - Aug 31, 2020   
2  United States  30 - 39 years     2  Aug 19 - Aug 31, 2020   
3  United States  40 - 49 years     2  Aug 19 - Aug 31, 2020   
4  United States  50 - 59 years     2  Aug 19 - Aug 31, 2020   

  Time Period Start Date Time Period End Date  Mental_Health_Value  \
0             2020-08-19           2020-08-31                 19.4   
1             2020-08-19

FINDING OUTLIERS USING Z SCORE


In [None]:
from scipy import stats
import numpy as np

def find_outliers_zscore(df, threshold=3):
    """Return the rows of ``df`` that hold an extreme value in any numeric column.

    A value is extreme when its absolute Z-score (distance from the column mean
    measured in column standard deviations) is greater than ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scan; non-numeric columns are ignored for the test but kept in the result.
    threshold : float, default 3
        Minimum absolute Z-score for a value to count as an outlier.

    Returns
    -------
    pandas.DataFrame
        The subset of rows with at least one outlying numeric value.
    """
    # Select numerical columns
    numerical_columns = df.select_dtypes(include=np.number).columns

    # nan_policy='omit' computes each column's mean/std from its non-missing values.
    # With the default policy a single NaN turns the whole column's Z-scores into NaN,
    # which silently hides every outlier in that column.
    z_scores = np.abs(stats.zscore(df[numerical_columns], nan_policy='omit'))

    # Missing values keep a NaN score; NaN > threshold is False, so they are never flagged
    outliers = np.asarray(z_scores > threshold).any(axis=1)

    # Return the rows that are considered outliers
    return df[outliers]

# Find outliers using Z-Score (default threshold: 3 standard deviations from the mean)
outliers_zscore = find_outliers_zscore(preprocessed_data)
print(outliers_zscore)

                                               Indicator  \
731    Took Prescription Medication for Mental Health...   
1307   Took Prescription Medication for Mental Health...   
1595   Took Prescription Medication for Mental Health...   
2459   Took Prescription Medication for Mental Health...   
3695   Took Prescription Medication for Mental Health...   
...                                                  ...   
10256  Took Prescription Medication for Mental Health...   
10259  Took Prescription Medication for Mental Health...   
10261  Took Prescription Medication for Mental Health...   
10271  Took Prescription Medication for Mental Health...   
10336  Needed Counseling or Therapy But Did Not Get I...   

                                               Group          State  \
731    By Presence of Symptoms of Anxiety/Depression  United States   
1307   By Presence of Symptoms of Anxiety/Depression  United States   
1595   By Presence of Symptoms of Anxiety/Depression  United State

Outliers can be identified using the Z-score, which measures how many standard deviations a data point is from the mean.

UNIVARIATE ANALYSIS

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interact
from IPython.display import display

# Make sure plots are displayed inline in Jupyter notebooks
%matplotlib inline

# Function for interactive univariate analysis
def univariate_analysis_interactive(df, graph_type, column):
    """Draw one univariate plot of ``column`` from ``df``.

    Supported combinations:
      * numeric column     + 'Histogram' or 'Boxplot'
      * categorical column + 'Bar Plot'

    Any other combination prints a short hint and draws nothing. A figure is
    created only once a supported combination is known, so no empty figure is
    left behind.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    graph_type : str
        One of 'Histogram', 'Boxplot', 'Bar Plot'.
    column : str
        Name of the column in ``df`` to analyse.
    """
    # Numerical columns
    numerical_columns = df.select_dtypes(include='number').columns

    # Categorical columns (plain strings and pandas categoricals)
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns

    is_numerical = column in numerical_columns
    is_categorical = column in categorical_columns

    if is_numerical and graph_type == 'Histogram':
        plt.figure(figsize=(10, 5))
        plt.hist(df[column], bins=30, color='skyblue', edgecolor='black')
        plt.title(f'Histogram of {column}')
        plt.xlabel(column)
        plt.ylabel('Frequency')

    elif is_numerical and graph_type == 'Boxplot':
        plt.figure(figsize=(10, 5))
        sns.boxplot(data=df, x=column, color='lightgreen')
        plt.title(f'Boxplot of {column}')

    elif is_categorical and graph_type == 'Bar Plot':
        # Wider, flatter figure: category labels need horizontal room
        plt.figure(figsize=(15, 4))
        df[column].value_counts().plot(kind='bar', color='lightcoral')
        plt.title(f'Bar Plot of {column}')
        plt.xlabel(column)
        plt.ylabel('Frequency')
        plt.xticks(rotation=45)

    else:
        # Tell the user why nothing is drawn instead of showing a blank figure
        print(f"'{graph_type}' is not available for column '{column}'. "
              "Use 'Histogram' or 'Boxplot' for numerical columns and 'Bar Plot' for categorical columns.")
        return

    plt.show()  # Ensure plot is displayed after creation

# Function to create interactive widgets
def create_widgets_for_univariate(df):
    """Show two dropdowns (column, graph type) wired to ``univariate_analysis_interactive``.

    Every column of ``df`` is offered, together with the three graph types
    'Histogram', 'Boxplot' and 'Bar Plot'. ``df`` itself is passed to the
    plotting function as a fixed (non-widget) argument.
    """
    def _dropdown(label, choices):
        # 'initial' width keeps the full description text visible
        return widgets.Dropdown(
            options=choices,
            description=label,
            style={'description_width': 'initial'},
        )

    column_selector = _dropdown('Select Column:', df.columns.tolist())
    graph_selector = _dropdown('Select Graph Type:', ['Histogram', 'Boxplot', 'Bar Plot'])

    # interact re-runs the plotting function whenever either dropdown changes
    interact(
        univariate_analysis_interactive,
        df=widgets.fixed(df),
        graph_type=graph_selector,
        column=column_selector,
    )

# Apply the function to the preprocessed data (renamed columns, missing values filled)
create_widgets_for_univariate(preprocessed_data)


interactive(children=(Dropdown(description='Select Graph Type:', options=('Histogram', 'Boxplot', 'Bar Plot'),…

<Figure size 1000x500 with 0 Axes>

**Numerical Data:**

- **Histogram:** The histogram helps visualize the frequency distribution of a numeric variable.
- **Boxplot:** The boxplot helps in identifying the median, interquartile range (IQR), and outliers of a numerical variable. It shows how the values of the variable are spread and whether there are any extreme values.

**Categorical Data:**

- **Bar Plot:** The bar plot shows the frequency of each category within a categorical variable. This gives insights into the most common and least common categories.

BIVARIATE ANALYSIS

VIOLIN PLOT

In [None]:

from google.colab import files
import pandas as pd
import plotly.express as px


# Plot from a copy with a categorical 'Subgroup'. Converting the column on the shared
# `df` in place would change its dtype for every later cell that groups by 'Subgroup'.
violin_data = df.assign(Subgroup=df['Subgroup'].astype('category'))

# Create a violin plot for the 'Value' column grouped by 'Subgroup'.
# Every subgroup in the data is drawn (not only the age groups), so the title says so.
fig = px.violin(violin_data, x='Subgroup', y='Value',
                 title='Violin Plot of Mental Health Care Access by Subgroup',
                 labels={'Value': 'Percentage of Individuals Receiving Care'},
                 color='Subgroup',
                 box=True,  # Adds a box plot inside the violin
                 points='all')  # Show all points to visualize individual data points

# Show the figure
fig.show()

The violin plot serves as an effective visual tool for examining the complexities of mental health care access across subgroups, allowing stakeholders to identify trends, disparities, and opportunities for improvement in service delivery.

FILTER

In [5]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import interact, widgets
from IPython.display import display


# 1. The filter needs a 'State' column (change the name here if the dataset uses another one)
if 'State' not in df.columns:
    print("State column not found in the dataset.")

else:
    # One radio button per distinct state
    unique_states = df['State'].unique()

    def filter_by_state(selected_state):
        """Display the first rows of `df` whose 'State' equals `selected_state`."""
        state_mask = df['State'] == selected_state
        display(df[state_mask].head())  # Display filtered data

    state_radio_buttons = widgets.RadioButtons(
        description='State:',
        options=unique_states,
        disabled=False
    )

    # Re-run the filter every time another radio button is picked
    interact(filter_by_state, selected_state=state_radio_buttons)



interactive(children=(RadioButtons(description='State:', options=('United States', 'Alabama', 'Alaska', 'Arizo…

SLIDERS

In [9]:
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

# Filter the dataset to include only 'By Age' from the 'Group' column
df_by_age = df[df['Group'] == 'By Age']

# Average 'Value' per age subgroup over all 'By Age' rows.
# observed=True keeps only the subgroups that survive the filter when 'Subgroup'
# is categorical; it has no effect on a plain string column.
df_by_age_us = df_by_age.groupby('Subgroup', observed=True).agg({'Value': 'mean'}).reset_index()

# Function to update the graph based on the min and max 'Value'
def update_graph(min_value, max_value):
    """Redraw the bar chart for the age subgroups whose mean 'Value' lies in the chosen range.

    The two bounds may arrive in either order (the sliders are independent), so
    they are sorted before filtering instead of producing an empty chart.
    """
    low, high = sorted((min_value, max_value))

    # Filter data based on slider values (both ends inclusive)
    filtered_data = df_by_age_us[df_by_age_us['Value'].between(low, high)]

    # Create a bar chart with only "United States" on the x-axis
    fig = px.bar(filtered_data,
                 x=["United States"] * len(filtered_data),  # X-axis is always "United States"
                 y="Value",
                 color="Subgroup",  # Differentiates by Subgroup (age ranges)
                 title=f"Mental Health Care Usage in the United States for Value Range {low} - {high}")

    # Update the layout
    fig.update_layout(
        xaxis_title="Country",
        yaxis_title="Mental Health Care Usage (%)",
        title_font_size=20,
        title_x=0.5  # Center title
    )

    # Show the figure
    fig.show()

# Define the min and max values for sliders based on the 'Value' column
min_val = df_by_age_us['Value'].min()
max_val = df_by_age_us['Value'].max()

# Create sliders for min and max values
min_slider = widgets.FloatSlider(
    value=min_val,
    min=min_val,
    max=max_val,
    step=0.5,
    description='Min Value',
)

max_slider = widgets.FloatSlider(
    value=max_val,
    min=min_val,
    max=max_val,
    step=0.5,
    description='Max Value',
)

# `interact` connects the sliders to update_graph and already renders them, so they
# are not displayed a second time. The trailing semicolon hides the returned function's repr.
widgets.interact(update_graph, min_value=min_slider, max_value=max_slider);


interactive(children=(FloatSlider(value=10.308403361344538, description='Min Value', max=20.774242424242424, m…

FloatSlider(value=10.308403361344538, description='Min Value', max=20.774242424242424, min=10.308403361344538,…

FloatSlider(value=20.774242424242424, description='Max Value', max=20.774242424242424, min=10.308403361344538,…

The interactive bar chart shows the average mental health care usage value of each age subgroup in the "By Age" group, drawn on a single "United States" bar and colored by age range. The two sliders filter the subgroups by that average: only age groups whose mean value lies between the selected minimum and maximum are displayed. This visualization can help identify disparities in access to mental health services among various age groups. Adjusting the sliders enables a more focused analysis of the age segments with particularly low or high usage, guiding targeted interventions and resource allocation. Ultimately, this approach supports data-driven decision-making in improving mental health outcomes across diverse populations.

PIE CHART

In [None]:
import plotly.express as px

# Create a pie chart for the 'Group' column: each slice is the share of rows in that group
fig = px.pie(df,
             names='Group',  # Column for labels in the pie chart
             title="Distribution of Mental Health Care by Group",
             hole=0.4)  # Creates a donut-style pie chart

# Put the percentage and the group label inside each slice
fig.update_traces(textposition='inside', textinfo='percent+label')

# Show the pie chart
fig.show()


The pie chart effectively illustrates the distribution of mental health care services across different groups, providing a clear visual representation of proportions. The donut-style design enhances readability, allowing viewers to easily discern the percentage of each group in relation to the whole. This visualization can help stakeholders identify which groups may require more focus or resources in mental health initiatives. By analyzing the distribution, policymakers can tailor interventions and strategies to address the specific needs of underserved populations.

In [None]:
TIMELINE

In [None]:
import pandas as pd
import plotly.express as px


# Ensure 'Value' is a column in the DataFrame
print(df.columns)  # Check if 'Value' is present in the DataFrame columns

# Plot with animation: one frame per period start date.
# The y-range is fixed to the overall min/max of `df['Value']` so the axis does not
# rescale between frames. (The range was previously read from an undefined name
# `data`, which raises NameError on a fresh kernel.)
fig = px.bar(df,
             x="State",
             y="Value",
             color="State",
             animation_frame="Time Period Start Date",  # Adjusted to match the actual column name
             animation_group="Subgroup",
             range_y=[df['Value'].min(), df['Value'].max()])

fig.update_layout(
    title="Mental Health Care Usage Over Time by State and Subgroup",
    xaxis_title="State",
    yaxis_title="Usage"
)

# Show plot
fig.show()


Index(['Indicator', 'Group', 'State', 'Subgroup', 'Phase', 'Time Period',
       'Time Period Label', 'Time Period Start Date', 'Time Period End Date',
       'Value', 'LowCI', 'HighCI', 'Confidence Interval', 'Quartile Range',
       'Suppression Flag', 'Color'],
      dtype='object')


The animated bar chart visualizes the change in mental health care usage across different states and subgroups over time. It effectively demonstrates how usage trends vary by state and subgroup, allowing for dynamic analysis of patterns across time periods. Higher or lower usage values can reveal which states or subgroups are utilizing mental health services more or less over time. This animation helps stakeholders identify shifts in mental health care access or demand within specific regions or groups.

SCATTER PLOT

In [None]:
# Import necessary libraries
import pandas as pd
import plotly.express as px

# The scatter plot needs both the 'State' and the 'LowCI' column
if not all(col in df.columns for col in ('State', 'LowCI')):
    print("Ensure your data has 'State' and 'LowCI' columns.")

else:
    # One point per row; points are colored by their own Low CI value
    fig = px.scatter(
        df,
        x='State',
        y='LowCI',
        color='LowCI',
        title='Scatter Plot of State vs Low CI',
        labels={'State': 'State', 'LowCI': 'Low CI'},
        template='plotly_white',
    )

    # Show the plot
    fig.show()


The scatter plot of `State` versus `LowCI` highlights the distribution and variability of mental health care metrics across different states. Outliers with significantly low or high LowCI values may indicate areas needing targeted interventions or successful initiatives. A lack of discernible trends suggests that factors beyond the state may influence these metrics. Overall, this visualization informs policymakers about regional disparities, guiding resource allocation and strategic planning for mental health care improvements.

HIGHLIGHTED BAR CHART

In [None]:
# Import necessary libraries
import pandas as pd
import plotly.express as px



# Ensure that 'Time Period' and 'Value' columns are present
if 'Time Period' in df.columns and 'Value' in df.columns:

    # Create a new column to set color based on whether it's the highlighted time period
    highlight_period = "2023"  # Replace with the time period you want to highlight

    # 'Time Period' is numeric (see df.describe()), so a string such as "13" never
    # equals the stored 13. Compare both sides as text so either form matches.
    is_highlighted = df['Time Period'].astype(str) == str(highlight_period)
    df['Color'] = is_highlighted.map({True: 'Highlight', False: 'Normal'})

    if not is_highlighted.any():
        print(f"No rows have Time Period {highlight_period}; no bar will be highlighted.")

    # Create the bar graph
    fig = px.bar(df, x='Time Period', y='Value', color='Color',
                 color_discrete_map={'Highlight': 'orange', 'Normal': 'lightskyblue'},
                 title='Time Period vs Value',
                 labels={'Time Period': 'Time Period', 'Value': 'Value'},
                 template='plotly_white')

    # Show the plot
    fig.show()

else:
    print("Ensure your data has 'Time Period' and 'Value' columns.")


Inference
The bar graph generated from the dataset visualizes the relationship between Time Period and Value, with the period selected in `highlight_period` (set to "2023" in the code) highlighted in orange. This allows for a clear distinction between the highlighted period and the others, which are drawn in light sky blue.

However, if no orange bar is visible, the dataset has no rows whose Time Period equals the selected value. Note that `Time Period` holds survey period numbers (1 to 45 in this dataset, as the `df.describe()` output shows), not calendar years, so a value such as 2023 matches no rows and nothing can be highlighted. To highlight a bar, set `highlight_period` to one of the existing period numbers, for example `13`.

The chart does not add data labels or set explicit dimensions: it uses the `plotly_white` template at Plotly's default size, and the exact value of a bar can be read by hovering over it. This still allows for easy comparison of values across different time periods, making the graph suitable for analysis and discussion.

SORTED

STEP LINE GRAPH

In [None]:
import pandas as pd
import plotly.express as px



# Both 'Time Period Label' and 'Confidence Interval' are required for this chart
if not {'Time Period Label', 'Confidence Interval'}.issubset(df.columns):
    print("Ensure your data has 'Time Period Label' and 'Confidence Interval' columns.")

else:
    # Line graph of Confidence Interval over Time Period Label, with a marker on every point
    fig = px.line(
        df,
        x='Time Period Label',
        y='Confidence Interval',
        markers=True,
        title='Confidence Interval Over Time Period',
        labels={'Time Period Label': 'Time Period', 'Confidence Interval': 'Confidence Interval'},
        template='plotly_white',
    )

    # Show the plot
    fig.show()

The line graph illustrates the Confidence Interval of mental health care usage over various Time Periods.

For example, if the confidence interval shows a decrease from 20% in early 2023 to 10% by mid 2024, this trend suggests improving data accuracy and more consistent access to mental health services. However, if there is a notable spike to 25% in confidence interval during specific months, it may indicate periods of instability in mental health service access or reporting inconsistencies.

Narrow confidence intervals imply reliable reporting, while wider intervals in recent months could signal uncertainty and emerging challenges in mental health care. Overall, this graph highlights the importance of data reliability in understanding trends and making informed decisions in mental health services.

HEATMAP

In [None]:
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff


# Columns whose pairwise correlation is shown
numeric_columns = ['Value', 'LowCI', 'HighCI']

# Stop early with a hint when any of them is missing from the dataset
if not set(numeric_columns).issubset(df.columns):
    print("Ensure your data has the required numeric columns: Value, LowCI, HighCI.")

else:
    # Pairwise correlation between the selected columns
    correlation_matrix = df[numeric_columns].corr()

    # Annotated heatmap: every cell is labelled with its coefficient
    fig = ff.create_annotated_heatmap(
        z=correlation_matrix.values,
        x=correlation_matrix.columns.tolist(),
        y=correlation_matrix.index.tolist(),
        colorscale='Viridis',
        colorbar={'title': 'Correlation Coefficient'},
    )

    # Title and axis captions
    fig.update_layout(
        title='Heatmap of Correlation between Numeric Columns',
        xaxis_title='Variables',
        yaxis_title='Variables',
    )

    # Show the heatmap
    fig.show()

This heatmap visually summarizes the relationships between numeric variables, highlighting key areas for mental health care strategies. Strong correlations between **LowCI**, **HighCI**, and **Value** suggest that confidence intervals are closely tied to perceived values. Such insights can aid researchers and policymakers in understanding data reliability and stability. Analyzing these correlations helps stakeholders interpret survey results and assess program effectiveness, guiding focus on areas needing attention.

BUBBLE PLOT

In [None]:
import pandas as pd
import plotly.express as px


# The bubble chart needs the 'Value' and 'Subgroup' columns
if not ('Value' in df.columns and 'Subgroup' in df.columns):
    print("Ensure your data has the 'Value' and 'Subgroup' columns.")

else:
    # Per subgroup: mean of 'Value' and number of non-missing 'Value' entries.
    # Named aggregation yields the final flat column names directly.
    subgroup_summary = (
        df.groupby('Subgroup')
        .agg(**{'Mean Value': ('Value', 'mean'), 'Count': ('Value', 'count')})
        .reset_index()
    )

    # Bubble chart: x and bubble size both encode the count, y the mean value
    fig = px.scatter(
        subgroup_summary,
        x='Count',
        y='Mean Value',
        size='Count',
        color='Subgroup',
        hover_name='Subgroup',
        title='Bubble Chart of Subgroups',
        labels={'Count': 'Count of Subgroups', 'Mean Value': 'Mean Value'},
        template='plotly_white',
    )

    # Show the bubble chart
    fig.show()






The bubble chart effectively visualizes subgroup distributions, with bubble size representing counts and vertical position indicating average values. Larger bubbles highlight prevalent subgroups, while their mean values reveal characteristics or outcomes. Stakeholders can identify areas needing focus, especially if high counts align with lower averages. Insights from this chart inform strategic decisions in mental health initiatives.

LINE PLOT

In [7]:
import pandas as pd
import plotly.express as px



# Education subgroups to compare, listed from lowest to highest attainment
education_subgroups = [
    "Less than a high school diploma",
    "High school diploma or GED",
    "Some college/Associate's degree",
    "Bachelor's degree or higher",
]

# Filter the data to include only rows whose 'Subgroup' is one of the education levels
education_data = df[df['Subgroup'].isin(education_subgroups)]

# Average over all time periods so every indicator has exactly one point per education
# level. Plotting the raw rows repeats each x position once per period, which makes the
# line run back and forth across the chart.
attainment_rank = {name: position for position, name in enumerate(education_subgroups)}
education_summary = (
    education_data
    .assign(Subgroup=lambda frame: frame['Subgroup'].astype(str))
    .groupby(['Indicator', 'Subgroup'], as_index=False)['Value']
    .mean()
    # px.line joins points in row order, so sort by attainment within each indicator
    .assign(_rank=lambda frame: frame['Subgroup'].map(attainment_rank))
    .sort_values(['Indicator', '_rank'])
    .drop(columns='_rank')
)

# Create a line plot of mean 'Value' across the education subgroups
fig = px.line(education_summary,
              x="Subgroup",
              y="Value",
              color="Indicator",  # Color lines based on Indicator
              markers=True,
              category_orders={"Subgroup": education_subgroups},  # keep the x-axis in attainment order
              title="Mental Health Care Values Across Educational subgroups",
              labels={"Subgroup": "By education", "Value": "Value"})

# Show the figure
fig.show()

The line plot illustrates the relationship between educational attainment and mental health care values, providing insights for policy decisions and initiatives. Each line represents a different indicator, drawn across the educational subgroups on the x-axis, allowing for analysis of how educational backgrounds influence mental health care access and utilization. By examining the slopes and intersections, trends can be identified, such as higher values for individuals with advanced degrees indicating better access to services. Conversely, lower values for those with less education highlight potential barriers to mental health care, facilitating comparative analysis among subgroups.