In [None]:
'''
Welcome to our Notebook!

This is for the Urban Air Quality Group

Group Members: Selma, Ethan, Sophiya
'''

In [None]:
# imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats

In [None]:
# Load the PM air-pollution CSV that the later cells analyse.
pm_csv_path = 'IHME_GBD_2021_AIR_POLLUTION_1990_2021_PM_Y2022M01D31.csv'
df = pd.read_csv(pm_csv_path)

# Preview the first rows; as the last expression it becomes the cell output.
df.head()

In [None]:
# List the column names so later cells can reference them exactly.
column_index = df.columns
print(column_index)

In [None]:
# Boxplot comparing the distributions of the 'mean', 'lower' and 'upper' columns.
boxplot_columns = ['mean', 'lower', 'upper']

plt.figure(figsize=(14, 7))
sns.boxplot(data=df[boxplot_columns])

# Axis labels and title so the figure stands on its own.
plt.xlabel('Metrics (Mean, Lower, Upper)')
plt.ylabel('Pollution Levels')
plt.title('Boxplot of Pollution Data')
plt.show()


In [None]:
# Inclusive year window that is actually plotted below.
# NOTE(review): the file name suggests the data covers 1990-2021, but the
# filter starts at 1997 -- presumably to line up with the asthma data used
# later; confirm. The title is built from these constants so it always
# matches what is drawn.
START_YEAR, END_YEAR = 1997, 2021

# Keep only rows whose 'year_id' lies inside the window (all locations).
filtered_df = df[(df['year_id'] >= START_YEAR) & (df['year_id'] <= END_YEAR)]

# For every year, compute the mean and the median of the 'mean' column
# across all rows of that year. Result columns: 'year_id', 'mean', 'median'.
# NOTE: this name shadows `from scipy import stats` in the imports cell; it is
# kept because a later cell reads `stats`.
stats = filtered_df.groupby('year_id')['mean'].agg(['mean', 'median']).reset_index()

# Plot both yearly series on one axis.
plt.figure(figsize=(12, 6))
plt.plot(stats['year_id'], stats['mean'], label='Mean', marker='o')
plt.plot(stats['year_id'], stats['median'], label='Median', marker='s')
plt.title(f'PM2.5, Micrograms Per Cubic Meter ({START_YEAR}-{END_YEAR}) Globally')
plt.xlabel('Year')
plt.ylabel('PM2.5, Micrograms per Cubic Meter')
plt.legend()
plt.grid(True)
plt.show()

In [None]:


# Inclusive year window and country shown in this cell. The title below is
# built from these values so it always matches the filter.
first_year, last_year = 1997, 2021
country = 'United States of America'

# Keep only the rows for the chosen country inside the year window.
filtered_df = df[
    (df['year_id'] >= first_year)
    & (df['year_id'] <= last_year)
    & (df['location_name'] == country)
]

# Per-year mean and median of the 'mean' column, computed the same way as the
# global summary in the previous cell. Result columns: 'year_id', 'mean',
# 'median'. Selecting a 'median' column directly would require the CSV to
# contain one; no other cell in this notebook reads such a column.
filtered_PM_data = (
    filtered_df.groupby('year_id')['mean']
    .agg(['mean', 'median'])
    .reset_index()
)

# Plot the country-level series (not the global `stats` table).
plt.figure(figsize=(12, 6))
plt.plot(filtered_PM_data['year_id'], filtered_PM_data['mean'],
         label='Mean', marker='o', linestyle='-')
plt.plot(filtered_PM_data['year_id'], filtered_PM_data['median'],
         label='Median', marker='s', linestyle='--')
plt.title(f'Mean and Median Values in the USA ({first_year}-{last_year})')
plt.xlabel('Year')
plt.ylabel('PM2.5, Micrograms per Cubic Meter')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Read the childhood-asthma table from disk.
data = pd.read_csv("asthma_in_children.csv")

# Build one boolean mask per condition, then combine them: keep rows for
# children younger than 18, the "asthma attack in past 12 months" panel,
# and the crude-percentage unit.
is_under_18 = data['STUB_LABEL'] == 'Younger than 18 years'
is_attack_panel = data['PANEL'] == 'Asthma attack in past 12 months'
is_crude_percent = data['UNIT'] == "Percent of children, crude"
asthma_data_filtered = data[is_under_18 & is_attack_panel & is_crude_percent]

# Scatter the estimate against the year.
plt.figure(figsize=(10, 6))
plt.scatter(
    asthma_data_filtered['YEAR'],
    asthma_data_filtered['ESTIMATE'],
    color='blue',
    label='Asthma Percentage',
)

# Axis labels and title.
plt.xlabel('Year', fontsize=12)
plt.ylabel('Asthma Percentage (%)', fontsize=12)
plt.title('Percentage of Children with Asthma attacks in past 12 months Over Time', fontsize=14)

# Draw the grid and render the figure.
plt.grid(True)
plt.show()

In [None]:
# Give the PM table the same year column name as the asthma table ('YEAR')
# so the two can be joined on it. Re-running this rename is harmless.
filtered_PM_data = filtered_PM_data.rename(columns={'year_id': 'YEAR'})
print(filtered_PM_data.columns)

# Join asthma estimates with the PM figures on 'YEAR' (default inner join:
# only years present in both tables are kept).
merged_data = asthma_data_filtered.merge(filtered_PM_data, on='YEAR')

# Feature matrix (particulate level) and target (asthma attack percentage).
X = merged_data[['mean']]
y = merged_data['ESTIMATE']

# Ordinary least-squares fit, then predictions on the same rows used to fit.
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)

# Mean squared error on the training rows (there is no held-out split here).
mse = mean_squared_error(y, y_pred)
print(f"Mean Squared Error: {mse}")

# Observations as points, fitted line on top.
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='blue', label='Actual Data')
plt.plot(X, y_pred, color='red', label='Regression Line')

# Labels, title and legend.
plt.xlabel('Particulate Matter (µg/m³)', fontsize=12)
plt.ylabel('Asthma Attack Percentage', fontsize=12)
plt.title('Relationship Between Particulate Matter and Asthma Attacks', fontsize=14)
plt.legend()

# Draw the grid and render the figure.
plt.grid(True)
plt.show()


In [None]:
# Feature matrix (particulate level) and target (asthma attack percentage).
X = merged_data[['mean']]
y = merged_data['ESTIMATE']

# Degree of the polynomial expansion: 3 gives a cubic fit (terms 1, x, x^2, x^3).
POLY_DEGREE = 3
poly = PolynomialFeatures(degree=POLY_DEGREE)
X_poly = poly.fit_transform(X)

# Fit a linear regression on the expanded polynomial features.
model = LinearRegression()
model.fit(X_poly, y)

# Predictions on the same rows used to fit (used for the error metric).
y_pred = model.predict(X_poly)

# Mean squared error on the training rows (there is no held-out split here).
mse = mean_squared_error(y, y_pred)
print(f"Mean Squared Error: {mse}")

# Evaluate the fitted polynomial on an evenly spaced, ascending grid spanning
# the observed range. The rows of X are in merge order, not sorted by
# particulate level, so connecting the per-row predictions with a line would
# zigzag back and forth instead of tracing the curve. The grid is a DataFrame
# with the same column name as X so the fitted transformer sees matching
# feature names.
x_grid = pd.DataFrame({'mean': np.linspace(X['mean'].min(), X['mean'].max(), 200)})
y_grid = model.predict(poly.transform(x_grid))

# Observations as points, fitted curve on top.
plt.figure(figsize=(10, 6))
plt.scatter(X['mean'], y, color='blue', label='Actual Data')
plt.plot(x_grid['mean'], y_grid, color='red', label='Polynomial Fit')

# Labels, title and legend.
plt.title('Relationship Between Particulate Matter and Asthma Attacks (Polynomial Fit)', fontsize=14)
plt.xlabel('Particulate Matter (µg/m³)', fontsize=12)
plt.ylabel('Asthma Attack Percentage', fontsize=12)
plt.legend()

# Draw the grid and render the figure.
plt.grid(True)
plt.show()