# tools.ipynb
This module contains helper tools that are used throughout the whole project.

In [None]:
import pandas as pd

def get_data(file: str = "Air_Quality.csv") -> pd.DataFrame:
    """Load a CSV file into a pandas DataFrame.

    Parameters:
    file (str): Path of the CSV file to read. Default is "Air_Quality.csv".

    Returns:
    DataFrame: The file contents exactly as parsed by ``pd.read_csv``.
    """
    return pd.read_csv(file)

def get_clean_data(file: str = "Air_Quality.csv") -> pd.DataFrame:
    """Load the CSV through ``get_data`` and return a cleaned DataFrame.

    Cleaning steps:
    - the 'Message' column is removed;
    - rows with a missing 'Geo Join ID' or 'Geo Place Name' are discarded;
    - 'Start_Date' is converted to pandas datetimes.

    Parameters:
    file (str): Path of the CSV file to read. Default is "Air_Quality.csv".

    Returns:
    DataFrame: The cleaned data.
    """
    raw = get_data(file)
    required_columns = ['Geo Join ID', 'Geo Place Name']
    cleaned = raw.drop(columns=['Message']).dropna(subset=required_columns)
    cleaned['Start_Date'] = pd.to_datetime(cleaned['Start_Date'])
    return cleaned

In [None]:
# Location of the air-quality CSV; defined once and reused by later cells.
path = 'Air_Quality.csv'
# Cleaned dataset, loaded once so later cells can work on it directly.
data = get_clean_data(path)

In [None]:
import sys
import matplotlib.pyplot as plt
import seaborn as sns


def get_top_pollutants(data, top_n=10):
    """
    Rank pollutant names by how often they occur and keep the leading ones.

    Parameters:
    data (DataFrame): Air pollutant records; must contain a 'Name' column.
    top_n (int): How many of the most frequent names to keep. Default is 10.

    Returns:
    DataFrame: Columns 'Name' and 'Count', ordered by descending 'Count' and
    limited to the first top_n rows.
    """
    ranked = (
        data.groupby('Name')
        .size()
        .reset_index(name='Count')
        .sort_values(by='Count', ascending=False)
    )
    return ranked.head(top_n)

In [None]:
# Most frequent pollutant names in the cleaned dataset (top_n defaults to 10).
get_top_pollutants(data=data)

In [None]:
def _pollutant_label(name, max_len=15):
    """Build the pie-slice label for one pollutant name.

    Names longer than ``max_len`` characters are shown as their truncated
    form followed by the full name in parentheses. Shorter names are
    returned unchanged so the label does not repeat the same text twice.
    """
    if len(name) > max_len:
        return f"{name[:max_len]}... ({name})"
    return name


def plot_top_pollutants(data_path, top_n=10):
    """
    Plot the top N most common air pollutants from the given dataset.

    The CSV file is loaded and cleaned with ``get_clean_data``, ranked with
    ``get_top_pollutants``, and drawn as a pie chart that is shown
    immediately with ``plt.show()``.

    Parameters:
    data_path (str): The file path to the input CSV file containing air pollutant data.
    top_n (int): The number of top pollutants to display in the plot. Default is 10.

    Returns:
    None
    """
    data = get_clean_data(data_path)
    top_pollutants = get_top_pollutants(data, top_n=top_n)

    # Labels are built in a separate list rather than written back as a new
    # column: the ranked frame is the result of ``head()``, and assigning a
    # column to such a slice can trigger pandas' chained-assignment warning.
    labels = [_pollutant_label(name) for name in top_pollutants['Name']]

    # The Paired colormap holds a fixed number of colours; integer indices
    # past the end all resolve to the last colour, so wrap around instead.
    palette = plt.cm.Paired
    colors = palette([i % palette.N for i in range(len(labels))])

    plt.figure(figsize=(10, 6))
    plt.pie(top_pollutants['Count'], labels=labels, autopct='%1.1f%%', colors=colors)
    # Report the number of slices actually drawn, which can be smaller than
    # top_n when the dataset has fewer distinct pollutants.
    plt.title(f'Top {len(top_pollutants)} Most Common Air Pollutants')
    plt.show()



In [None]:
# Draw the pie chart for the CSV at `path`, using the default top_n=10.
plot_top_pollutants(path)