[Reference](https://blog.devgenius.io/time-series-forecasting-passenger-air-traffic-time-series-project-part-1-da51d4e8520a)

prepare_data.py :

In [1]:
#inside prepare_data.py
#import required packages
from typing import Type
import pandas as pd 
import os 


class DataWrangler:
    """
    Custom class for wrangling the original airline passenger dataset.

    Functionality:
    1. Loading the original dataset
    2. Preparing EDA data
    3. Preparing forecast data
    """

    #GEO Region values that are merged into a broader region for aggregation purposes
    _GEO_REGION_MAPPING = {
        'Canada': 'North America',
        'US': 'North America',
        'Australia / Oceania': 'Australia',
        'Middle East': 'Asia',
        'Central America': 'South America',
        'Mexico': 'South America',
    }

    def load_data(self, filepath: str = 'Air_Traffic_Passenger_Statistics.csv') -> pd.DataFrame:
        """
        Read the raw passenger statistics CSV into a dataframe.

        Args:
            filepath (str): path of the CSV file to read. Defaults to
                'Air_Traffic_Passenger_Statistics.csv', resolved against the
                current working directory.
        Returns:
            pd.DataFrame: the file content exactly as parsed by pd.read_csv
        """
        return pd.read_csv(filepath)

    def prepare_eda_data(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """
        Prepare the EDA data. Wrangling steps: unify the United Airlines name,
        convert 'Activity Period' (YYYYMM) into a datetime 'Period' column,
        drop duplicated rows, drop 'Activity Period' and merge GEO Region values.

        Args:
            dataframe (pd.DataFrame): raw data as returned by load_data; must
                contain the 'Activity Period' and 'GEO Region' columns
        Returns:
            pd.DataFrame: new dataframe ready for EDA (the input is not modified)
        """
        #deep copying the dataframe to avoid changes in original data
        data = dataframe.copy(deep=True)
        #replacing airline value (applied to every column of the frame, not only the airline one)
        data = data.replace('United Airlines - Pre 07/01/2013', 'United Airlines')
        #changing period format from YYYYMM (cast to string first) to datetime
        data['Period'] = pd.to_datetime(data['Activity Period'].astype('string'), format='%Y%m')
        #dropping duplicates, keeping the first occurrence of each row
        data = data.drop_duplicates(keep='first')
        #dropping Activity Period column, Period now carries the same information
        data = data.drop(columns=['Activity Period'])
        #merging GEO Region values in one pass; keys and targets do not overlap,
        #so this equals applying the replacements one after another
        data['GEO Region'] = data['GEO Region'].replace(DataWrangler._GEO_REGION_MAPPING)
        return data

    def prepare_forecast_data(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """
        Aggregate the passenger count per period for forecasting.

        Args:
            dataframe (pd.DataFrame): dataframe holding the 'Period' and
                'Passenger Count' columns, e.g. the output of prepare_eda_data
        Returns:
            pd.DataFrame: one row per 'Period' with the summed passenger count
                in the 'Total Passenger' column
        """
        #deep copying the dataframe to avoid changes in original data
        data = dataframe.copy(deep=True)
        #the passenger count is not aggregated yet, so sum it per period with groupby
        totals = data.groupby(['Period']).agg(**{'Total Passenger': ('Passenger Count', 'sum')})
        #reset_index turns Period back into a regular column
        return totals.reset_index()

generate_modified_data.py :


In [3]:
#inside generate_modified_data.py
#importing all required packages 
import prepare_data #relative package to python file : prepare_data.py which contain our custom datawrangling tool 
from contextlib import contextmanager
#importing tqdm for progress monitoring purpose
from tqdm import tqdm 

#context manager that temporarily switches the working directory to a sibling folder (../<path>) and restores it afterwards
@contextmanager
def change_path(path):
    """
    Temporarily change the working directory to `path`, resolved from the
    parent of the current working directory, and restore it on exit.

    Args:
        path: directory to enter after moving one level up
    Raises:
        OSError: if the parent directory or `path` cannot be entered; the
            previous working directory is restored before the error propagates
    """
    import os
    #saving prev cwd before switch
    prev_cwd = os.getcwd()
    try:
        #both switches happen inside the try so a failing second chdir
        #does not leave the process stranded in the parent directory
        #changing to upper dir (data)
        os.chdir('..')
        #changing path
        os.chdir(path)
        yield
    finally:
        #turning back to prev cwd path
        os.chdir(prev_cwd)
        
    
#function wrapping the data-wrangling steps and saving the two resulting datasets : 1. EDA dataset 2. Forecast dataset
def begin_process():
    """
    Run the wrangling pipeline: load the raw CSV from the sibling
    'original_data' folder, build the EDA and forecast dataframes and write
    them as 'prepare_for_eda.csv' and 'forecast_data.csv' into the sibling
    'ready_to_process_data' folder. The first four rows of each dataframe
    are printed and a progress bar is advanced by 50 after each dataset.
    """
    #instantiating the DataWrangler class
    wrangler = prepare_data.DataWrangler()
    #the with-block closes the progress bar even when one of the steps raises
    with tqdm(total=100) as pbar:
        #using contextmanager to change path temporarily and read the .csv file
        with change_path('original_data'):
            data = wrangler.load_data()
        #prepare eda data
        eda_data = wrangler.prepare_eda_data(data)
        #update progress
        pbar.update(n=50)
        #save to ready_to_process_data folder
        with change_path('ready_to_process_data'):
            eda_data.to_csv('prepare_for_eda.csv', index=False)
        print(eda_data.head(4))

        #the output contains two columns : Period and Total Passenger
        forecast_data = wrangler.prepare_forecast_data(eda_data)
        print(forecast_data.head(4))
        #save to ready_to_process_data folder
        with change_path('ready_to_process_data'):
            forecast_data.to_csv('forecast_data.csv', index=False)
        #update progress
        pbar.update(n=50)
    print('\n Successfully created prepare_for_eda.csv and forecast_data.csv')
    
    
#run the pipeline only when this file is executed as a script, not when it is imported
if __name__ == '__main__' : 
    begin_process()