# Function Codes

In [1]:
def dataCleaning(df, code=True, tips=False, orientation=True, formatIssues=True, missingValues=True, duplicateValues=True, outliers=True):
    """
    Print a guided data-cleaning report for a DataFrame.

    Nothing is modified and nothing is returned: the function only prints
    diagnostics together with ready-to-copy code snippets (and, for the
    outlier section, shows plots). Each flag is evaluated for truthiness.

    df: your dataframe

    code: A text template to note your observations as you go. Use the code snippets included in the output. copy-paste into vscode/notepad

    tips: Provides snippets of code to help you clean potential issues in your df. If you prefer this to code

    orientation: Provides information about the shape/objects of your data

    formatIssues: Provides detailed information on each column to help identify format issues

    missingValues: Provides information on missing values

    duplicateValues: Provides information on duplicate values

    outliers: Provides information on outliers
    """
    import pandas as pd

    separator = "========================================="

    def emit(*lines):
        # One print per line; an empty string yields a blank line.
        for line in lines:
            print(line)

    if code:
        emit(
            "### CLEANING CODE:",
            "df = dfX #Change to your df's name",
            "",
            "#### Change column value:",
            "",
            "",
            "#### Drop entire column:",
            "",
            "",
            "#### Change column type:",
            "",
            "",
            "#### Change column name:",
            "",
            "",
            "#### Handle missing values:",
            "",
            "",
            "#### Handle duplicate values:",
            "# df.drop_duplicates(inplace=True) # drop ALL duplicate rows",
            "",
            "#### Drop outliers:",
            "",
            "",
            "#### Other observations / further investigations:",
            "#",
            "#",
            "#",
            "",
            "df.head() #Final Review",
            "# dfX = df #Change to your df's name",
            "",
            separator,
        )

    if orientation:
        print("ORIENTATION")
        # info() writes its summary to stdout itself and returns None, so it
        # must not be wrapped in print() (that would add a stray "None" line).
        df.info()
        emit(separator, "")

    if formatIssues:
        emit("FORMAT ISSUES", "")
        for col in df.columns:
            dtype = df[col].dtype
            # Datetime columns are detected with the pandas helper: comparing a
            # datetime64[ns] dtype to the unit-less string 'datetime64' is False.
            inspectable = (
                dtype == 'object'
                or dtype == 'int64'
                or dtype == 'float64'
                or pd.api.types.is_datetime64_any_dtype(dtype)
            )
            if not inspectable:
                continue

            # repr() gives 'name' for string labels and also copes with
            # non-string labels (e.g. integer column names).
            label = repr(col)
            column_ref = "df[" + label + "]"
            astype_hint = column_ref + " = " + column_ref + ".astype('new_type') # new_type can be int64, float64, object, category, datetime64"

            print("df.rename(columns={" + label + ": ''}, inplace=True)", "#rename column")
            print(column_ref + " = " + column_ref + ".replace('old_value', 'new_value')")
            print(astype_hint)
            print("df.drop(" + label + ", axis=1, inplace=True)")
            # Show every distinct value; option_context restores whatever
            # display.max_rows the user had configured before the call.
            with pd.option_context('display.max_rows', None):
                print(df.groupby(col, sort=True).size())
            print("Current Column DType: ", dtype, "     Do not compare with above. This one will always return int64 as it's the dtype of the count")
            print(astype_hint)
            print()

        if tips:
            emit(
                "TIPS",
                "To make a correction to a column, use the following syntax:",
                "df['A'] = df['A'].apply(lambda x: x.replace('old_value', 'new_value'))",
                "",
                "To change a column data type, use the following syntax:",
                "df['A'] = pd.to_datetime(df['A']) # for datetime",
                "df['A'] = df['A'].astype('int64') # for integers",
                "df['A'] = df['A'].astype('float64') # for floats",
                "df['A'] = df['A'].astype('category') # for categorical",
                "df['A'] = df['A'].astype('object') # for object",
                "",
            )
        emit(separator, "")

    if missingValues:
        emit("MISSING VALUES", "")
        for col in df.columns:
            missing_mask = df[col].isnull()
            missing_count = missing_mask.sum()
            if missing_count > 0:
                label = repr(col)
                column_ref = "df[" + label + "]"
                print(col, ":", missing_count, " missing values")
                print("df.dropna(subset=[" + label + "], inplace=True)")
                print(column_ref + ".fillna(" + column_ref + ".mean(), inplace=True) #fill NA entries with the mean")
                print(column_ref + ".fillna(0, inplace=True) # fill NA entries with a single value, such as zero")
                print()
                print(df.loc[missing_mask].head())
                print()
            else:
                print(col, ": No missing values")
                print()

        if tips:
            emit(
                "",
                "TIPS",
                "You can drop rows with missing values using one of the following code:",
                "df.dropna(subset=['col'], inplace=True) #For a single column",
                "df.dropna(inplace=True) #For all columns",
                "",
                "You can fill rows with missing values using one of the following code:",
                "df['col'].fillna(df['col'].mean(), inplace=True) #fill NA entries with the mean",
                "df['col'].fillna(0, inplace=True) # fill NA entries with a single value, such as zero",
                "df['col'].fillna(method='ffill') # forward-fill to propagate the previous value forward",
                "df['col'].fillna(method='bfill') # back-fill to propagate the next values backward",
                "",
                "To view them:",
                "df.loc[df[col].isnull()].head()",
                "",
            )
        emit(separator, "")

    if duplicateValues:
        emit("DUPLICATE VALUES", "")
        print(df[df.duplicated()].head())
        print()

        if tips:
            emit(
                "TIPS",
                "You can drop duplicate rows using the following code:",
                "df.drop_duplicates(inplace=True)",
                "df.drop_duplicates(subset=['col'], inplace=True) #For a single column",
                "",
                "To view them:",
                "df[df.duplicated()].head()",
                "",
            )

        emit(separator, "")

    if outliers:
        # Plotting libraries are only needed for this section, so they are
        # imported here; the other sections work without them installed.
        import matplotlib.pyplot as plt
        import seaborn as sns

        emit("OUTLIERS", "")
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                values = df[col]
                # Outlier bounds: mean +/- 3 standard deviations, computed once.
                center = values.mean()
                spread = 3 * values.std()
                low = center - spread
                high = center + spread
                column_ref = "df[" + repr(col) + "]"

                print(col)
                print("-----")
                print("Outlier(s):")
                print("Below ", low, " -> ", int((values < low).sum()), " low outlier(s)")
                print("Above ", high, " -> ", int((values > high).sum()), " high outlier(s)")
                print("df = df[(" + column_ref + " > " + str(low) + ") & (" + column_ref + " < " + str(high) + ")]")
                print()
                print(values.describe())
                print()
                print("Boxplot")
                sns.boxplot(df[col])
                plt.show()
                print()
                print("Histogram")
                sns.histplot(df[col])
                plt.show()
                emit(separator, "")

        if tips:
            emit(
                "TIPS",
                "You can drop outliers using the following code:",
                "df = df[(df['column'] > lower_bound) & (df['column'] < upper_bound)]",
                "",
            )
            print()

# Import

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Archived: our first sample, which was not pulled randomly.
# flights_sample = pd.read_csv("../../data/raw/100K_pull_flights(26Nov).csv")

# Load the random sample first, then the raw test set, from the raw data folder.
flights_sample, flights_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/raw/200K_random_flights(26Nov).csv",
        "../../data/raw/FLIGHTS_TEST_RAW.csv",
    )
)

# Data Preparation - Common to Sample & Test

In this step we run the `dataCleaning` function defined above to identify formatting issues, columns to drop, outliers, missing values, duplicates, etc. To keep this notebook readable, the call is commented out once completed. The results are consolidated in the cleaning code below:

In [4]:
#dataCleaning(flights_sample)

As we'll be running both the test and sample sets through this cleaning code, we've put it into a function:

In [5]:
def flights_clean(flights):
    """
    Apply the cleaning steps shared by the sample and test flight sets.

    Drops unused columns, parses the flight date, reformats the scheduled
    departure/arrival times as "HH:MM" strings, renames the remaining
    columns to readable labels, and adds derived columns (carrier mismatch
    flag, weekday/day/month/year, scheduled departure/arrival hour).

    flights: raw flights DataFrame. It is NOT modified; a new cleaned
             DataFrame is returned.

    Raises KeyError if one of the expected raw columns is missing.
    """
    #### Drop columns:
    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    df = flights.drop(columns=[
        'branded_code_share',  # Using Unique Carrier Code for analysis
        'mkt_carrier',         # Using Unique Carrier Code for analysis
        'mkt_carrier_fl_num',  # using op_carrier_fl_num instead
        'origin_airport_id',   # working with IATA codes instead
        'dest_airport_id',     # working with IATA codes instead
        'dup',                 # All the same value
        'flights',             # All the same value
        'dest_city_name',      # Deemed not useful
        'origin_city_name',    # Deemed not useful
    ])

    #### Change column type:
    # pd.to_datetime instead of astype('datetime64'): the unit-less dtype
    # string is rejected by pandas >= 2.0.
    df['fl_date'] = pd.to_datetime(df['fl_date'])

    #### Change column value:
    # NOTE(review): the raw value is interpreted as a number of minutes
    # (unit='m'). If these columns are actually HHMM-coded clock times
    # (e.g. 1430 for 14:30), this conversion yields wrong times and wrong
    # hour columns below -- confirm against the data dictionary.
    for time_col in ('crs_dep_time', 'crs_arr_time'):
        df[time_col] = pd.to_datetime(df[time_col], unit='m', errors='coerce').dt.strftime("%H:%M")

    #### Change column name:
    df = df.rename(columns={
        'fl_date': 'Flight Date',
        'mkt_unique_carrier': 'Marketer - Unique Carrier Code',
        'op_unique_carrier': 'Operator - Unique Carrier Code',
        'op_carrier_fl_num': 'Flight Number',
        'tail_num': 'Tail Number',
        'origin': 'Origin Airport (IATA Code)',
        'dest': 'Destination Airport (IATA Code)',
        'crs_dep_time': 'Scheduled Departure Time (local time)',
        'crs_arr_time': 'Scheduled Arrival Time (local time)',
        'crs_elapsed_time': 'Scheduled Elapsed Time',
        'distance': 'Distance (miles)',
    })

    # Is op_unique_carrier a duplicate of mkt_unique_carrier? No, we'll keep both and create an add'l column to highlight when they are not the same
    df['Different Marketer & Operator Carrier Code'] = np.where(
        df['Marketer - Unique Carrier Code'] != df['Operator - Unique Carrier Code'], 1, 0
    )

    # Create a column with the weekday/day/month/year of the flight
    flight_date = df['Flight Date']
    df['Flight Weekday'] = flight_date.dt.weekday   #0: Monday, 1:Tuesday, etc.
    df['Flight Day'] = flight_date.dt.day
    df['Flight Month'] = flight_date.dt.month
    df['Flight Year'] = flight_date.dt.year
    # The date itself is no longer needed once it has been split into parts.
    df = df.drop(columns='Flight Date')

    # Create hour columns (the leading "HH" of the "HH:MM" strings)
    df["Scheduled hour of departure"] = df["Scheduled Departure Time (local time)"].str.split(":").str[0].astype(int)
    df["Scheduled hour of arrival"] = df["Scheduled Arrival Time (local time)"].str.split(":").str[0].astype(int)

    return df

In [6]:
# Run the shared cleaning on the sample first, then on the test dataset.
flights_sample, flights_test = map(flights_clean, (flights_sample, flights_test))

# Export Flights Test to CSV

This is the end of the road for our flights test. 

In [7]:
# Sanity check: (rows, columns) of the test set after flights_clean.
flights_test.shape

(660556, 17)

In [8]:
# Write the cleaned test set to CSV; index=False leaves the row index out of the file.
flights_test.to_csv('../../data/raw/Cleaned-flights_test.csv',index=False)

We have some additional work to do with the sample one, detailed below

# Additional cleaning steps for sample

The sample has additional columns. Although they contain useful information, they are not included in the test set, so we'll remove most of them.

As well, we have the delay information which we'll use at a later step. We'll clean those.

In [9]:
### CLEANING CODE (Sample only):

#### Drop columns
df = flights_sample.drop(columns=[
    'dep_time',
    'taxi_out',
    'taxi_in',
    'wheels_off',
    'wheels_on',
    'arr_time',
    'actual_elapsed_time',
    'air_time',
    'first_dep_time',     #99125  missing values
    'total_add_gtime',    #99125  missing values
    'longest_add_gtime',  #99125  missing values
    'no_name',
])

### Formatting the additional delay columns:
# Raw delay column -> readable label. Every delay column gets the same
# treatment: missing values become 0, then the column is cast to int64.
# (dep_delay didn't really have missing values, but is filled just in case.)
delay_labels = {
    'dep_delay': 'Departure Delay (minutes)',
    'arr_delay': 'Arrival Delay (minutes)',
    'carrier_delay': 'Carrier Delay (minutes)',
    'weather_delay': 'Weather Delay (minutes)',
    'nas_delay': 'National Air System Delay (minutes)',
    'security_delay': 'Security Delay (minutes)',
    'late_aircraft_delay': 'Late Aircraft Delay (minutes)',
}
for raw_name in delay_labels:
    df[raw_name] = df[raw_name].fillna(0).astype('int64')
df = df.rename(columns=delay_labels)

#df.head(10) #Final Review
flights_sample = df

## Below (collapsed) is the first Cleaning Code we used... Only kept for archive purposes...

# Split the sample into a Train and a Test Set

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 20% of the cleaned sample (test_size=0.2); the fixed random_state keeps the split reproducible.
flights_sample_train, flights_sample_test = train_test_split(flights_sample, test_size=0.2, random_state=42)

In [12]:
# Renumber the shuffled training rows from 0 and discard the old index.
flights_sample_train = flights_sample_train.reset_index(drop=True)
#flights_sample_train.head(2)

In [13]:
# Renumber the shuffled hold-out rows from 0 and discard the old index.
flights_sample_test = flights_sample_test.reset_index(drop=True)
#flights_sample_test.head(2)

# Additional Steps for the training Set

We decided to remove the outliers from the training set to try to increase accuracy. We've also removed the cancelled and diverted flights for the same reason. As this is not something we can do for the test set, we are only applying this step to the training dataset.

In [14]:
### CLEANING CODE (Training set only):
df = flights_sample_train

# Keep only flights that were neither cancelled nor diverted
for flag in ('cancelled', 'diverted'):
    df = df[df[flag] == 0]

# Remove outliers (outside mean +/- 3 standard deviations), first on the
# departure delay and then on the arrival delay. The arrival bounds are
# computed on the rows that survived the departure filter.
for delay_col in ('Departure Delay (minutes)', 'Arrival Delay (minutes)'):
    center = df[delay_col].mean()
    spread = 3 * df[delay_col].std()
    low = center - spread
    high = center + spread
    df = df[(df[delay_col] > low) & (df[delay_col] < high)]

flights_sample_train = df

# Export to CSV

In [15]:
# Write the training and hold-out splits of the sample to CSV, without the row index.
flights_sample_train.to_csv('../../data/raw/Cleaned-flights_sample_training.csv', index=False)
flights_sample_test.to_csv('../../data/raw/Cleaned-flights_sample_testing.csv',index=False)

We'll also store a flights sample+test, but only for testing purposes for the enrichments (ie. making sure we have all the airports and such). This will not be used in later steps.

In [16]:
# Stack the test set on top of the sample, keeping only the columns the test set has.
stacked = pd.concat([flights_test, flights_sample])
flights = stacked[flights_test.columns]

flights.to_csv('../../data/raw/flights_sample+test.csv', index=False)