In [249]:
#%%writefile myfunctions.py
import numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datetime import datetime, timedelta
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import PowerTransformer,StandardScaler
from pickle import load,dump
def cols_types(df:pd.DataFrame):
    """Derive calendar features from ``FlightDate`` and normalise column types.

    The frame is modified in place and also returned.

    Columns written:

    * ``Month``, ``DayOfWeek``, ``DayofMonth`` -- strings produced by
      formatting ``FlightDate`` with ``'%m'``, ``'%u'`` and ``'%d'``.
    * ``CRSDepTime``, ``CRSArrTime`` -- each value replaced by
      ``convert_to_min(value)``.
    * ``Flight_Number_Marketing_Airline``, ``Flight_Number_Operating_Airline``
      -- cast to ``object`` dtype.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns listed above; ``FlightDate`` must be a
        datetime64 column because the ``.dt`` accessor is used.

    Returns
    -------
    pd.DataFrame
        The same object that was passed in.
    """
    flight_date = df['FlightDate']
    # Vectorised formatting: avoids one label lookup per row and does not
    # break when the index contains duplicate labels.
    df['Month'] = flight_date.dt.strftime('%m')
    df['DayOfWeek'] = flight_date.dt.strftime('%u')
    df['DayofMonth'] = flight_date.dt.strftime('%d')

    for time_col in ('CRSDepTime', 'CRSArrTime'):
        df[time_col] = df[time_col].apply(convert_to_min)

    for number_col in ('Flight_Number_Marketing_Airline',
                       'Flight_Number_Operating_Airline'):
        df[number_col] = df[number_col].astype('object')
    return df

def convert_to_min(x):
    """Convert an ``HHMM`` clock value (e.g. ``1430`` for 14:30) to a float.

    Despite the function name, the result is expressed in seconds:
    ``172800 + 3600 * HH + 60 * MM``. That is the value the previous
    ``datetime``-based implementation produced on a machine whose local
    timezone is UTC (it built ``1970-01-03 HH:MM`` and called the naive
    ``datetime.timestamp()``). Computing it arithmetically removes the
    dependence on the local timezone, so values are reproducible across
    machines.

    Parameters
    ----------
    x : int or integral float
        Clock time encoded as ``HHMM`` with ``0 <= HH <= 23`` and
        ``0 <= MM <= 59`` (``5`` means 00:05, ``959`` means 09:59).

    Returns
    -------
    float
        Offset in seconds as described above.

    Raises
    ------
    ValueError
        If ``x`` is NaN, not a whole number, or not a valid ``HHMM`` time
        (for example ``75``, ``1260`` or ``2400``).
    """
    value = int(x)  # raises ValueError for NaN
    if value != x:
        raise ValueError('not a whole HHMM value: %r' % (x,))

    hours, minutes = divmod(value, 100)
    # Reject values such as 75 (minute field 75) instead of silently
    # reading them as a different time.
    if not (0 <= hours <= 23 and 0 <= minutes <= 59):
        raise ValueError('not a valid HHMM time: %r' % (x,))

    base_seconds = 2 * 24 * 3600  # 1970-01-03T00:00:00Z, kept for compatibility
    return float(base_seconds + hours * 3600 + minutes * 60)


def outcome(df):
    """Classify one flight record as 'Cancelled', 'Delayed', 'Diverted' or 'OK'.

    ``df`` is a single record (anything indexable by column name, e.g. a row
    passed by ``DataFrame.apply(..., axis=1)``). The checks run in priority
    order and stop at the first match:

    1. truthy ``Cancelled``      -> 'Cancelled'
    2. ``ArrDelayMinutes`` > 0   -> 'Delayed'
    3. truthy ``Diverted``       -> 'Diverted'
    4. otherwise                 -> 'OK'
    """
    if df['Cancelled']:
        label = 'Cancelled'
    elif df['ArrDelayMinutes'] > 0:
        label = 'Delayed'
    elif df['Diverted']:
        label = 'Diverted'
    else:
        label = 'OK'
    return label

def filtered(df, days=730):
    """Keep recent, non-diverted flights.

    A row is kept when its ``FlightDate`` plus ``days`` days is strictly
    later than the latest ``FlightDate`` in the frame, and its ``Diverted``
    value is falsy.

    Parameters
    ----------
    df : pd.DataFrame
        Needs a datetime64 ``FlightDate`` column and a ``Diverted`` column.
    days : int, default 730
        Length of the look-back window, measured back from the latest
        ``FlightDate`` present in ``df``.

    Returns
    -------
    pd.DataFrame
        A new frame (an independent copy, so callers can add columns to it
        without a SettingWithCopyWarning). An empty input yields an empty
        result.
    """
    # Series.max() returns NaT on an empty frame instead of raising the way
    # the builtin max() does; the comparison below is then all-False.
    latest = df['FlightDate'].max()
    recent = df[(df['FlightDate'] + pd.Timedelta(days=days)) > latest]
    # Cast before negating: `~` on a 0/1 integer column is a bitwise NOT
    # (-1/-2), not a logical one.
    not_diverted = ~recent['Diverted'].astype(bool)
    return recent[not_diverted].copy()

def add_marketing(df):
    """Map each ``IATA_Code_Operating_Airline`` value to its ``Airline`` value.

    The frame is grouped by the (code, airline) pair to obtain the distinct
    combinations, and those are folded into a dict keyed by code. If one
    code appears with several airline names, the pair that comes last in the
    grouped order is the one kept.
    """
    code_col = 'IATA_Code_Operating_Airline'
    name_col = 'Airline'
    pairs = df.groupby([code_col, name_col]).count().reset_index()
    return dict(zip(pairs[code_col], pairs[name_col]))

def clean(df):
    """Turn a raw flights frame into the modelling frame.

    Steps, in order:

    1. Parse ``FlightDate`` with ``pd.to_datetime`` unless it already has a
       datetime64 dtype.
    2. Apply ``filtered`` to drop the rows it excludes.
    3. Add a ``result`` column by applying ``outcome`` to every row.
    4. Apply ``cols_types`` for the calendar and time columns.
    5. Select the fourteen columns listed in the return statement.

    The caller's frame is not modified; a new frame is returned.
    """
    # Check for "not datetime" rather than "== object": string-typed date
    # columns do not have object dtype under pandas' string dtypes.
    if not pd.api.types.is_datetime64_any_dtype(df['FlightDate']):
        df = df.assign(FlightDate=pd.to_datetime(df['FlightDate']))

    # Work on an explicit copy so the column assignments below do not write
    # into a slice of the caller's frame.
    df = filtered(df).copy()
    df['result'] = df.apply(outcome, axis=1)
    df = cols_types(df)
    df['DayofMonth'] = df['DayofMonth'].astype('str')
    return df[['Month','DayofMonth','DayOfWeek','Origin', 'Dest','IATA_Code_Marketing_Airline','IATA_Code_Operating_Airline','Flight_Number_Marketing_Airline','Flight_Number_Operating_Airline','ArrDelayMinutes','Distance','CRSDepTime','CRSArrTime','result']]

def transformx(df:pd.DataFrame,y1,train:bool=True,x:str=''):
    """Power-transform and standard-scale the numeric columns of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Feature frame. Numeric columns are transformed; ``object`` columns
        are passed through unchanged; columns of any other dtype are dropped.
    y1
        Forwarded as the ``y`` argument of both ``fit`` calls.
    train : bool, default True
        When True, fit a ``PowerTransformer`` and a ``StandardScaler`` and
        pickle them to ``../Transformers/PT<x>.pkl`` and
        ``../Scalers/ST<x>.pkl``. When False, load them from those paths.
    x : str, default ''
        Suffix inserted into the two pickle file names.

    Returns
    -------
    pd.DataFrame
        Transformed numeric columns followed by the object columns, on the
        same index as ``df``.
    """
    num = df._get_numeric_data()
    cat = df.select_dtypes('object')
    pt_path = '../Transformers/PT'+x+'.pkl'
    st_path = '../Scalers/ST'+x+'.pkl'

    if train:
        PT = PowerTransformer().fit(num, y1)
        # The scaler is applied to power-transformed values below, so it has
        # to be fitted on power-transformed values as well.
        ST = StandardScaler().fit(PT.transform(num), y1)
        with open(pt_path, 'wb') as file:
            dump(PT, file)
        with open(st_path, 'wb') as file:
            dump(ST, file)
    else:
        # NOTE: unpickling executes arbitrary code; only load files that
        # this project wrote itself.
        with open(pt_path, 'rb') as file:
            PT = load(file)
        with open(st_path, 'rb') as file:
            ST = load(file)

    scaled = ST.transform(PT.transform(num))
    # Keep the original labels: without index=..., the frame gets a fresh
    # RangeIndex and concat(axis=1) would misalign it with `cat`.
    num = pd.DataFrame(scaled, columns=num.columns, index=num.index)

    return pd.concat([num, cat], axis=1)

def transformy(y1:pd.Series,y2:pd.Series,train:bool=True,x=''):
    """Power-transform the series ``y2``.

    Parameters
    ----------
    y1 : pd.Series
        Forwarded as the ``y`` argument of ``PowerTransformer.fit``.
    y2 : pd.Series
        Values to transform.
    train : bool, default True
        When True, fit a ``PowerTransformer`` on ``y2`` and pickle it to
        ``../Transformers/PTy<x>.pkl``. When False, load the transformer
        from that path (same convention as ``transformx``).
    x : str, default ''
        Suffix inserted into the pickle file name.

    Returns
    -------
    pd.Series
        Transformed ``y2`` with the same index and name.
    """
    # scikit-learn transformers require 2-D input, so wrap the series in a
    # single-column frame.
    target = y2.to_frame()
    path = '../Transformers/PTy'+x+'.pkl'

    if train:
        PTy = PowerTransformer().fit(target, y1)
        with open(path, 'wb') as file:
            dump(PTy, file)
    else:
        # NOTE: unpickling executes arbitrary code; only load files that
        # this project wrote itself.
        with open(path, 'rb') as file:
            PTy = load(file)

    return pd.Series(PTy.transform(target)[:, 0], index=y2.index, name=y2.name)
        


Index(['FlightDate', 'Airline', 'Origin', 'Dest', 'Cancelled', 'Diverted',
       'CRSDepTime', 'DepTime', 'DepDelayMinutes', 'DepDelay', 'ArrTime',
       'ArrDelayMinutes', 'AirTime', 'CRSElapsedTime', 'ActualElapsedTime',
       'Distance', 'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
       'Marketing_Airline_Network', 'Operated_or_Branded_Code_Share_Partners',
       'DOT_ID_Marketing_Airline', 'IATA_Code_Marketing_Airline',
       'Flight_Number_Marketing_Airline', 'Operating_Airline',
       'DOT_ID_Operating_Airline', 'IATA_Code_Operating_Airline',
       'Tail_Number', 'Flight_Number_Operating_Airline', 'OriginAirportID',
       'OriginAirportSeqID', 'OriginCityMarketID', 'OriginCityName',
       'OriginState', 'OriginStateFips', 'OriginStateName', 'OriginWac',
       'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'DestCityName',
       'DestState', 'DestStateFips', 'DestStateName', 'DestWac', 'DepDel15',
       'DepartureDelayGroups', 'DepTimeBlk', 'TaxiOu

In [247]:
# Load the raw sample and clean it. The raw frame keeps its own name so that
# re-running this cell always starts from the unmodified data.
sample_raw=pd.read_csv('../Data/Raw/sample.csv')
df2=clean(sample_raw)
# Summarise the cleaned frame (this cell defines `df2`, not `df`).
df2.info()


<class 'pandas.core.frame.DataFrame'>
Index: 103629 entries, 0 to 103900
Data columns (total 14 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Month                            103629 non-null  object 
 1   DayofMonth                       103629 non-null  object 
 2   DayOfWeek                        103629 non-null  object 
 3   Origin                           103629 non-null  object 
 4   Dest                             103629 non-null  object 
 5   IATA_Code_Marketing_Airline      103629 non-null  object 
 6   IATA_Code_Operating_Airline      103629 non-null  object 
 7   Flight_Number_Marketing_Airline  103629 non-null  object 
 8   Flight_Number_Operating_Airline  103629 non-null  object 
 9   ArrDelayMinutes                  101257 non-null  float64
 10  Distance                         103629 non-null  float64
 11  CRSDepTime                       103629 non-null  float64
 12  CRSArrT

In [None]:
# NOTE(review): none of these four names is assigned in the cells visible
# here, so this expression presumably raises NameError on a fresh kernel.
# TODO confirm they are produced elsewhere (e.g. by a train_test_split call).
X_train, X_test, Y_train, Y_test