In [1]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Early morning 03.00 - 06.00
# Morning 06.00 - 12.00
# Afternoon 12.00 - 17.00
# Evening 17.00 - 20.00
# Night 20.00 - 03.00 

def convert_timeperiod(time):
    """Map an HHMM clock value to a named part of the day.

    ``time`` is coerced with ``int`` and split into hour and minute. The
    minute adds ``minute // 30`` to the hour (30-59 minutes bump the hour by
    one) before the bucket is picked.

    Buckets are tested in this order, so a shared boundary belongs to the
    earlier one: 6-11 'Morning', 11-17 'Afternoon', 17-20 'Evening',
    20-24 or <= 3 'Night', anything else 'EarlyMorning'.
    """
    hhmm = int(time)
    bumped_hour = int(hhmm / 100) + (hhmm % 100) // 30

    if 6 <= bumped_hour <= 11:
        return 'Morning'
    if 11 <= bumped_hour <= 17:
        return 'Afternoon'
    if 17 <= bumped_hour <= 20:
        return 'Evening'
    if 20 <= bumped_hour <= 24 or bumped_hour <= 3:
        return 'Night'
    return 'EarlyMorning'



In [None]:
def convert_preprocess(data,process=None,mode=None):
    """Transform ``data`` with ``process``, fitting it first when ``mode`` is truthy.

    Parameters
    ----------
    data : object accepted by ``process.fit`` / ``process.transform``
    process : object exposing ``fit`` and ``transform``
    mode : any, optional
        Truthy -> ``process.fit(data)`` is called and its return value is used
        for the transform; falsy -> ``process`` is used as given.

    Returns
    -------
    tuple
        ``(transformed_data, process_used)``.
    """
    fitted = process.fit(data) if mode else process
    return fitted.transform(data), fitted

In [None]:
def eveulatemodel(y_true,predict):
    """Print RMSE and R^2 for a set of predictions and plot predicted vs. true.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    predict : array-like
        Model predictions, same length as ``y_true``.

    Returns
    -------
    tuple
        ``(rmse, r2)``: root mean squared error and coefficient of
        determination, both as numbers.
    """
    # mean_squared_error yields the MSE; take the root so the value matches
    # the 'rmse' label it is printed under.
    score = mean_squared_error(y_true, predict) ** 0.5
    r2score = r2_score(y_true, predict)
    print(f'rmse score {score}')
    print(f'r2 score {r2score}')

    # True values on x and predictions on y, so the points agree with the
    # axis labels set below.
    plt.scatter(y_true, predict, c='red')

    # Identity line spanning the full value range: a perfect model would put
    # every point on it.
    upper = max(max(predict), max(y_true))
    lower = min(min(predict), min(y_true))
    plt.plot([upper, lower], [upper, lower], 'b-')
    plt.xlabel('True Values', fontsize=15)
    plt.ylabel('Predictions', fontsize=15)
    plt.axis('equal')
    plt.show()
    # Return the computed R^2 value, not the ``r2_score`` function object.
    return score, r2score
    

In [None]:
def preparedata(data):
    """Build the modelling frame: engineered categorical features plus numeric columns.

    The input is copied first, so the caller's DataFrame (often a filtered
    subset of a larger frame) is left untouched and no chained-assignment
    warning is raised.

    Derived columns:

    * ``CRSDepTimeHourDis`` / ``WheelsOffHourDis`` / ``CRSArrTimeHourDis``:
      part-of-day label from ``convert_timeperiod`` applied to the HHMM string
      built from the matching ``*Hour`` and ``*Minute`` columns.
    * ``state_combine``: ``<origin>_<dest>`` using the text after the last
      space of ``OriginCityName`` / ``DestCityName``; pairs not listed in
      ``top_state`` become ``'Other'``.
    * ``location_yn``: 1 when the origin and destination suffixes are equal.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the ``*Hour`` / ``*Minute`` columns, the two city-name
        columns, ``Holidays`` and every column in ``cat_col`` / ``int_col``
        that is not derived here.

    Returns
    -------
    pandas.DataFrame
        Only the ``cat_col`` columns (cast to ``category``) followed by the
        ``int_col`` columns (cast to ``int``).
    """
    top_state = {'CA_CA', 'TX_TX', 'NY_FL', 'FL_NY', 'CA_TX', 'TX_CA', 'FL_TX', 'HI_HI', 'TX_FL', 'FL_GA', 'CA_NV', 'NV_CA', 'GA_FL', 'CA_WA', 'CA_AZ', 'AZ_CA', 'WA_CA', 'CA_CO', 'CO_CA', 'FL_NC', 'NC_FL', 'TX_CO', 'CO_TX', 'NJ_FL', 'FL_NJ', 'IL_FL', 'FL_IL', 'CA_HI', 'FL_FL', 'HI_CA', 'CO_CO', 'NC_NC', 'NY_NC', 'NY_IL', 'IL_NY', 'NC_NY', 'IL_TX', 'TX_IL', 'WA_WA', 'PA_FL', 'FL_PA', 'LA_TX', 'OR_CA', 'TX_LA', 'CA_OR', 'CA_UT', 'TX_GA', 'DC_FL', 'UT_CA', 'GA_TX'}
    cat_col = ['Marketing_Airline_Network', 'DayofWeek','Holidays', 'CRSDepTimeHourDis', 'WheelsOffHourDis','CRSArrTimeHourDis', 'state_combine', 'location_yn']
    int_col = [ 'DepDelay', 'TaxiOut', 'CRSElapsedTime','ActualElapsedTime', 'Distance', 'WeatherDelay']

    # Never write into the caller's frame.
    data = data.copy()

    # HHMM strings: hour as-is, minute zero-padded to two digits.
    for prefix in ('CRSDepTime', 'WheelsOff', 'CRSArrTime'):
        data[prefix] = (
            data[prefix + 'Hour'].astype('string')
            + data[prefix + 'Minute'].astype('string').str.zfill(2)
        )

    # State code = whatever follows the last space of the city name.
    data['origin_state'] = data['OriginCityName'].apply(lambda x: x[x.rfind(' ')+1:])
    data['dest_state'] = data['DestCityName'].apply(lambda x: x[x.rfind(' ')+1:])
    data['state_combine'] = data['origin_state'] +'_' + data['dest_state']
    data['location_yn'] = (data['origin_state'] == data['dest_state']).astype('int')
    data['Holidays'] = data['Holidays'].astype('int')

    for prefix in ('CRSDepTime', 'WheelsOff', 'CRSArrTime'):
        data[prefix + 'HourDis'] = data[prefix].apply(convert_timeperiod)

    # Collapse every state pair outside the fixed list into one bucket.
    data['state_combine'] = data['state_combine'].where(data['state_combine'].isin(top_state), 'Other')

    # Explicit copy so the dtype casts below act on an independent frame.
    data = data[cat_col + int_col].copy()
    data[cat_col] = data[cat_col].astype('category')
    data[int_col] = data[int_col].astype('int')
    return data

In [None]:
def preparedata_prod(data):
    """Add the engineered feature columns to a DataFrame of flights.

    Same feature engineering as ``preparedata`` but without the final column
    selection and dtype casts: every input column is kept and the derived
    columns are appended. The input is copied first, so the caller's frame is
    not modified.

    NOTE(review): a later cell defines another ``preparedata_prod`` that takes
    a single record dict; once that cell runs it shadows this function.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the ``CRSDepTime*``, ``WheelsOff*`` and ``CRSArrTime*``
        ``Hour`` / ``Minute`` columns, ``OriginCityName``, ``DestCityName``
        and ``Holidays``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``CRSDepTime``, ``WheelsOff``, ``CRSArrTime``,
        ``origin_state``, ``dest_state``, ``state_combine``, ``location_yn``
        and the three ``*HourDis`` columns added, and ``Holidays`` cast to int.
    """
    top_state = {'CA_CA', 'TX_TX', 'NY_FL', 'FL_NY', 'CA_TX', 'TX_CA', 'FL_TX', 'HI_HI', 'TX_FL', 'FL_GA', 'CA_NV', 'NV_CA', 'GA_FL', 'CA_WA', 'CA_AZ', 'AZ_CA', 'WA_CA', 'CA_CO', 'CO_CA', 'FL_NC', 'NC_FL', 'TX_CO', 'CO_TX', 'NJ_FL', 'FL_NJ', 'IL_FL', 'FL_IL', 'CA_HI', 'FL_FL', 'HI_CA', 'CO_CO', 'NC_NC', 'NY_NC', 'NY_IL', 'IL_NY', 'NC_NY', 'IL_TX', 'TX_IL', 'WA_WA', 'PA_FL', 'FL_PA', 'LA_TX', 'OR_CA', 'TX_LA', 'CA_OR', 'CA_UT', 'TX_GA', 'DC_FL', 'UT_CA', 'GA_TX'}

    # Never write into the caller's frame.
    data = data.copy()

    # HHMM strings: hour as-is, minute zero-padded to two digits.
    for prefix in ('CRSDepTime', 'WheelsOff', 'CRSArrTime'):
        data[prefix] = (
            data[prefix + 'Hour'].astype('string')
            + data[prefix + 'Minute'].astype('string').str.zfill(2)
        )

    # State code = whatever follows the last space of the city name.
    data['origin_state'] = data['OriginCityName'].apply(lambda x: x[x.rfind(' ')+1:])
    data['dest_state'] = data['DestCityName'].apply(lambda x: x[x.rfind(' ')+1:])
    data['state_combine'] = data['origin_state'] +'_' + data['dest_state']
    data['location_yn'] = (data['origin_state'] == data['dest_state']).astype('int')
    data['Holidays'] = data['Holidays'].astype('int')

    for prefix in ('CRSDepTime', 'WheelsOff', 'CRSArrTime'):
        data[prefix + 'HourDis'] = data[prefix].apply(convert_timeperiod)

    # Collapse every state pair outside the fixed list into one bucket.
    data['state_combine'] = data['state_combine'].where(data['state_combine'].isin(top_state), 'Other')
    return data

In [None]:
# Load the engineered feature table.
df = pd.read_parquet('features_added.parquet')
# Keep flights from 2022 onwards and run the feature engineering.
# preparedata returns a single DataFrame, so it is bound to one name;
# the explicit copy keeps the filtered subset independent of ``df``.
df_par = preparedata(df.query("Year >=2022").copy())

In [None]:
# Number of rows in the full feature table.
len(df)

In [None]:
import uuid 

In [None]:
# Columns taken from ``df`` when the sample of flights is drawn.
used_col = [
    'FlightDate',
    'CRSElapsedTime',
    'Distance',
    'Marketing_Airline_Network',
    'DayofWeek',
    'Holidays',
    'CRSDepTimeHour',
    'CRSDepTimeMinute',
    'CRSArrTimeHour',
    'CRSArrTimeMinute',
    'DestCityName',
    'OriginCityName',
]

In [None]:
sample_record = 3000
# Fixed random_state (same seed as the train/test splits in this notebook)
# so the same flights are drawn on every run.
df_sendsamle = (
    df[used_col]
    .sample(sample_record, random_state=1)
    .sort_values('FlightDate')
    .reset_index(drop=True)
)
# One random UUID per sampled flight (the IDs themselves still differ per run).
df_sendsamle['FlightID'] = [str(uuid.uuid4()) for _ in range(len(df_sendsamle))]
# Every flight starts without any delay.
df_sendsamle[['DepDelay', 'TaxiOut','WeatherDelay']] = 0.0
# Initial wheels-off time = scheduled departure time.
df_sendsamle[['WheelsOffHour','WheelsOffMinute']] = df_sendsamle[['CRSDepTimeHour', 'CRSDepTimeMinute']]

In [None]:
import random

# Seed the generator so the simulated delay updates are reproducible.
random.seed(1)

new_data = []

# For every sampled flight emit 5 successive "update" rows. ``record`` is
# mutated cumulatively, so each emitted row carries the delays of the previous
# one plus whatever this step adds.
for _, record in df_sendsamle.iterrows():
    for num_row in range(5):

        if record['WeatherDelay']:
            # Weather delay already present: add either 0 or a random 0-30.
            depdelay_weather = [0, random.randint(0, 30)][random.randint(0, 1)]
            record['WeatherDelay'] += depdelay_weather

        else:
            depdelay_taxi = [0, random.randint(0, 30)][random.randint(0, 1)]
            if record['TaxiOut']:
                # Already taxiing: extend the taxi time or start a weather delay.
                if depdelay_taxi:
                    record['TaxiOut'] += random.randint(0, 30)
                else:
                    record['WeatherDelay'] += random.randint(1, 30)
            else:
                # The possible departure shift widens with each step.
                depdelay_pos = random.randint(1, 30 * (1 + num_row))
                depdelay_neg = random.randint(-30 * (1 + num_row), -1)
                depdelay_ran = [depdelay_neg, 0, depdelay_pos][random.randint(0, 2)]

                if record['DepDelay'] and depdelay_ran <= 0:
                    # An existing departure delay is never reduced; the
                    # flight starts taxiing instead.
                    record['TaxiOut'] += random.randint(1, 30)
                else:
                    record['DepDelay'] += depdelay_ran

        new_data.append(record.copy())
df_new = pd.DataFrame(new_data)

# Recompute wheels-off = wheels-off clock time + departure delay + taxi-out.
# The hour is zero-padded as well as the minute: with format '%H%M' an
# unpadded string such as '105' (01:05) would be parsed as 10:05.
wheels_off_clock = pd.to_datetime(
    df_new.WheelsOffHour.astype('string').str.zfill(2)
    + df_new.WheelsOffMinute.astype('string').str.zfill(2),
    format='%H%M',
)
new_taxi = wheels_off_clock + pd.to_timedelta(df_new.DepDelay + df_new.TaxiOut, unit='m')
df_new['WheelsOffHour'] = new_taxi.dt.hour
df_new['WheelsOffMinute'] = new_taxi.dt.minute

# Simulated updates plus the original zero-delay rows, grouped per flight and
# ordered by the delay columns.
df_all_sample = (
    pd.concat([df_new, df_sendsamle], axis=0)
    .sort_values(['FlightID','DepDelay','TaxiOut','WeatherDelay'])
    .reset_index(drop=True)
)



In [548]:
# Stack the simulated update rows on top of the original zero-delay rows,
# then order them per flight by the delay columns.
df_all_sample = (
    pd.concat([df_new, df_sendsamle], axis=0)
    .sort_values(['FlightID','DepDelay','TaxiOut','WeatherDelay'])
    .reset_index(drop=True)
)

In [549]:
# Persist the combined sample without the index column.
df_all_sample.to_csv(path_or_buf='All_2023-10-23.csv', index=False)

In [None]:
# One plain dict per row.
df_sendsamle_json = df_all_sample.to_dict('records')

In [None]:
# The preparedata_prod defined so far works on a DataFrame (column access and
# ``.astype``), so wrap the first five record dicts in one before calling it.
preparedata_prod(pd.DataFrame(df_sendsamle_json[:5]))

In [None]:
import copy
def preparedata_prod(data):
    """Engineer the model features for ONE flight record, in place.

    ``data`` is a dict-like record. Derived keys are written into it, the raw
    keys listed in ``raw_keys`` are deleted from it, and the same object is
    returned. Callers that need the original record must pass a copy.

    Added keys: ``state_combine`` (``<origin>_<dest>`` or ``'Other'`` when the
    pair is not in ``frequent_routes``), ``location_yn``, ``CRSDepTimeHourDis``,
    ``WheelsOffHourDis`` and ``CRSArrTimeHourDis``; ``Holidays`` is cast to int.
    """
    frequent_routes = ['CA_CA', 'TX_TX', 'NY_FL', 'FL_NY', 'CA_TX', 'TX_CA', 'FL_TX', 'HI_HI', 'TX_FL', 'FL_GA', 'CA_NV', 'NV_CA', 'GA_FL', 'CA_WA', 'CA_AZ', 'AZ_CA', 'WA_CA', 'CA_CO', 'CO_CA', 'FL_NC', 'NC_FL', 'TX_CO', 'CO_TX', 'NJ_FL', 'FL_NJ', 'IL_FL', 'FL_IL', 'CA_HI', 'FL_FL', 'HI_CA', 'CO_CO', 'NC_NC', 'NY_NC', 'NY_IL', 'IL_NY', 'NC_NY', 'IL_TX', 'TX_IL', 'WA_WA', 'PA_FL', 'FL_PA', 'LA_TX', 'OR_CA', 'TX_LA', 'CA_OR', 'CA_UT', 'TX_GA', 'DC_FL', 'UT_CA', 'GA_TX']
    raw_keys = ['FlightDate','CRSDepTimeHour','CRSDepTimeMinute','CRSArrTimeHour','CRSArrTimeMinute','DestCityName','OriginCityName','FlightID','WheelsOffHour','WheelsOffMinute','CRSDepTime','WheelsOff','CRSArrTime','dest_state','origin_state']

    def _clock(hour_key, minute_key):
        # Hour as-is followed by the minute zero-padded to two digits.
        return str(data[hour_key]) + str(data[minute_key]).zfill(2)

    def _after_last_space(city_key):
        # Everything after the last space (the whole string if there is none).
        city = data[city_key]
        return city[city.rfind(' ') + 1:]

    data['CRSDepTime'] = _clock('CRSDepTimeHour', 'CRSDepTimeMinute')
    data['WheelsOff'] = _clock('WheelsOffHour', 'WheelsOffMinute')
    data['CRSArrTime'] = _clock('CRSArrTimeHour', 'CRSArrTimeMinute')
    data['origin_state'] = _after_last_space('OriginCityName')
    data['dest_state'] = _after_last_space('DestCityName')

    route = data['origin_state'] + '_' + data['dest_state']
    data['state_combine'] = route
    data['location_yn'] = int(data['origin_state'] == data['dest_state'])
    data['Holidays'] = int(data['Holidays'])

    data['CRSDepTimeHourDis'] = convert_timeperiod(data['CRSDepTime'])
    data['WheelsOffHourDis'] = convert_timeperiod(data['WheelsOff'])
    data['CRSArrTimeHourDis'] = convert_timeperiod(data['CRSArrTime'])

    if route not in frequent_routes:
        data['state_combine'] = 'Other'

    # Drop the raw inputs and the intermediate helper keys.
    for key in raw_keys:
        del data[key]

    return data

In [None]:
# preparedata_prod edits each record in place, so run it on a deep copy of the slice.
process_data = list(map(preparedata_prod, copy.deepcopy(df_sendsamle_json[50:60])))

In [None]:
# Display the raw records of the same slice; the previous cell processed a
# deep copy of it.
df_sendsamle_json[50:60]

In [None]:

# ``train`` takes the 60 % partition of the first split; the remaining 40 %
# is then halved into ``test`` and ``val``.
_remain, train = train_test_split(df_par, test_size=0.6, random_state=1)
test, val = train_test_split(_remain, test_size=0.5, random_state=1)

# Features as one dict per row (the DictVectorizer input format), target as an array.
train_x = train.drop(columns='ActualElapsedTime').to_dict(orient='records')
train_y = train['ActualElapsedTime'].values
val_x = val.drop(columns='ActualElapsedTime').to_dict(orient='records')
val_y = val['ActualElapsedTime'].values

# Fit the vectoriser on the training rows only, then reuse it for validation.
dv = DictVectorizer()
train_x_dv, dv = convert_preprocess(train_x, dv, mode='train')
val_x_dv, dv = convert_preprocess(val_x, dv)




In [None]:
# Hand-written flight record used to try out attaching extra keys to records.
sample_flight = {
    'FlightDate': '2022-02-11 00:00:00',
    'CRSElapsedTime': 250.0,
    'Distance': 1440.0,
    'Marketing_Airline_Network': 'WN',
    'DayofWeek': 4,
    'Holidays': False,
    'CRSDepTimeHour': 8,
    'CRSDepTimeMinute': 50,
    'CRSArrTimeHour': 12,
    'CRSArrTimeMinute': 0,
    'DestCityName': 'Tucson, AZ',
    'OriginCityName': 'Chicago, IL',
    'FlightID': '000084da-fd98-4220-b92c-3eb2e9620aa0',
    'DepDelay': 39.0,
    'TaxiOut': 0.0,
    'WeatherDelay': 0.0,
    'WheelsOffHour': 9,
    'WheelsOffMinute': 29,
}

# Two independent, identical records.
data = [dict(sample_flight), dict(sample_flight)]

new_value = [20, 50]

# NOTE(review): every record receives the whole ``new_value`` list under
# 'predict'; if one value per record was intended, pair them with zip - confirm.
data_with_predict = [{**d, 'predict': new_value ,'test':2} for d in data]
# The former trailing ``model.__hash__()`` was removed: no earlier cell defines
# ``model``, so the cell raised NameError on a fresh kernel.

In [None]:
from sklearn.pipeline import Pipeline

In [None]:

# Two-way split: ``train`` receives the 60 % partition, ``test`` the other 40 %.
test, train = train_test_split(df_par, test_size=0.6, random_state=1)

# Features as one dict per row (the DictVectorizer input format), target as an array.
train_x = train.drop(columns='ActualElapsedTime').to_dict(orient='records')
train_y = train['ActualElapsedTime'].values
test_x = test.drop(columns='ActualElapsedTime').to_dict(orient='records')
test_y = test['ActualElapsedTime'].values

# Fit the vectoriser on the training rows only, then reuse it for the test rows.
dv = DictVectorizer()
train_x_dv, dv = convert_preprocess(train_x, dv, mode='train')
test_x_dv, dv = convert_preprocess(test_x, dv)

In [None]:
# none
# Least-squares baseline fitted on the vectorised training rows and scored on
# the validation rows.
model = LinearRegression().fit(train_x_dv, train_y)
predict = model.predict(val_x_dv)
eveulatemodel(val_y, predict)

In [None]:
# Fresh vectoriser fitted directly on the training records; ``dfs`` holds the
# resulting feature matrix.
dv = DictVectorizer()
dfs = dv.fit_transform(train_x)

In [None]:
# Refit the existing model on the freshly vectorised training matrix.
model.fit(X=dfs, y=train_y)

In [None]:
# Inspect the training target array.
train_y

In [None]:
from sklearn.pipeline import make_pipeline

In [None]:
# Vectoriser and regressor chained so raw record dicts can be passed straight in.
pipeline = make_pipeline(
    DictVectorizer(),
    LinearRegression(),
)
# The second argument is the target vector: train_y, not the test records.
pipeline.fit(train_x, train_y)

In [None]:
# The fitted object is named ``pipeline``; ``pipe`` is not defined anywhere.
pipeline.predict(test_x)

In [None]:
# none
# Least-squares baseline fitted on the vectorised training rows and scored on
# the held-out test rows.
model = LinearRegression().fit(train_x_dv, train_y)
predict = model.predict(test_x_dv)
eveulatemodel(test_y, predict)

In [None]:
# Attach the predictions and the signed error (prediction - actual, cast to
# int) to the test rows. ``assign`` builds a new frame instead of writing into
# the slice returned by train_test_split, which avoids SettingWithCopyWarning.
test = test.assign(
    predict=predict,
    dif=lambda frame: (frame['predict'] - frame['ActualElapsedTime']).astype('int'),
)

In [None]:
thread = 25
# Rows whose absolute error is strictly above the threshold, relative to the
# rows strictly below it (rows exactly at the threshold count in neither), x100.
# NOTE(review): this is an outside/inside ratio, not a share of all test rows - confirm intent.
dif_abs = test['dif'].abs()
(len(test[dif_abs > thread]) / len(test[dif_abs < thread])) * 100

In [24]:
import pickle

# pickle.dump(model, open('model.sav', 'wb'))

In [28]:
# NOTE: unpickling can execute arbitrary code; only load artifacts you trust.
# The context manager closes the file handle once loading is done.
with open('../../artifact/model.sav', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Persist the fitted vectoriser; the context manager flushes and closes the file.
with open('dv.sav', 'wb') as dv_file:
    pickle.dump(dv, dv_file)

In [2]:
import pandas as pd
# Load the combined sample CSV from the data directory.
data = pd.read_csv(filepath_or_buffer='../../data/All_2023-10-23.csv')

In [21]:
# First 10000 rows serialised as a JSON array of records.
data2 = data.head(10000).to_json(orient='records')

In [22]:
# Write the JSON string to disk.
with open('../../data/samplerecord-smallest.json', mode='w+') as json_out:
    json_out.write(data2)

In [11]:
import json
# Read the sample records; the context manager closes the handle afterwards.
with open('../../data/samplerecord-small.json','r') as feil:
    file = json.load(feil)

In [18]:
# Peek at the first record of the loaded JSON payload.
file[0]

{'FlightDate': '2021-08-09 00:00:00',
 'CRSElapsedTime': 135.0,
 'Distance': 825.0,
 'Marketing_Airline_Network': 'WN',
 'DayofWeek': 0,
 'Holidays': False,
 'CRSDepTimeHour': 6,
 'CRSDepTimeMinute': 0,
 'CRSArrTimeHour': 8,
 'CRSArrTimeMinute': 15,
 'DestCityName': 'Chicago, IL',
 'OriginCityName': 'New Orleans, LA',
 'FlightID': '000011fa-f374-4784-ae11-398ed8900ac1',
 'DepDelay': -29.0,
 'TaxiOut': 0.0,
 'WeatherDelay': 0.0,
 'WheelsOffHour': 5,
 'WheelsOffMinute': 31}