In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [2]:
# Load the training passengers and the airport-country lookup table.
# NOTE(review): read_csv treats the literal string "NA" as a missing value by
# default; if either file uses the country code "NA", confirm it is not being
# read as NaN.
training_data = pd.read_csv('prediction_challenge_train.csv')
air_map = pd.read_csv('airport_country_code_mapping.csv')

print(training_data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
training_data.info()
print(training_data.columns)
print(air_map.head())

      ID First Name Last Name  Gender  Age  Nationality Airport Country Code  \
0  22554      Tally   Unworth  Female   13    Indonesia                   PG   
1  42928    Bobette   Turfitt  Female   38     Thailand                   CN   
2  26198    Karalee     Gross  Female   75  Philippines                   US   
3  56569    Laurene   Shilton  Female   32       Poland                   ID   
4  65769      Nancy  McGuigan  Female   17      Albania                   MG   

  Departure Date         Pilot Name  Ticket Price Eligible_For_Discount  
0     12/10/2022     Gussie Ridding        1218.0                   Yes  
1     09/06/2022     Ursola Faudrie         688.0                   Yes  
2     03/11/2022      Kellia Bunney         824.0                   Yes  
3      1/19/2022     Kial McCaighey         702.0                   Yes  
4      3/24/2022  Pincas Lorenzetto        1120.0                   Yes  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51462 entries, 0 to 51461

In [3]:
# Read a second, untouched copy of the training file and report how many of
# its rows carry the label 'No' (0 if the label never occurs).
original = pd.read_csv('prediction_challenge_train.csv')
label_counts = original['Eligible_For_Discount'].value_counts()
original_no_count = label_counts.get('No', 0)
print(original_no_count)

24668


In [4]:
def age_category(age):
    """Bucket an age into a coarse category.

    Returns 'Senior' for ages of 86 and above, 'Child' for ages of 13 and
    below, and 'Adult' for everything else (including values such as NaN
    for which both comparisons are false).
    """
    if age >= 86:
        return 'Senior'
    if age <= 13:
        return 'Child'
    return 'Adult'

# Label every passenger with the bucket returned by age_category.
age_categories = training_data['Age'].apply(age_category)
training_data['Age_Category'] = age_categories

In [5]:
# Parse the month/day/year strings once; values that do not match the format
# become NaT because of errors='coerce'.
departure_dates = pd.to_datetime(
    training_data['Departure Date'],
    format='%m/%d/%Y',
    errors='coerce',
)
training_data['Departure Date'] = departure_dates

# Split the parsed date into its calendar components.
training_data['Departure_Day'] = departure_dates.dt.day
training_data['Departure_Month'] = departure_dates.dt.month
training_data['Departure_Year'] = departure_dates.dt.year

In [6]:
# Departure_Year / Departure_Month / Departure_Day were extracted from the
# already-parsed 'Departure Date' column (a date-only format, so there is no
# time component). Re-assembling a datetime from those parts only reproduces
# that column, so reuse it directly instead of doing the round trip.
training_data["New_Date"] = training_data["Departure Date"]

# Day of the week as an integer, Monday=0 ... Sunday=6.
training_data["Departure_Weekday"] = training_data["New_Date"].dt.weekday


In [9]:
# Attach the air_map lookup columns to each passenger row, matching on
# 'Airport Country Code' and keeping every training row (left join).
# The guard makes the cell safe to re-run: merging a second time would add
# suffixed duplicates of the lookup columns ('Country Name_x' / '_y') and
# break the later cells that read 'Country Name'. The test-set cell uses the
# same guard.
if 'Country Name' not in training_data.columns:
    training_data = training_data.merge(air_map, on='Airport Country Code', how='left')

In [10]:
# Within each 'Country Name' group, the fraction of rows labelled 'No' and
# 'Yes'; one row per country, one column per label.
country_discounts = (
    training_data
    .groupby("Country Name")["Eligible_For_Discount"]
    .value_counts(normalize=True)
    .unstack()
    .reset_index()
)

print(country_discounts)

country_discounts.to_csv("country_discounts.csv", index=False)

Eligible_For_Discount       Country Name        No       Yes
0                            Afghanistan  0.512953  0.487047
1                                Albania  0.800000  0.200000
2                                Algeria  0.516260  0.483740
3                         American Samoa  0.560000  0.440000
4                                Andorra  0.666667  0.333333
..                                   ...       ...       ...
230                    Wallis and Futuna  0.333333  0.666667
231                       Western Sahara  0.583333  0.416667
232                                Yemen  0.494624  0.505376
233                               Zambia  0.504587  0.495413
234                             Zimbabwe  0.458824  0.541176

[235 rows x 3 columns]


In [11]:
# Within each 'Gender' group, the fraction of rows labelled 'No' and 'Yes';
# one row per gender, one column per label.
gender_discounts = (
    training_data
    .groupby("Gender")["Eligible_For_Discount"]
    .value_counts(normalize=True)
    .unstack()
    .reset_index()
)

print(gender_discounts)

gender_discounts.to_csv("gender_discounts.csv", index=False)

Eligible_For_Discount  Gender        No       Yes
0                      Female  0.378608  0.621392
1                        Male  0.594413  0.405587


In [13]:
# Within each 'Departure_Weekday' group, the fraction of rows labelled 'No'
# and 'Yes'; one row per weekday, one column per label.
weekday_discounts = (
    training_data
    .groupby("Departure_Weekday")["Eligible_For_Discount"]
    .value_counts(normalize=True)
    .unstack()
    .reset_index()
)

print(weekday_discounts)

weekday_discounts.to_csv("weekday_discounts.csv", index=False)

Eligible_For_Discount  Departure_Weekday        No       Yes
0                                      0  0.472818  0.527182
1                                      1  0.497435  0.502565
2                                      2  0.488021  0.511979
3                                      3  0.475560  0.524440
4                                      4  0.474124  0.525876
5                                      5  0.472727  0.527273
6                                      6  0.475381  0.524619


In [14]:
# Within each 'Departure_Month' group, the fraction of rows labelled 'No' and
# 'Yes'; one row per month, one column per label. A label that never occurs
# in a month shows up as NaN after unstack().
month_discounts = (
    training_data
    .groupby("Departure_Month")["Eligible_For_Discount"]
    .value_counts(normalize=True)
    .unstack()
    .reset_index()
)

print(month_discounts)

month_discounts.to_csv("month_discounts.csv", index=False)

Eligible_For_Discount  Departure_Month        No       Yes
0                                    1       NaN  1.000000
1                                    2  0.629135  0.370865
2                                    3  0.626521  0.373479
3                                    4  0.633403  0.366597
4                                    5  0.627551  0.372449
5                                    6  0.633188  0.366812
6                                    7  0.640129  0.359871
7                                    8  0.623160  0.376840
8                                    9  0.623410  0.376590
9                                   10  0.619377  0.380623
10                                  11  0.637594  0.362406
11                                  12       NaN  1.000000


In [15]:
# Export the rows whose Age is exactly 86 to a CSV file.
is_age_86 = training_data['Age'] == 86
filtered_by_age = training_data[is_age_86]

filtered_by_age.to_csv('filtered_by_age.csv', index=False)

In [18]:
def predict_discount(row):
    """Apply the hand-written eligibility rules to one passenger row.

    `row` is anything indexable by column name (e.g. a DataFrame row).
    Returns 'Yes' as soon as one of the rules below matches, checked in this
    order, and 'No' when none does:

    1. Age is at most 4 or at least 86.
    2. Departure_Month is January (1) or December (12).
    3. Airport Country Code is one of the eligible codes (currently only "NF").
    4. Gender is "Female" and Ticket Price is divisible by 4.
    5. Nationality equals Country Name.

    Columns belonging to later rules are only read when every earlier rule
    has failed.
    """
    eligible_country_codes = ("NF",)
    age = row['Age']

    # `or` short-circuits, so the rules are evaluated lazily in the listed order.
    is_eligible = (
        age <= 4
        or age >= 86
        or row['Departure_Month'] in (1, 12)
        or row['Airport Country Code'] in eligible_country_codes
        or (row["Gender"] == "Female" and row["Ticket Price"] % 4 == 0)
        or row["Nationality"] == row["Country Name"]
    )
    return 'Yes' if is_eligible else 'No'

# Run the rule set over every training row (axis=1 passes one row at a time).
row_predictions = training_data.apply(predict_discount, axis=1)
training_data['Predicted_Eligible_For_Discount'] = row_predictions


In [19]:
# Fraction of training rows whose predicted label equals the true label.
# This is computed on the training rows themselves, so it is an in-sample figure.
is_correct = training_data['Predicted_Eligible_For_Discount'] == training_data['Eligible_For_Discount']
accuracy = is_correct.mean()
print(f'Accuracy of the model: {accuracy * 100:.2f}%')

Accuracy of the model: 100.00%


In [20]:
# Save only the passenger ID and the predicted label for the training rows.
prediction_columns = ["ID", "Predicted_Eligible_For_Discount"]
training_data[prediction_columns].to_csv("predictions.csv", index=False)

In [21]:
# Load the test file; its rows are scored with predict_discount in a later cell.
testing_data = pd.read_csv("prediction_challenge_test.csv")

In [27]:
# Parse with the same explicit month/day/year format used for the training
# set instead of relying on pandas' format inference, so both sets are
# guaranteed to interpret the strings identically. Unparseable values still
# raise (the default), rather than silently becoming NaT.
testing_data['Departure Date'] = pd.to_datetime(testing_data['Departure Date'], format='%m/%d/%Y')

# predict_discount reads Departure_Month, so derive it from the parsed date.
testing_data['Departure_Month'] = testing_data['Departure Date'].dt.month

# predict_discount also reads 'Country Name'; add the lookup columns once.
# The guard keeps the cell re-runnable without creating suffixed duplicates.
if 'Country Name' not in testing_data.columns:
    testing_data = testing_data.merge(air_map, on='Airport Country Code', how='left')

# Score every test row with the same rule set used on the training data.
testing_data['Eligible_For_Discount'] = testing_data.apply(predict_discount, axis=1)

testing_data.to_csv("testing_predictions.csv", index=False)