In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score



### Train Model: forest and tree model to compare the prices  

In [2]:
data_clean = pd.read_csv(r'D:\spring 25\appiled stat\anika Sprint 1\data\Processed\data_clean.csv')
data_clean.head()

Unnamed: 0,Year,quarter,cur_passengers,cur_fare,ly_fare,ly_passengers,amount_change,percent_change,amount_change_pax,percent_change_pax,...,"route_Seattle, WA - Tucson, AZ","route_Seattle, WA - Washington, DC (Metropolitan Area)","route_St. Louis, MO - Tampa, FL (Metropolitan Area)","route_St. Louis, MO - Washington, DC (Metropolitan Area)","route_Syracuse, NY - Tampa, FL (Metropolitan Area)","route_Syracuse, NY - Washington, DC (Metropolitan Area)","route_Tampa, FL (Metropolitan Area) - Toledo, OH","route_Tampa, FL (Metropolitan Area) - Washington, DC (Metropolitan Area)","route_Tulsa, OK - Washington, DC (Metropolitan Area)","route_Washington, DC (Metropolitan Area) - West Palm Beach/Palm Beach, FL"
0,2024,4,34490,205.96,169.88,39340,36.08,0.2124,-4850,-0.1233,...,False,False,False,False,False,False,False,False,False,False
1,2024,4,29550,197.7,156.7,34430,41.0,0.2617,-4880,-0.1417,...,False,False,False,False,False,False,False,False,False,False
2,2024,4,24910,256.22,192.41,36680,63.81,0.3316,-11770,-0.3209,...,False,False,False,False,False,False,False,False,False,False
3,2024,4,40090,315.81,257.61,43130,58.19,0.2259,-3040,-0.0705,...,False,False,False,False,False,False,False,False,False,False
4,2024,4,82790,211.52,164.2,79590,47.32,0.2882,3200,0.0402,...,False,False,False,False,False,False,False,False,False,False


Data Preparation

In [3]:
# Define target
y = data_clean['cur_fare']

# Define features
route_cols = [col for col in data_clean.columns if col.startswith('route_')]
feature_cols = ['Year', 'quarter', 'ly_fare', 'ly_passengers', 'cur_passengers', 
                'amount_change_pax', 'percent_change_pax'] + route_cols

X = data_clean[feature_cols]

# Drop rows with NaNs in X or y
df_clean = data_clean[feature_cols + ['cur_fare']].dropna()
X = df_clean.drop(columns='cur_fare')
y = df_clean['cur_fare']


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Fit model

In [5]:
tree_model = DecisionTreeRegressor(max_depth=5, random_state=42)
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)


In [6]:
forest_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
forest_model.fit(X_train, y_train)
y_pred_forest = forest_model.predict(X_test)


Evaluate the model

In [7]:
def evaluate_model(name, y_true, y_pred):
    print(f"\n{name} Results:")
    print("R² Score:", r2_score(y_true, y_pred))
    print("MSE:", mean_squared_error(y_true, y_pred))
    print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)))

evaluate_model("Decision Tree", y_test, y_pred_tree)
evaluate_model("Random Forest", y_test, y_pred_forest)



Decision Tree Results:
R² Score: 0.817785830771564
MSE: 725.3059544428818
RMSE: 26.931504867773018

Random Forest Results:
R² Score: 0.9369213679360838
MSE: 251.0853443933504
RMSE: 15.845672734010078


Comment: much better results that our baseline linear regression model, the R2 was .2 and later .7 for baseline

### Test Model


In [8]:
# Create template with all columns
new_input = pd.DataFrame(columns=X.columns)
new_input.loc[0] = 0  # Set everything to zero first

# Set values for features 
#if i wann to travel from tampa to washington between to month of 3rd quarter (oct-dec)
new_input.at[0, 'Year'] = 2025
new_input.at[0, 'quarter'] = 2
new_input.at[0, 'ly_fare'] = 210
new_input.at[0, 'ly_passengers'] = 15000
new_input.at[0, 'cur_passengers'] = 14500

# must match the exact name 
#new_input.at[0, 'route_Tampa, FL (Metropolitan Area) - Washington, DC (Metropolitan Area)'] = 1



In [9]:
# Assuming your model is already trained
predicted_fare = forest_model.predict(new_input)

# Show the result
print("If the flight from Tampa to DC happens in Q2 of 2025, with 14,500 current passengers, last year had a fare of $210 and 15,000 passengers")
print("- Predicted Fare for this scenario: $", round(predicted_fare[0], 2))


If the flight from Tampa to DC happens in Q2 of 2025, with 14,500 current passengers, last year had a fare of $210 and 15,000 passengers
- Predicted Fare for this scenario: $ 249.83


In [10]:
predicted_fare = tree_model.predict(new_input)
print("Predicted Fare using Decision Tree: $", round(predicted_fare[0], 2))


Predicted Fare using Decision Tree: $ 249.07


the Two model produce similar results
