In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Load the dataset (example)
data = pd.read_csv('Clean_Dataset.csv')

# Keep only the columns the models use: the predictors plus the target
features = ['airline', 'source_city', 'destination_city', 'departure_time', 'class', 'duration', 'days_left']
target = 'price'
df = data[features + [target]]
X = df[features]
y = df[target]

# Make sure every column label of X is a string
X.columns = X.columns.astype(str)

# Categorical columns are one-hot encoded; numeric columns are passed through as-is
categorical_features = ['airline', 'source_city', 'destination_city', 'departure_time', 'class']
numeric_features = ['duration', 'days_left']

# One sorted list of observed values per categorical column, in column order,
# so the encoder's output layout is fixed explicitly
categories = [sorted(df[column].unique()) for column in categorical_features]

encoder = OneHotEncoder(categories=categories, sparse_output=True, handle_unknown='ignore')
encoder.fit(X[categorical_features])

# Encode the categorical columns, densify the sparse result and append the
# numeric columns; both parts get a fresh 0..n-1 index so concat pairs rows up
encoded_categoricals = pd.DataFrame(encoder.transform(X[categorical_features]).toarray())
numeric_part = X[numeric_features].reset_index(drop=True)
X_encoded = pd.concat([encoded_categoricals, numeric_part], axis=1)

# The one-hot columns are labelled with integers; convert all labels to strings
X_encoded.columns = X_encoded.columns.astype(str)

# Hold out 20% of the rows for testing (fixed seed) and persist the fitted encoder
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
joblib.dump(encoder, 'encoder.pkl')

['encoder.pkl']

In [40]:
# Preview the first five rows of the selected columns
df.head(n=5)

Unnamed: 0,airline,source_city,destination_city,departure_time,class,duration,days_left,price
0,SpiceJet,Delhi,Mumbai,Evening,Economy,2.17,1,5953
1,SpiceJet,Delhi,Mumbai,Early_Morning,Economy,2.33,1,5953
2,AirAsia,Delhi,Mumbai,Early_Morning,Economy,2.17,1,5956
3,Vistara,Delhi,Mumbai,Morning,Economy,2.25,1,5955
4,Vistara,Delhi,Mumbai,Morning,Economy,2.33,1,5955


In [41]:
# Linear regression: fit on the training split, predict the held-out split,
# then persist the fitted model
lr_model = LinearRegression()
lr_predictions = lr_model.fit(X_train, y_train).predict(X_test)
joblib.dump(lr_model, 'lr_model.pkl')


['lr_model.pkl']

In [48]:
# Decision tree: fit on the training split, predict the held-out split,
# then persist the fitted model
dt_model = DecisionTreeRegressor(
    max_depth=25,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)
dt_predictions = dt_model.fit(X_train, y_train).predict(X_test)
joblib.dump(dt_model, 'dt_model.pkl')


['dt_model.pkl']

In [49]:

# Random forest: fit on the training split, predict the held-out split,
# then persist the fitted model
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=25,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)
rf_predictions = rf_model.fit(X_train, y_train).predict(X_test)
joblib.dump(rf_model, 'rf_model.pkl')


['rf_model.pkl']

In [50]:
# XGBoost: train the gradient-boosted regressor on the training split
xgb_model = XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Predict the held-out split
xgb_predictions = xgb_model.predict(X_test)

# Save the XGBoost model
joblib.dump(xgb_model, 'xgb_model.pkl')


['xgb_model.pkl']

In [52]:
# Helper for scoring a model's predictions
def evaluate_model(predictions, y_test):
    """Score predictions against the true target values.

    Args:
        predictions: Values predicted by a model.
        y_test: True target values for the same rows.

    Returns:
        A ``(mse, r2)`` tuple: the result of ``mean_squared_error`` and of
        ``r2_score``, each called with ``y_test`` first and ``predictions``
        second.
    """
    return (
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

# Score each model on the held-out split and report its MSE and R^2
lr_mse, lr_r2 = evaluate_model(predictions=lr_predictions, y_test=y_test)
print(f"Линейная регрессия: MSE = {lr_mse}, R^2 = {lr_r2}")

dt_mse, dt_r2 = evaluate_model(predictions=dt_predictions, y_test=y_test)
print(f"Дерево решений: MSE = {dt_mse}, R^2 = {dt_r2}")

rf_mse, rf_r2 = evaluate_model(predictions=rf_predictions, y_test=y_test)
print(f"Случайный лес: MSE = {rf_mse}, R^2 = {rf_r2}")

xgb_mse, xgb_r2 = evaluate_model(predictions=xgb_predictions, y_test=y_test)
print(f"XGBoost: MSE = {xgb_mse}, R^2 = {xgb_r2}")


Линейная регрессия: MSE = 50328107.65746864, R^2 = 0.9023669479281436
Дерево решений: MSE = 10378322.163952606, R^2 = 0.9798667719607511
Случайный лес: MSE = 7688712.132690924, R^2 = 0.9850844296168341
XGBoost: MSE = 17850612.526231863, R^2 = 0.9653710448612607


In [53]:
# Write the selected feature and target columns to CSV, without the index column
df.to_csv(path_or_buf="processed_flight_data_.csv", index=False)
