# Feature Engineering

- Координаты городов — в основном чтобы найти расстояния между ними.
- Перелеты внутри государства (расстояние между городами). Скорее всего цена зависит от расхода топлива -> от расстояния
- Кластеризация.
- Дата: выходной.
- Скорость: расстояние / время полета.
- Часто встречающиеся авиалинии.
- Погода.
- Время полета (день, утро, вечер).
- В какую минуту дня самолет вылетел.

In [172]:
import pandas as pd

In [173]:
# Load the raw flight table, parsing the `date` column into datetimes.
df = pd.read_csv(
    'features.csv',
    sep=',',
    parse_dates=['date'],
)

### Координаты городов

city_name -> [lon, lat]

str -> [float, float]

москва -> [37.62, 55.75]

In [139]:
from mapbox import Geocoder

In [140]:
import os

# Mapbox access token used to authenticate the geocoding requests.
# NOTE(review): a credential is hard-coded in the notebook. The
# MAPBOX_ACCESS_TOKEN environment variable now takes precedence, so the
# literal below can be removed (and the token rotated) without touching the
# rest of the code; it is kept only as a backward-compatible fallback.
access_token = (
    os.environ.get('MAPBOX_ACCESS_TOKEN')
    or 'pk.eyJ1IjoiZWdvcm92bSIsImEiOiJjanhic2F1ZDUwMTdvNDJ0bHMxOGg5NWthIn0.QL2wIcgAJIBNIz2gbpuLVQ'
)

In [141]:
# Mapbox geocoding client, authenticated with the access token defined above.
geocoder = Geocoder(access_token=access_token)

In [142]:
# Every distinct city that appears as an origin or as a destination.
# `Series.append` was removed in pandas 2.0, so the two columns are stacked
# with `pd.concat`, which yields the same values in the same order.
all_cities = pd.concat([df['from'], df['to']]).unique()

In [143]:
# Map each city name to the coordinates of its first geocoding match.
# The stored pair is [longitude, latitude]: index 0 is the longitude and
# index 1 is the latitude.
city_coordinate = {}

for city_name in all_cities:
    # Parse the response body once instead of once per extracted field.
    features = geocoder.forward(city_name).json()['features']
    if not features:
        # Name the offending city instead of failing with a bare IndexError.
        raise LookupError(f'no geocoding result for {city_name!r}')

    city_coordinate[city_name] = features[0]['geometry']['coordinates']

In [144]:
# Display the city -> coordinates mapping; each value is [lon, lat].
city_coordinate

{'Delhi': [77.21667, 28.66667],
 'Mumbai': [72.83333, 18.96667],
 'Bangalore': [77.59796, 12.96991],
 'Kolkata': [88.33778, 22.54111],
 'Hyderabad': [78.46667, 17.36667],
 'Chennai': [80.27, 13.09]}

In [145]:
# Per-row coordinates of the departure city. Each entry of `city_coordinate`
# is [lon, lat], hence index 1 for the latitude and index 0 for the longitude.
from_coordinates_lat = [city_coordinate[origin][1] for origin in df['from']]
from_coordinates_lon = [city_coordinate[origin][0] for origin in df['from']]

# Same lookup for the arrival city.
to_coordinates_lat = [city_coordinate[destination][1] for destination in df['to']]
to_coordinates_lon = [city_coordinate[destination][0] for destination in df['to']]

In [146]:
# Attach the per-row coordinate lists to the frame, one column per list,
# in the same order as before (departure first, then arrival).
coordinate_columns = {
    'from_coordinates_lat': from_coordinates_lat,
    'from_coordinates_lon': from_coordinates_lon,
    'to_coordinates_lat': to_coordinates_lat,
    'to_coordinates_lon': to_coordinates_lon,
}

for column_name, column_values in coordinate_columns.items():
    df[column_name] = column_values

In [147]:
import haversine

In [148]:
# haversine.haversine((1, 1), (2, 4))

In [149]:
# Haversine distance between the departure and arrival city of every row.
# Both arguments are passed as (lat, lon) pairs; the result is in the
# library's default unit.
origins = zip(from_coordinates_lat, from_coordinates_lon)
destinations = zip(to_coordinates_lat, to_coordinates_lon)

distances = [
    haversine.haversine(origin, destination)
    for origin, destination in zip(origins, destinations)
]

In [150]:
# Store the per-row haversine distances computed above as a new column.
df['distance'] = distances

In [151]:
# Average speed: distance divided by the trip duration, after converting the
# duration from minutes to hours.
flight_hours = df['taken_time_minues'] / 60
df['speed'] = df['distance'] / flight_hours

In [152]:
# Day of the week of each flight as an integer (pandas: Monday=0 ... Sunday=6).
df['week_day'] = df['date'].dt.dayofweek

In [153]:
# Preview the first rows, including the columns added so far.
df.head()

Unnamed: 0,date,airline,dep_time,from,stop,to,price,is_econom,taken_time_minues,from_coordinates_lat,from_coordinates_lon,to_coordinates_lat,to_coordinates_lon,distance,speed,week_day
0,2022-11-02,Air India,18:00,Delhi,non-stop,Mumbai,25612,0,120.0,28.66667,77.21667,18.96667,72.83333,1166.798334,583.399167,2
1,2022-11-02,Air India,19:00,Delhi,non-stop,Mumbai,25612,0,135.0,28.66667,77.21667,18.96667,72.83333,1166.798334,518.577037,2
2,2022-11-02,Air India,20:00,Delhi,1-stop,Mumbai,42220,0,1485.0,28.66667,77.21667,18.96667,72.83333,1166.798334,47.143367,2
3,2022-11-02,Air India,21:25,Delhi,1-stop,Mumbai,44450,0,1590.0,28.66667,77.21667,18.96667,72.83333,1166.798334,44.030126,2
4,2022-11-02,Air India,17:15,Delhi,1-stop,Mumbai,46690,0,400.0,28.66667,77.21667,18.96667,72.83333,1166.798334,175.01975,2


In [154]:
def _minutes_since_midnight(clock_time):
    """Convert an 'HH:MM' string into the number of minutes since 00:00."""
    parts = clock_time.split(':')
    return int(parts[0]) * 60 + int(parts[1])


# Departure time expressed as the minute of the day.
df['dep_minutes'] = df.dep_time.apply(_minutes_since_midnight)

In [156]:
# Numeric encoding of the stop category; labels that are not in the mapping
# become NaN.
stop_label_to_count = {'non-stop': 0, '1-stop': 1, '2+-stop': 2}
df['stop_count'] = df['stop'].map(stop_label_to_count)

In [165]:
# Keep the three listed carriers as they are and collapse every other
# airline into a single 'Other' bucket.
kept_airlines = ('Vistara', 'Air India', 'Indigo')

new_airlines = [
    carrier if carrier in kept_airlines else 'Other'
    for carrier in df.airline
]

In [166]:
# Store the grouped airline labels (three named carriers plus 'Other').
df['new_airlines'] = new_airlines

In [170]:
# Persist the engineered feature table; the row index is not written out.
output_path = 'new_features.csv'
df.to_csv(output_path, index=False)