In [17]:
import os
import pandas as pd
from modules.data_preprocessing import DataPreprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from modules.feature_engineering import FeatureEngineeringV1


In [9]:
# Instantiate the project helpers used by the cells below:
# `data_preprocessing` builds the raw CSV, `fe` runs/decodes feature engineering.
data_preprocessing = DataPreprocessing()
fe = FeatureEngineeringV1()

In [10]:
# Folder holding the raw Excel exports that are merged into the raw CSV.
directory_prefix = '../data/excel_raw/'
# os.listdir() returns entries in arbitrary order, so sort the paths to make the
# list handed to later cells reproducible across runs and machines.
lst_of_files = sorted(
    os.path.join(directory_prefix, file)
    for file in os.listdir(directory_prefix)
    # Skip hidden files (e.g. ".DS_Store") and Excel lock files ("~$book.xlsx"),
    # which are not real input workbooks.
    if not file.startswith(('.', '~$'))
)
lst_of_files

['../data/excel_raw/20230713T153526004.att.672322581406152.xlsx',
 '../data/excel_raw/20230713T153529650.att.1333037354235214.xlsx']

In [11]:
# Destination of the combined raw CSV handed to handle_init_files.
csv_path='../data/csv/raw.csv'
# NOTE(review): the recorded output shows this call skipping file creation when
# csv_path already exists, so newly added Excel files are presumably ignored
# until the CSV is deleted — confirm against DataPreprocessing.handle_init_files.
data_preprocessing.handle_init_files(lst_of_files, csv_path)

The file '../data/csv/raw.csv' already exists. Skipping file creation.


In [15]:
# Load the raw CSV (the same path that was assigned to csv_path) and display it.
raw = pd.read_csv('../data/csv/raw.csv')
raw

Unnamed: 0,wind_speed_TSN,wind_dir_TSN,wind_speed_KADUM,wind_dir_KADUM,wind_speed_PATNA,wind_dir_PATNA,wind_speed_SADAS,wind_dir_SADAS,wind_speed_DADEN,wind_dir_DADEN,...,Route,Ac_id,Ac_code,Ac_type,Ac_kts,Time_step,Phase,Ac_Lat,Ac_Lon,Ac_feet
0,3.2,130.9,1.1,130.3,2.4,202.5,0.8,142.5,1.6,141.7,...,6,R6-VJ150-21,VJ150,1,170.0,3,1,10.8134,106.6427,625.0
1,3.2,130.9,1.1,130.4,2.4,202.9,0.8,142.6,1.6,141.5,...,6,R6-VJ150-21,VJ150,1,179.0,23,1,10.8075,106.6271,1425.0
2,3.2,130.9,1.1,130.4,2.4,202.9,0.8,142.6,1.6,141.5,...,6,R6-VJ150-21,VJ150,1,205.0,43,1,10.8029,106.6096,1800.0
3,3.2,130.9,1.1,130.5,2.4,203.3,0.7,142.8,1.6,141.4,...,6,R6-VJ150-21,VJ150,1,231.0,59,1,10.8063,106.5941,2025.0
4,3.2,130.9,1.1,130.5,2.4,203.3,0.7,142.8,1.6,141.4,...,6,R6-VJ150-21,VJ150,1,255.0,75,1,10.8204,106.5810,2475.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61383,1.9,137.2,1.8,42.6,5.3,37.8,3.4,95.0,3.9,97.4,...,6,R6-VN258-04,VN258,2,157.0,5344,1,21.2532,105.6876,1775.0
61384,1.9,137.2,1.8,42.6,5.3,37.8,3.4,95.0,3.9,97.4,...,6,R6-VN258-04,VN258,2,152.0,5362,1,21.2493,105.7010,1525.0
61385,1.9,137.2,1.8,42.2,5.3,37.8,3.4,95.0,3.9,97.4,...,6,R6-VN258-04,VN258,2,146.0,5378,1,21.2461,105.7121,1325.0
61386,1.9,137.2,1.8,42.2,5.3,37.8,3.4,95.0,3.9,97.4,...,6,R6-VN258-04,VN258,2,131.0,5412,1,21.2401,105.7331,950.0


In [16]:
# Let FeatureEngineeringV1 process the raw CSV directly from disk.
# NOTE(review): the recorded output of this cell lists column headers only, with
# no data rows — the returned frame appears to be empty; confirm what
# process_data filters out.
processed_df = fe.process_data('../data/csv/raw.csv')
processed_df

Unnamed: 0,wind_speed_TSN,wind_dir_TSN,wind_speed_KADUM,wind_dir_KADUM,wind_speed_PATNA,wind_dir_PATNA,wind_speed_SADAS,wind_dir_SADAS,wind_speed_DADEN,wind_dir_DADEN,...,Route,Ac_id,Ac_code,Ac_kts,Time_step,Ac_Lat,Ac_Lon,Ac_feet,0,0.1


In [19]:
# One independent StandardScaler per numeric feature group; normalize_features
# looks them up by these keys and fits each on its own set of columns.
scalers = {
    group: StandardScaler()
    for group in (
        'wind_speed',
        'wind_dir',
        'lat_lon',
        'altitude',
        'speed',
        'time_step',
    )
}
# One OneHotEncoder per categorical column; drop='first' omits the indicator
# column of the first category.
encoders = {
    column: OneHotEncoder(drop='first')
    for column in ('ac_type', 'phase')
}
def common_preprocessing(df):
    """Return a cleaned copy of ``df``.

    Rows containing any missing value are dropped first, then exact duplicate
    rows are removed, and finally the index is renumbered from 0. The input
    frame itself is left unchanged.
    """
    return df.dropna().drop_duplicates().reset_index(drop=True)

def normalize_features(df):
    """Standardise the numeric features and one-hot encode ``Ac_type``/``Phase``.

    Every group of numeric columns is fitted and transformed with its own
    scaler from the module-level ``scalers`` dict; the two categorical columns
    are fitted and transformed with the module-level ``encoders`` and replaced
    by their indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``wind_speed*`` / ``wind_dir*`` columns plus
        ``Ac_Lat``, ``Ac_Lon``, ``Ac_feet``, ``Ac_kts``, ``Time_step``,
        ``Ac_type`` and ``Phase``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same row index as ``df``. The input frame is not
        modified. The encoded columns are named by the encoder
        (``get_feature_names_out``) instead of sharing the integer label ``0``.

    Raises
    ------
    ValueError
        If ``df`` has no rows (the scalers cannot be fitted on zero samples).
    """
    if df.empty:
        raise ValueError(
            "normalize_features received an empty DataFrame; "
            "nothing to fit the scalers on."
        )

    # Work on a copy so the caller's frame keeps its raw values and so that a
    # filtered slice passed in does not trigger SettingWithCopyWarning.
    df = df.copy()

    column_groups = {
        'wind_speed': df.columns[df.columns.str.startswith('wind_speed')],
        'wind_dir': df.columns[df.columns.str.startswith('wind_dir')],
        'lat_lon': ['Ac_Lat', 'Ac_Lon'],
        'altitude': ['Ac_feet'],
        'speed': ['Ac_kts'],
        'time_step': ['Time_step'],
    }
    for scaler_key, columns in column_groups.items():
        df[columns] = scalers[scaler_key].fit_transform(df[columns])

    encoded_frames = []
    for encoder_key, column in (('ac_type', 'Ac_type'), ('phase', 'Phase')):
        encoder = encoders[encoder_key]
        encoded = encoder.fit_transform(df[[column]])
        # OneHotEncoder returns a SciPy sparse matrix by default; densify it so
        # the DataFrame gets one numeric column per indicator.
        if hasattr(encoded, 'toarray'):
            encoded = encoded.toarray()
        encoded_frames.append(
            pd.DataFrame(
                encoded,
                # Reuse df's index: concat(axis=1) aligns on the index, so a
                # fresh RangeIndex would misalign rows (and pad with NaN)
                # whenever df has been filtered without reset_index.
                index=df.index,
                columns=encoder.get_feature_names_out([column]),
            )
        )

    df = df.drop(['Ac_type', 'Phase'], axis=1)
    return pd.concat([df, *encoded_frames], axis=1)

def remove_unreasonable_time(df, max_threshold=10000, min_flight_time=4000):
    """Filter out implausible time steps and flights that end too early.

    First, rows whose ``Time_step`` is not strictly below ``max_threshold``
    are discarded. Among the remaining rows, every ``Ac_id`` whose largest
    ``Time_step`` is strictly below ``min_flight_time`` is removed entirely.
    The original row index is preserved on the returned frame.
    """
    within_limit = df.loc[df['Time_step'] < max_threshold]

    # Largest surviving time step per flight.
    longest_step = within_limit.groupby('Ac_id')['Time_step'].max()
    short_flights = longest_step.index[longest_step < min_flight_time]

    keep_mask = ~within_limit['Ac_id'].isin(short_flights)
    return within_limit[keep_mask]

In [21]:
# Rebuild processed_df from `raw` with the notebook-local helper (drop NaNs and
# duplicates, reset the index). This rebinds the name that was earlier assigned
# from fe.process_data.
processed_df = common_preprocessing(raw)
processed_df

Unnamed: 0,wind_speed_TSN,wind_dir_TSN,wind_speed_KADUM,wind_dir_KADUM,wind_speed_PATNA,wind_dir_PATNA,wind_speed_SADAS,wind_dir_SADAS,wind_speed_DADEN,wind_dir_DADEN,...,Route,Ac_id,Ac_code,Ac_type,Ac_kts,Time_step,Phase,Ac_Lat,Ac_Lon,Ac_feet
0,3.2,130.9,1.1,130.3,2.4,202.5,0.8,142.5,1.6,141.7,...,6,R6-VJ150-21,VJ150,1,170.0,3,1,10.8134,106.6427,625.0
1,3.2,130.9,1.1,130.4,2.4,202.9,0.8,142.6,1.6,141.5,...,6,R6-VJ150-21,VJ150,1,179.0,23,1,10.8075,106.6271,1425.0
2,3.2,130.9,1.1,130.4,2.4,202.9,0.8,142.6,1.6,141.5,...,6,R6-VJ150-21,VJ150,1,205.0,43,1,10.8029,106.6096,1800.0
3,3.2,130.9,1.1,130.5,2.4,203.3,0.7,142.8,1.6,141.4,...,6,R6-VJ150-21,VJ150,1,231.0,59,1,10.8063,106.5941,2025.0
4,3.2,130.9,1.1,130.5,2.4,203.3,0.7,142.8,1.6,141.4,...,6,R6-VJ150-21,VJ150,1,255.0,75,1,10.8204,106.5810,2475.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61380,1.9,137.2,1.8,42.6,5.3,37.8,3.4,95.0,3.9,97.4,...,6,R6-VN258-04,VN258,2,157.0,5344,1,21.2532,105.6876,1775.0
61381,1.9,137.2,1.8,42.6,5.3,37.8,3.4,95.0,3.9,97.4,...,6,R6-VN258-04,VN258,2,152.0,5362,1,21.2493,105.7010,1525.0
61382,1.9,137.2,1.8,42.2,5.3,37.8,3.4,95.0,3.9,97.4,...,6,R6-VN258-04,VN258,2,146.0,5378,1,21.2461,105.7121,1325.0
61383,1.9,137.2,1.8,42.2,5.3,37.8,3.4,95.0,3.9,97.4,...,6,R6-VN258-04,VN258,2,131.0,5412,1,21.2401,105.7331,950.0


In [None]:
# Apply the time filter with its default thresholds
# (max_threshold=10000, min_flight_time=4000).
processed_df = remove_unreasonable_time(processed_df)
processed_df

In [14]:
# decode_features is given a copy, so it operates on a separate frame from processed_df.
# NOTE(review): the recorded run of this cell raised
# "ValueError: Found array with 0 sample(s)", which suggests processed_df was
# empty when it ran — confirm processed_df has rows (and is in the form
# decode_features expects) before calling.
decoded_df = fe.decode_features(processed_df.copy())
decoded_df

ValueError: Found array with 0 sample(s) (shape=(0, 2)) while a minimum of 1 is required.

In [None]:
# Persist the processed frame; index=False keeps the row index out of the CSV.
processed_df.to_csv('../data/csv/processed.csv', index=False)