# Feature Engineering 

The exploratory data analysis section showed that the oil-producing countries are highly correlated with each other. To simplify our features, all oil producers will be averaged and combined into a single feature.

# Imports 

In [37]:
import pandas as pd
import sys 
import os 

# Make the sibling ``utils`` folder importable from this notebook.
# The location is resolved relative to the current working directory
# (notebooks have no usable __file__), and abspath already returns a
# normalized path.
path2add = os.path.abspath(os.path.join(os.path.pardir, "utils"))
if path2add not in sys.path:
    sys.path.append(path2add)

from cleaning import oil_api_keys
from feature_engineering import *
from eda import *

In [38]:
# Load the cleaned dataset and promote the "Date" column to the index.
# set_index drops the column it consumes by default, so no separate drop
# of "Date" is required afterwards.
features_df = pd.read_csv("../data/cleaned_dataset.csv").set_index("Date")

In [39]:
# Preview the first rows to sanity-check the Date index and the loaded columns.
features_df.head()

Unnamed: 0_level_0,crude_price,sp_500,vix,usd,libya_oil_production,kazak_oil_production,qatar_oil_production,iran_oil_production,kuwait_oil_production,uae_oil_production,saudi_oil_production,iraq_oil_production,usa_oil_production,world_oil_consumption
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2002-01-01,20.605,1155.25,22.27,117.885,1383184.0,972101.1616,694246.5753,3248200.0,1745866.0,1930000.0,7090000.0,1823000.0,5873000.0,43650.734
2002-01-02,21.01,1154.0,22.709999,116.269997,1383184.0,972101.1616,694246.5753,3248200.0,1745866.0,1930000.0,7090000.0,1823000.0,5873000.0,43650.734
2002-01-03,20.370001,1166.75,21.34,116.540001,1383184.0,972101.1616,694246.5753,3248200.0,1745866.0,1930000.0,7090000.0,1823000.0,5873000.0,43650.734
2002-01-04,21.620001,1175.25,20.450001,116.75,1383184.0,972101.1616,694246.5753,3248200.0,1745866.0,1930000.0,7090000.0,1823000.0,5873000.0,43650.734
2002-01-05,20.710001,1161.3125,22.075,116.692499,1383184.0,972101.1616,694246.5753,3248200.0,1745866.0,1930000.0,7090000.0,1823000.0,5873000.0,43650.734


Averaging the production of all the oil-producing countries to create a new feature

In [40]:
# Collapse the per-country production columns into one aggregate feature.
# oil_api_keys is imported from the project's `cleaning` module;
# oil_production_feature presumably comes from the `feature_engineering`
# star import (its source is not shown here). Judging by the output below,
# the per-country columns are replaced by a single `world_oil_production`
# column.
# NOTE(review): the result is assigned back to `features_df`, so re-running
# this cell passes the already-transformed frame to the helper. Presumably
# that fails once the country columns are gone -- confirm, and prefer a
# Restart & Run All over re-executing this cell.
features_df = oil_production_feature(features_df, oil_api_keys)
features_df

Unnamed: 0_level_0,crude_price,sp_500,vix,usd,world_oil_consumption,world_oil_production
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2002-01-01,20.605000,1155.2500,22.270000,117.885000,43650.734,2.751066e+06
2002-01-02,21.010000,1154.0000,22.709999,116.269997,43650.734,2.751066e+06
2002-01-03,20.370001,1166.7500,21.340000,116.540001,43650.734,2.751066e+06
2002-01-04,21.620001,1175.2500,20.450001,116.750000,43650.734,2.751066e+06
2002-01-05,20.710001,1161.3125,22.075000,116.692499,43650.734,2.751066e+06
...,...,...,...,...,...,...
2023-01-03,76.930000,3846.0000,22.900000,104.311996,53494.272,4.289287e+06
2023-01-04,72.839996,3874.5000,22.010000,104.021004,53561.130,4.289287e+06
2023-01-05,73.669998,3829.0000,22.459999,104.828003,53672.560,4.289287e+06
2023-01-06,73.769997,3915.5000,21.129999,103.646004,53895.420,4.289287e+06


To reduce noise in the data, a 7-day rolling average can be taken. (Note: this step is currently commented out in the cell below, so it is not applied.)

In [41]:
# NOTE(review): the rolling-mean step is disabled (commented out), so the
# frame saved in the next cell is NOT smoothed, even though the surrounding
# text describes a 7-day rolling average. Uncomment the lines below to apply
# it (assuming rolling_mean_feature is still provided by the utils modules),
# or update the narrative to match.
# features_df = rolling_mean_feature(features_df, 7)
# features_df

In [42]:
# Write the engineered frame to disk. to_csv includes the index by default,
# so the Date index is written as the first column.
features_df.to_csv(path_or_buf='../data/feature_engineered_dataset.csv')

# Summary 

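- Two new features were designed to improve the machine learning models' accuracy: an averaged oil-production feature and a 7-day rolling mean.
- Oil production data was averaged to reduce the model's over-reliance on any single country and to smooth the data volatility.
- The 7-day rolling mean is intended to smooth data volatility. Note that this step is currently commented out in the notebook, so the exported dataset contains the unsmoothed values.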