### Load Libraries & Datasets

#### Libraries

In [13]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import seaborn as sns
import sklearn.preprocessing as preprocessing
import statsmodels.api as sm
import statsmodels as sm
import sys

from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, ADASYN

# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload

%matplotlib inline

src_dir = os.path.join(os.getcwd(), '..', '..', 'src')
sys.path.append(src_dir)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Datasets

In [14]:
# Read the processed gasoline samples and the raw ASTM fuel table from the
# project data folders (paths are relative to the notebook's directory).
gasoline_proc = pd.read_csv('../../data/03_processed/gasoline_processed.csv')
astm = pd.read_csv('../../data/01_raw/ASTM_fuel.csv')

# Replace the ASTM header with short names: the date key, the two season
# columns, then the "_retail" columns followed by their "_dist" counterparts.
# NOTE(review): 'distillation_50_minC _retail' contains a space before
# '_retail'. It looks like a typo, but it is kept as-is because later cells
# may refer to the column by this exact name — confirm before renaming.
astm.columns = [
    'Date',
    'TN_retailers_seasons',
    'TN_distributor_seasons',
    'vapor_liquid_minC_retail',
    'distillation_50_minC _retail',
    'distillation_50_maxC_retail',
    'vapor_pressure_maxC_retail',
    'vapor_liquid_minC_dist',
    'distillation_50_minC_dist',
    'distillation_50_maxC_dist',
    'vapor_pressure_maxC_dist',
]

#### Process Datasets

```python
# Parse the 'datesampled' column into pandas datetimes so the .dt accessor
# can be used further down
gasoline_proc['datesampled'] = pd.to_datetime(gasoline_proc['datesampled'])

# Strip leading/trailing whitespace from every ASTM column that supports
# the .str accessor
for col in astm.columns: 
    try:
        astm[col] = astm[col].str.strip()
    except AttributeError: 
        # .str is not available on non-string columns; leave those unchanged
        pass

# Drop the 'zipcode' column
gasoline_proc.drop(columns=['zipcode'], inplace=True)

# Drop rows with a missing 'grade', then rows with a missing value in any column
gasoline_proc.dropna(subset=['grade'], inplace=True)
gasoline_proc.dropna(inplace=True)

# Build a 'month/day' string key (integers cast to str, so no zero padding)
# used to join against the ASTM table
# NOTE(review): assumes the ASTM 'Date' column uses the same 'month/day'
# format — confirm against the raw file
gasoline_proc['datesampled_month'] = gasoline_proc.datesampled.dt.month
gasoline_proc['datesampled_day'] = gasoline_proc['datesampled'].dt.day
gasoline_proc['datesampled_month_day'] = gasoline_proc['datesampled_month'].astype('str') + '/' + gasoline_proc['datesampled_day'].astype('str')

# Rename the key so it matches the ASTM join column
gasoline_proc.rename(columns={'datesampled_month_day':'Date'}, inplace=True)

# Left-join the ASTM columns onto the gasoline samples (every sample row is
# kept), then renumber the index from 0
gasoline_proc = gasoline_proc.merge(astm, 
               how='left', 
                on='Date'
               )
gasoline_proc.reset_index(inplace=True, drop=True)
```

In [11]:
def process_data_for_model_building(gasoline, astm):
    """Clean the gasoline samples and left-join the ASTM table on a month/day key.

    Neither input frame is modified: all work happens on copies, so the
    function can be called repeatedly on the same inputs and always returns
    the same result.

    Parameters
    ----------
    gasoline : pandas.DataFrame
        Sample data. Must contain a 'datesampled' column that
        ``pd.to_datetime`` can parse and a 'zipcode' column (which is dropped).
    astm : pandas.DataFrame
        Table to join. Must contain a 'Date' column whose values, once
        surrounding whitespace is stripped, match the 'month/day' key built
        here (integers without zero padding, e.g. '12/7').

    Returns
    -------
    pandas.DataFrame
        The gasoline rows that have no missing values, without 'zipcode',
        with 'datesampled' as datetimes, the added columns
        'datesampled_month', 'datesampled_day' and 'Date', plus every ASTM
        column joined on 'Date' (left join). The index is renumbered from 0.

    Raises
    ------
    KeyError
        If 'datesampled' or 'zipcode' is missing from ``gasoline``, or
        'Date' is missing from ``astm``.
    """
    # Strip whitespace on a copy so the caller's ASTM frame stays untouched.
    astm_clean = astm.copy()
    for col in astm_clean.columns:
        try:
            astm_clean[col] = astm_clean[col].str.strip()
        except AttributeError:
            # Non-string columns do not support .str; nothing to strip there.
            continue

    # assign/drop/dropna each return a new frame, leaving `gasoline` as passed in.
    # A single dropna() also removes rows whose 'grade' is missing.
    samples = (
        gasoline
        .assign(datesampled=pd.to_datetime(gasoline['datesampled']))
        .drop(columns=['zipcode'])
        .dropna()
    )

    # Build the 'month/day' join key from the parsed sample date.
    month = samples['datesampled'].dt.month
    day = samples['datesampled'].dt.day
    samples = samples.assign(
        datesampled_month=month,
        datesampled_day=day,
        Date=month.astype('str') + '/' + day.astype('str'),
    )

    # Left join keeps every remaining sample even when no ASTM row matches.
    merged = samples.merge(astm_clean, how='left', on='Date')
    return merged.reset_index(drop=True)
    

In [12]:
# Keep the returned frame: the ASTM merge exists only in the return value,
# it is never written back into gasoline_proc.
gasoline_merged = process_data_for_model_building(gasoline_proc, astm)

# Preview the first rows instead of rendering the entire frame.
gasoline_merged.head(10)

Unnamed: 0,Sample,prod,datesampled,grade,supplier,facilityname,siteaddress,units_dist_50,units_vap_pressure,units_vap_liq_pressure,...,TN_retailers_seasons,TN_distributor_seasons,vapor_liquid_minC_retail,distillation_50_minC _retail,distillation_50_maxC_retail,vapor_pressure_maxC_retail,vapor_liquid_minC_dist,distillation_50_minC_dist,distillation_50_maxC_dist,vapor_pressure_maxC_dist
0,61916134,Gasoline,2015-11-23,Mid Grade Unleaded,Marathon Petroleum Lp,Circle K #2723609,"198 Haywood Ln \r\nnashville, Tn 37211",Deg. C,kPa,Deg. C,...,C-3/D-4,C-3/D-4,42.0,77.0,116.0,93.0,42.0,77.0,116.0,93.0
1,61916136,Gasoline,2015-11-24,Mid Grade Unleaded,Tri-star Energy,Twice Daily #8085,"648 Thompson Ln \r\nnashville, Tn 37204",Deg. C,kPa,Deg. C,...,C-3/D-4,C-3/D-4,42.0,77.0,116.0,93.0,42.0,77.0,116.0,93.0
2,61916138,Gasoline,2015-11-24,Regular Unleaded,"Mapco Express, Inc.",Mapco Express #3195,"4677 Trousdale Dr Nashville, Tn 37204",Deg. C,kPa,Deg. C,...,C-3/D-4,C-3/D-4,42.0,77.0,116.0,93.0,42.0,77.0,116.0,93.0
3,61916139,Gasoline,2015-12-02,Premium Unleaded,Tri-star Energy,Top It Off Holding Inc,"13016 Old Hickory Blvd \r\nantioch, Tn 37013",Deg. C,kPa,Deg. C,...,D-4,D-4,42.0,77.0,113.0,93.0,42.0,77.0,113.0,93.0
4,61916140,Gasoline,2015-12-02,Regular Unleaded,"Mapco Express, Inc.",Mapco Express #1030,"2616 Franklin Road Nashville, Tn 37204",Deg. C,kPa,Deg. C,...,D-4,D-4,42.0,77.0,113.0,93.0,42.0,77.0,113.0,93.0
5,61916142,Gasoline,2015-12-02,Mid Grade Unleaded,"Mapco Express, Inc.",Mapco Mart #3410,"4314 Harding Rd \r\nnashville, Tn 37205",Deg. C,kPa,Deg. C,...,D-4,D-4,42.0,77.0,113.0,93.0,42.0,77.0,113.0,93.0
6,61916148,Gasoline,2015-12-07,Mid Grade Unleaded,Exxonmobil,Kroger #550,"8175 Highway 100 \r\nnashville, Tn 37221",Deg. C,kPa,Deg. C,...,D-4,D-4,42.0,77.0,113.0,93.0,42.0,77.0,113.0,93.0
7,61916149,Gasoline,2015-12-07,Premium Unleaded,"Mapco Express, Inc.",Mapco Express #3414,"7670 Hwy. 70s Nashville, Tn 37221",Deg. C,kPa,Deg. C,...,D-4,D-4,42.0,77.0,113.0,93.0,42.0,77.0,113.0,93.0
8,61916150,Gasoline,2015-12-07,Regular Unleaded,Tri-star Energy,Dailys #6645,"7691 Highway 70 South Nashville, Tn 37221",Deg. C,kPa,Deg. C,...,D-4,D-4,42.0,77.0,113.0,93.0,42.0,77.0,113.0,93.0
9,61916158,Gasoline,2015-12-14,Regular Unleaded,Tri-star Energy,Bellvue Shell,"7395 Old Harding Rd \r\nnashville,, Tn 37221",Deg. C,kPa,Deg. C,...,D-4,D-4,42.0,77.0,113.0,93.0,42.0,77.0,113.0,93.0
