# Baseline Classification with Macro Data

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# feature selection
from info_gain import info_gain
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import StandardScaler
# models
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor

### Macro Data Preprocessing

In [2]:
df = pd.read_csv('../data/macro/macro.csv')

In [3]:
df.head()

Unnamed: 0,date,GDPC1,CPIAUCSL,UNRATE,PAYEMS,RRSFS,GFDEBTN,VIXCLS,DFF
0,2006-01-31,15267.026,199.3,4.7,135429.0,179293.0,8371156.0,12.95,4.47
1,2006-02-28,15278.919,199.4,4.8,135737.0,177887.0,8387451.0,12.34,4.52
2,2006-03-31,15290.812,199.7,4.7,136047.0,178100.0,8403747.0,11.39,5.0
3,2006-04-30,15302.705,200.7,4.7,136205.0,178088.0,8420042.0,11.59,4.86
4,2006-05-31,15310.592667,201.3,4.6,136244.0,177200.0,8449019.0,16.44,5.05


In [4]:
def preprocessing(data):
    '''
    Description: Drop date, generate target 
                 variable column, 
                 and drop first row.

    Input:
    * Pandas DataFrame: Dataframe to be preprocessed.

    Return:
    * Pandas DataFrame: Dataframe with target column
                        generated, and first row 
                        dropped.
    '''
    
    data['diff'] = data['DFF'].diff()
    data['target'] = [1 if x > 0.01 
                      else -1 if x < -0.01 
                      else 0 
                      for x in data['diff']]
    
    to_drop = ['date', 'DFF','diff']
    data.drop(to_drop, axis=1, inplace=True)
    data = data.iloc[1:,:].reset_index().iloc[:,1:]
    
    return data

In [5]:
df = preprocessing(df)

In [6]:
df.head()

Unnamed: 0,GDPC1,CPIAUCSL,UNRATE,PAYEMS,RRSFS,GFDEBTN,VIXCLS,target
0,15278.919,199.4,4.8,135737.0,177887.0,8387451.0,12.34,1
1,15290.812,199.7,4.7,136047.0,178100.0,8403747.0,11.39,1
2,15302.705,200.7,4.7,136205.0,178088.0,8420042.0,11.59,-1
3,15310.592667,201.3,4.6,136244.0,177200.0,8449019.0,16.44,1
4,15318.480333,201.8,4.6,136325.0,177343.0,8477997.0,13.08,0


In [7]:
target = pd.DataFrame(df.target)
target.to_csv("../data/target/target.csv", index=False)

In [8]:
pd.DataFrame(df.target.value_counts())

Unnamed: 0,target
1,67
-1,62
0,52


### Train Test Split

In [9]:
X = df.drop(columns=['target'])
y = df.target
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    test_size=0.2, 
                                                    shuffle=False)

### Feature Scaling

In [10]:
pd.DataFrame(X_train[:5])

Unnamed: 0,GDPC1,CPIAUCSL,UNRATE,PAYEMS,RRSFS,GFDEBTN,VIXCLS
0,15278.919,199.4,4.8,135737.0,177887.0,8387451.0,12.34
1,15290.812,199.7,4.7,136047.0,178100.0,8403747.0,11.39
2,15302.705,200.7,4.7,136205.0,178088.0,8420042.0,11.59
3,15310.592667,201.3,4.6,136244.0,177200.0,8449019.0,16.44
4,15318.480333,201.8,4.6,136325.0,177343.0,8477997.0,13.08


In [11]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [12]:
pd.DataFrame(X_train[:5])

Unnamed: 0,0,1,2,3,4,5,6
0,-1.129064,-1.967195,-0.918581,-0.325537,0.028569,-1.628915,-0.798688
1,-1.116972,-1.944672,-0.970372,-0.263516,0.048022,-1.624793,-0.90669
2,-1.104879,-1.869597,-0.970372,-0.231906,0.046926,-1.620671,-0.883952
3,-1.096859,-1.824551,-1.022164,-0.224103,-0.034174,-1.613342,-0.332573
4,-1.088839,-1.787014,-1.022164,-0.207898,-0.021114,-1.606012,-0.714559


### Model Selection (All Features)

#### Support Vector Classification, Decision Tree, Logistic Regression with Time Series Split

In [13]:
classifiers = []
classifiers.extend([("SVC", SVC(random_state=1)), 
                    ("Decision Tree", DecisionTreeClassifier(random_state=1)), 
                    ("Logistic Regression", LogisticRegression(random_state=1, max_iter=200))])

In [14]:
results = []
classifier_name = []
for classifier in classifiers:
    tscv = TimeSeriesSplit(n_splits=5)
    all_y_preds = []
    all_y_true = []
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index[0]:train_index[-1]+1], X[test_index[0]:test_index[-1]+1]
        y_train, y_test = y[train_index[0]:train_index[-1]+1], y[test_index[0]:test_index[-1]+1]
        classifier[1].fit(X_train, y_train)
        y_preds = classifier[1].predict(X_test)
        all_y_preds.extend(y_preds)
        all_y_true.extend(y_test)
    results.append(f1_score(all_y_true,all_y_preds, average='weighted'))
    classifier_name.append(classifier[0])

In [15]:
baseline_results = pd.DataFrame([classifier_name, results]).transpose()\
                                                    .rename(columns={0:'Model',\
                                                                     1:'F1-Weighted'})\
                                                    .sort_values(by='F1-Weighted', \
                                                                 ascending=False)\
                                                    .reset_index().iloc[:,1:]
baseline_results

Unnamed: 0,Model,F1-Weighted
0,Decision Tree,0.389369
1,Logistic Regression,0.340643
2,SVC,0.29531
