# Libraries

In [148]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest, chi2, f_classif, f_regression 

import warnings
warnings.filterwarnings("ignore")

# Data

In [141]:
# Load the raw dataset and order it chronologically, indexed by date.
data = pd.read_csv("./Datos ecopetrol/DATOS_PREDICTIVOS_EC.csv")
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values(by='Date').set_index("Date")

# close1 is "Close" shifted by -1 (the following row's value); it is placed
# as the first column of the frame.
data["close1"] = data["Close"].shift(-1)
column_to_insert = data.pop('close1')
data.insert(0, 'close1', column_to_insert)

data = data.drop(columns=["Close", "Open", "diff", "trend", "open1"])

# Topic columns are coerced to numeric (unparseable values become missing)
# and stored as nullable integers; rows with any missing value are dropped.
topic_columns = [name for name in data.columns if 'Topic' in name]
data[topic_columns] = (
    data[topic_columns]
    .apply(pd.to_numeric, errors='coerce')
    .astype('Int64')
)
data = data.dropna()

# Get relevant features for only train data: keep rows dated on or before date1.
date1 = "2020-07-12"
data = data.loc[data.index <= date1]

# Every column that acts as a prediction target somewhere in the notebook.
trend_targets = ["trend1", "trend2", "trend3", "trend4"]
diff_targets = ["diff1", "diff2", "diff3", "diff4"]
close_targets = ["close1", "close2", "close3", "close4"]
all_targets = trend_targets + diff_targets + close_targets


def _single_target_frame(target):
    """Return a copy of `data` with every target column except `target` dropped."""
    return data.drop(columns=[name for name in all_targets if name != target])


# One frame per target: the kept target column plus all non-target columns.
trend1, trend2, trend3, trend4 = map(_single_target_frame, trend_targets)

diff1, diff2, diff3, diff4 = map(_single_target_frame, diff_targets)

# SelectKBest for trends

## Categorical

In [126]:
# Score every topic column against each trend target with the chi-squared
# statistic and keep the columns whose score exceeds `threshold`.
dataframes = [trend1,trend2,trend3,trend4]

topic_columns = [col for col in data.columns if 'Topic' in col]

# Minimum score a feature must exceed to be reported.
threshold = 1

objectives = []
final_list = []

for frame in dataframes:
    # The first column of each frame is used as the target. It is passed as a
    # 1-D Series: a one-column DataFrame only works through scikit-learn's
    # column-vector conversion (and its DataConversionWarning).
    y_train = frame.iloc[:, 0]
    target_name = y_train.name
    X_train = frame[topic_columns]

    # Only `scores_` is read below, and it is computed for every column
    # regardless of `k`; k="all" avoids a meaningless (and, with fewer than
    # 10 columns, invalid) top-k setting.
    select_feature = SelectKBest(chi2, k="all").fit(X_train, y_train)

    selected_features_df = pd.DataFrame({'Feature': list(X_train.columns),
                                         'Scores': select_feature.scores_})

    # NaN scores compare False against the threshold and are left out.
    above_threshold = selected_features_df['Scores'] > threshold
    features_list = selected_features_df.loc[above_threshold, 'Feature'].tolist()

    objectives.append(target_name)
    final_list.append(features_list)

    print(target_name, features_list)

# One row per target: its name and the list of selected feature names.
cat_features = pd.DataFrame({"Objective":objectives,"Variables":final_list})

trend1 ['Topic_7_appeared', 'Topic_24_appeared', 'Topic_10_appeared', 'Topic_33_appeared', 'Topic_19_appeared', 'Topic_31_appeared', 'Topic_17_appeared']
trend2 ['Topic_5_appeared', 'Topic_2_appeared', 'Topic_28_appeared', 'Topic_7_appeared', 'Topic_24_appeared', 'Topic_16_appeared', 'Topic_23_appeared', 'Topic_29_appeared', 'Topic_9_appeared', 'Topic_19_appeared', 'Topic_13_appeared', 'Topic_20_appeared', 'Topic_17_appeared']
trend3 ['Topic_2_appeared', 'Topic_16_appeared', 'Topic_4_appeared', 'Topic_29_appeared', 'Topic_32_appeared', 'Topic_18_appeared', 'Topic_22_appeared']
trend4 ['Topic_14_appeared', 'Topic_24_appeared', 'Topic_16_appeared', 'Topic_11_appeared', 'Topic_23_appeared', 'Topic_26_appeared', 'Topic_29_appeared', 'Topic_19_appeared', 'Topic_37_appeared', 'Topic_18_appeared']


## Scalar

In [124]:
# Score every non-topic feature against each trend target with the ANOVA
# F-statistic (f_classif) and keep the features whose score exceeds `threshold`.
dataframes = [trend1,trend2,trend3,trend4]

topic_columns = [col for col in data.columns if 'Topic' in col]

# Minimum score a feature must exceed to be reported.
threshold = 1

objectives = []
final_list = []

for frame in dataframes:
    # The first column of each frame is used as the target. It is passed as a
    # 1-D Series: a one-column DataFrame only works through scikit-learn's
    # column-vector conversion (and its DataConversionWarning).
    y_train = frame.iloc[:, 0]
    target_name = y_train.name

    # Features are all columns that are neither topic columns nor the target.
    # Index.difference is kept on purpose: it returns the names sorted, which
    # fixes the order of the reported features.
    feature_columns = frame.columns.difference([*topic_columns, target_name])
    X_train = frame[feature_columns]

    # Only `scores_` is read below, and it is computed for every column
    # regardless of `k`; k="all" avoids a meaningless (and, with fewer than
    # 10 columns, invalid) top-k setting.
    select_feature = SelectKBest(f_classif, k="all").fit(X_train, y_train)

    selected_features_df = pd.DataFrame({'Feature': list(X_train.columns),
                                         'Scores': select_feature.scores_})

    # NaN scores compare False against the threshold and are left out.
    above_threshold = selected_features_df['Scores'] > threshold
    features_list = selected_features_df.loc[above_threshold, 'Feature'].tolist()

    objectives.append(target_name)
    final_list.append(features_list)

    print(target_name, features_list)

# One row per target: its name and the list of selected feature names.
esc_features = pd.DataFrame({"Objective":objectives,"Variables":final_list})

trend1 ['average_headline_vader_10periods', 'title_vader_sentiment']
trend2 ['average_headline_vader_10periods', 'average_title_vader_10periods', 'headline_vader_sentiment', 'negative_vader', 'neutral', 'neutral_vader']
trend3 ['average_title_vader_10periods', 'headline_vader_sentiment', 'neutral', 'neutral_vader']
trend4 ['average_finbert_10periods', 'average_title_vader_10periods', 'neutral', 'neutral_vader']


# SelectKBest for volatility


## Categorical

In [153]:
# Score every topic column against each diff target with the univariate
# regression F-statistic (f_regression) and keep the columns whose score
# exceeds `threshold`.
dataframes = [diff1,diff2,diff3,diff4]

topic_columns = [col for col in data.columns if 'Topic' in col]

# Minimum score a feature must exceed to be reported.
threshold = 1

objectives = []
final_list = []

for frame in dataframes:
    # The first column of each frame is used as the target. It is passed as a
    # 1-D Series: a one-column DataFrame only works through scikit-learn's
    # column-vector conversion (and its DataConversionWarning).
    y_train = frame.iloc[:, 0]
    target_name = y_train.name
    X_train = frame[topic_columns]

    # Only `scores_` is read below, and it is computed for every column
    # regardless of `k`; k="all" avoids a meaningless (and, with fewer than
    # 10 columns, invalid) top-k setting.
    select_feature = SelectKBest(f_regression, k="all").fit(X_train, y_train)

    selected_features_df = pd.DataFrame({'Feature': list(X_train.columns),
                                         'Scores': select_feature.scores_})

    # NaN scores compare False against the threshold and are left out.
    above_threshold = selected_features_df['Scores'] > threshold
    features_list = selected_features_df.loc[above_threshold, 'Feature'].tolist()

    objectives.append(target_name)
    final_list.append(features_list)

    print(target_name, features_list)

# One row per target: its name and the list of selected feature names.
# NOTE(review): this rebinds `cat_features`, the same name the trend/chi2 cell
# assigns, so only the most recently executed cell's result is kept.
cat_features = pd.DataFrame({"Objective":objectives,"Variables":final_list})

diff1 ['Topic_-1_appeared', 'Topic_5_appeared', 'Topic_7_appeared', 'Topic_21_appeared', 'Topic_14_appeared', 'Topic_24_appeared', 'Topic_12_appeared', 'Topic_4_appeared', 'Topic_23_appeared', 'Topic_29_appeared', 'Topic_33_appeared', 'Topic_19_appeared', 'Topic_20_appeared', 'Topic_30_appeared', 'Topic_32_appeared', 'Topic_17_appeared']
diff2 ['Topic_2_appeared', 'Topic_8_appeared', 'Topic_14_appeared', 'Topic_24_appeared', 'Topic_16_appeared', 'Topic_12_appeared', 'Topic_23_appeared', 'Topic_29_appeared', 'Topic_20_appeared', 'Topic_30_appeared', 'Topic_18_appeared', 'Topic_17_appeared']
diff3 ['Topic_2_appeared', 'Topic_7_appeared', 'Topic_14_appeared', 'Topic_24_appeared', 'Topic_16_appeared', 'Topic_12_appeared', 'Topic_11_appeared', 'Topic_23_appeared', 'Topic_29_appeared', 'Topic_38_appeared', 'Topic_33_appeared', 'Topic_19_appeared', 'Topic_20_appeared', 'Topic_17_appeared', 'Topic_22_appeared']
diff4 ['Topic_1_appeared', 'Topic_2_appeared', 'Topic_36_appeared', 'Topic_7_appear

## Scalar

In [155]:
# Score every non-topic feature against each diff target with the univariate
# regression F-statistic (f_regression) and print the features whose score
# exceeds `threshold`.
dataframes = [diff1,diff2,diff3,diff4]

topic_columns = [col for col in data.columns if 'Topic' in col]

# Minimum score a feature must exceed to be reported.
threshold = 1

# NOTE(review): unlike the other selection cells, the results collected here
# are not assembled into a summary DataFrame — confirm whether that is intended.
objectives = []
final_list = []

for frame in dataframes:
    # The first column of each frame is used as the target. It is passed as a
    # 1-D Series: a one-column DataFrame only works through scikit-learn's
    # column-vector conversion (and its DataConversionWarning).
    y_train = frame.iloc[:, 0]
    target_name = y_train.name

    # Features are all columns that are neither topic columns nor the target.
    # Index.difference is kept on purpose: it returns the names sorted, which
    # fixes the order of the reported features.
    feature_columns = frame.columns.difference([*topic_columns, target_name])
    X_train = frame[feature_columns]

    # Only `scores_` is read below, and it is computed for every column
    # regardless of `k`; k="all" avoids a meaningless (and, with fewer than
    # 10 columns, invalid) top-k setting.
    select_feature = SelectKBest(f_regression, k="all").fit(X_train, y_train)

    selected_features_df = pd.DataFrame({'Feature': list(X_train.columns),
                                         'Scores': select_feature.scores_})

    # NaN scores compare False against the threshold and are left out.
    above_threshold = selected_features_df['Scores'] > threshold
    features_list = selected_features_df.loc[above_threshold, 'Feature'].tolist()

    objectives.append(target_name)
    final_list.append(features_list)

    print(target_name, features_list)

diff1 ['title_vader_sentiment']
diff2 ['headline_vader_sentiment', 'negative', 'negative_vader', 'neutral_vader']
diff3 ['average_title_vader_10periods', 'negative', 'negative_vader', 'neutral_vader']
diff4 ['average_title_vader_10periods', 'headline_vader_sentiment', 'negative', 'negative_vader', 'neutral_vader']
