# Process Data for final predictor training
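>  Notebook to prepare the final training dataset for the predictor model: it preprocesses the sentiment-labelled headlines, adds topic encodings, merges in the gold price change, and saves the result.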

### Import Packages and functions

In [1]:
#Import required packages.
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
#Import custom defined objects
from modules import SetTransformer
from functions import preprocess_dataset, generate_topic_encodings, add_gold_price_change


### Load, Preprocess and Save Training Data

In [3]:
#Load the raw headline dataset (with the sentiment model's predictions) from CSV
input_file = '../Data/Final_gold-dataset-sinha-khandait1_with_predictions.csv'

df_raw = pd.read_csv(input_file)
#df_raw.shape is a (rows, columns) tuple, so label it as the shape rather than as a row count
print("Shape of the dataset (rows, columns): ", df_raw.shape)
df_raw.head()

Number of rows in the dataset:  (11412, 7)


Unnamed: 0,date,text,label,predicted_sentiment,sentiment_confidence,sentiment_logits,sentiment_probabilities
0,2016-01-28,"april gold down 20 cents to settle at $1,116.1...",2,negative,0.9996,"[-2.5769472122192383, -2.57643723487854, 5.939...","{'positive': '0.0002', 'neutral': '0.0002', 'n..."
1,2017-09-13,gold suffers third straight daily decline,2,negative,0.9993,"[-2.310701847076416, -2.489659309387207, 5.520...","{'positive': '0.0004', 'neutral': '0.0003', 'n..."
2,2016-07-26,gold futures edge up after two-session decline,0,positive,0.9992,"[5.106618404388428, -2.7181293964385986, -2.73...","{'positive': '0.9992', 'neutral': '0.0004', 'n..."
3,2018-02-28,dent research : is gold's day in the sun comin...,1,neutral,0.9969,"[-2.3254199028015137, 4.0417585372924805, -2.5...","{'positive': '0.0017', 'neutral': '0.9969', 'n..."
4,2017-06-09,"gold snaps three-day rally as trump, lawmakers...",2,positive,0.9107,"[2.0869791507720947, -2.564009428024292, -0.33...","{'positive': '0.9107', 'neutral': '0.0087', 'n..."


In [4]:
#Preprocess the raw dataset with the project's preprocess_dataset helper
df_processed = preprocess_dataset(df_raw)
#.shape is a (rows, columns) tuple, so the label describes the shape instead of claiming a row count
print("Shape after preprocessing (rows, columns): ", df_processed.shape)
df_processed.head()

Number of rows after preprocessing:  (11412, 4)


Unnamed: 0,date,text,label,sentiment
0,2016-01-28,"april gold down 20 cents to settle at $1,116.1...",2,-0.999
1,2017-09-13,gold suffers third straight daily decline,2,-0.998201
2,2016-07-26,gold futures edge up after two-session decline,0,0.998001
3,2018-02-28,dent research : is gold's day in the sun comin...,1,0.000299
4,2017-06-09,"gold snaps three-day rally as trump, lawmakers...",2,0.755972


In [5]:
#Run the processed frame through the topic-encoding helper and keep the result under the same name
df_processed = df_processed.pipe(generate_topic_encodings)
df_processed.head()

Unnamed: 0,date,text,label,sentiment,topic_encodings
0,2016-01-28,"april gold down 20 cents to settle at $1,116.1...",2,-0.999,"[-0.030607987, -0.06112092, 0.05298826, 0.0025..."
1,2017-09-13,gold suffers third straight daily decline,2,-0.998201,"[0.012578344, -0.079677634, 0.050746847, 0.028..."
2,2016-07-26,gold futures edge up after two-session decline,0,0.998001,"[0.04919272, -0.07745917, 0.04010119, -0.06572..."
3,2018-02-28,dent research : is gold's day in the sun comin...,1,0.000299,"[0.022424512, 0.023190603, 0.030217161, 0.0060..."
4,2017-06-09,"gold snaps three-day rally as trump, lawmakers...",2,0.755972,"[0.05500786, -0.063086316, 0.006439179, -0.065..."


In [19]:
#Load the GOLDBEES ETF price data
gold_price_file = '../../Jaison/Model_Evaluation_Code/Data/GOLDBEES_ETF_price_data.csv'

df_gold = pd.read_csv(gold_price_file)
df_gold.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2010-02-05,15.98,16.011,15.7385,15.765,2836000
1,2010-02-08,16.031,16.098,15.96,16.0625,445700
2,2010-02-09,16.065001,16.065001,15.96,15.9972,669100
3,2010-02-10,16.09,16.108999,16.0221,16.0609,335400
4,2010-02-11,16.099001,16.099001,16.0305,16.059299,385300


In [None]:
#Combine the processed headlines with the gold price data via the add_gold_price_change helper
final_df = df_processed.pipe(add_gold_price_change, df_gold)
final_df.head()

Unnamed: 0,Date,text,sentiment,topic_encodings,sentiment_combined_encodings,price_percentage_change
0,2016-01-28,"april gold down 20 cents to settle at $1,116.1...",-0.999,"[-0.030607987, -0.06112092, 0.05298826, 0.0025...","[0.030577388, 0.061059814, -0.052935287, -0.00...",0.191615
1,2016-01-28,gold sticks near 12-week high as fed eyes glob...,0.996703,"[0.016134372, -0.030435134, 0.05567852, 0.0271...","[0.016081171, -0.03033478, 0.05549493, 0.02705...",0.191615
2,2016-01-28,the biggest potential driver for gold prices i...,0.005663,"[0.054409396, -0.09339435, -0.0011068282, 0.01...","[0.0003081177, -0.00052888755, -6.2679132e-06,...",0.191615
3,2016-01-28,"buy comex gold if it touches $1,107-08/ounce",0.00259,"[-0.00877056, -0.017302277, 0.08381294, -0.038...","[-2.2712242e-05, -4.4805973e-05, 0.00021704198...",0.191615
4,2016-01-28,gold prices down slightly in asia with fed rat...,-0.999,"[0.007710825, -0.049220897, 0.0563185, 0.00629...","[-0.007703116, 0.04917169, -0.056262195, -0.00...",0.191615


In [None]:
#Check for rows with NANs
#Count the rows that hold at least one missing value so the printed label reports what it promises
nan_row_count = int(final_df.isnull().any(axis=1).sum())
print("df rows with NANs:", nan_row_count)
#Keep the overall yes/no answer as the cell's displayed result
final_df.isnull().values.any()

df rows with NANs:


False

In [25]:
#Persist the merged dataset to CSV for later use.
#NOTE(review): to_csv writes the row index as an unnamed first column by default - confirm downstream readers expect it.
output_file = "../Data/combined_dataset_with_price_change.csv"
final_df.to_csv(output_file)