## News Sentiment Analysis and Crypto price prediction

In [None]:
# Book 10 - Dogecoin - XG Boost Regressor Model

In [None]:
#Importing Libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from xgboost import XGBRegressor
%matplotlib inline
from sklearn import metrics

In [2]:
# Load the news sentiment data and closing prices for Dogecoin
xgbr_df = pd.read_csv('sentiment_closing_doge.csv', index_col="Date", infer_datetime_format=True, parse_dates=True)
xgbr_df.tail()

Unnamed: 0_level_0,title_sent,text_sent,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-03-06,0.0,-1.0,0.125025,0.125611,0.120384,0.120766,0.120766,400458464.0
2022-03-07,-1.0,-1.0,0.120769,0.122563,0.115015,0.117105,0.117105,513014829.0
2022-03-08,0.0,0.0,0.117106,0.119724,0.115838,0.117029,0.117029,491414294.0
2022-03-09,0.0,-1.0,0.117047,0.123734,0.116766,0.121588,0.121588,519157507.0
2022-03-10,-1.0,0.0,,,,,,


In [33]:
# Choosing only the Relevant Columns and dropping the others
xgbr_df = xgbr_df.drop(columns=["Open", "High", "Low", "Adj Close", "text_sent"])

xgbr_df.head()

Unnamed: 0_level_0,title_sent,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-02-10,0.0,0.151889,1053631000.0
2022-02-11,0.0,0.144847,776730600.0
2022-02-12,0.0,0.144405,602699400.0
2022-02-13,0.0,0.148948,1581065000.0
2022-02-14,0.0,0.146003,898042700.0


In [34]:
# Repositioning the columns
xgbr_df = xgbr_df[["Close", "title_sent", "Volume"]]
xgbr_df.head()

Unnamed: 0_level_0,Close,title_sent,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-02-10,0.151889,0.0,1053631000.0
2022-02-11,0.144847,0.0,776730600.0
2022-02-12,0.144405,0.0,602699400.0
2022-02-13,0.148948,0.0,1581065000.0
2022-02-14,0.146003,0.0,898042700.0


In [35]:
# pct change based on close value
xgbr_df["Pct_change"] = xgbr_df["Close"].pct_change()
xgbr_df.head()

Unnamed: 0_level_0,Close,title_sent,Volume,Pct_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-02-10,0.151889,0.0,1053631000.0,
2022-02-11,0.144847,0.0,776730600.0,-0.046363
2022-02-12,0.144405,0.0,602699400.0,-0.003051
2022-02-13,0.148948,0.0,1581065000.0,0.03146
2022-02-14,0.146003,0.0,898042700.0,-0.019772


In [36]:
# Drop null values
xgbr_df.dropna(inplace = True)
xgbr_df.tail()

Unnamed: 0_level_0,Close,title_sent,Volume,Pct_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-03-05,0.124996,-1.0,334091552.0,0.019618
2022-03-06,0.120766,0.0,400458464.0,-0.033841
2022-03-07,0.117105,-1.0,513014829.0,-0.030315
2022-03-08,0.117029,0.0,491414294.0,-0.000649
2022-03-09,0.121588,0.0,519157507.0,0.038956


### Creating the Features `X` and Target `y` Data

In [39]:
# This function "window_data" accepts the column number for the features (X) and the target (y)
# It chunks the data up with a rolling window of Xt-n to predict Xt
# It returns a numpy array of X any y
def window_data(df, window, feature_col_number1, feature_col_number2, feature_col_number3, target_col_number):
    # Create empty lists "X_close", "title_sent", "Volume" and y
    X_close = []
    X_title_sent = []
    X_volume = []
    y = []
    for i in range(len(df) - window):
        
        # Get close, title_sent, Volume, and target in the loop
        Close = df.iloc[i:(i + window), feature_col_number1]
        title_sent = df.iloc[i:(i + window), feature_col_number2]
        Volume = df.iloc[i:(i + window), feature_col_number3]
        target = df.iloc[(i + window), target_col_number]
        
        # Append values in the lists
        X_close.append(Close)
        X_title_sent.append(title_sent)
        X_volume.append(Volume)
        y.append(target)
        
    return np.hstack((X_close,X_title_sent,X_volume)), np.array(y).reshape(-1, 1)

In [40]:
# Predict Closing Prices using a 3 day window of previous closing prices
window_size = 3

# Column index 0 is the `Close` column
# Column index 1 is the `title_sent` column
# Column index 2 is the `Volume` column
feature_col_number1 = 0
feature_col_number2 = 1
feature_col_number3 = 2
target_col_number = 0
X, y = window_data(xgbr_df, window_size, feature_col_number1, feature_col_number2, feature_col_number3, target_col_number)

In [41]:
# Use 70% of the data for training and 30% for testing
X_split = int(0.7 * len(X))
y_split = int(0.7 * len(y))

# Set X_train, X_test, y_train, t_test
X_train = X[: X_split]
X_test = X[X_split:]
y_train = y[: y_split]
y_test = y[y_split:]

# Scaling Data with `MinMaxScaler`

We will use the `MinMaxScaler` from `sklearn` to scale all values between `0` and `1`.
Note that we scale both features and target sets.

In [42]:
from sklearn.preprocessing import MinMaxScaler

In [43]:
# Use the MinMaxScaler to scale data between 0 and 1.
x_train_scaler = MinMaxScaler()
x_test_scaler = MinMaxScaler()
y_train_scaler = MinMaxScaler()
y_test_scaler = MinMaxScaler()

# Fit the scaler for the Training Data
x_train_scaler.fit(X_train)
y_train_scaler.fit(y_train)

# Scale the training data
X_train = x_train_scaler.transform(X_train)
y_train = y_train_scaler.transform(y_train)

# Fit the scaler for the Testing Data
x_test_scaler.fit(X_test)
y_test_scaler.fit(y_test)

# Scale the y_test data
X_test = x_test_scaler.transform(X_test)
y_test = y_test_scaler.transform(y_test)

In [44]:
# Create the XG Boost regressor instance
model = XGBRegressor(objective='reg:squarederror', n_estimators=1000)

In [45]:
# Fit the model
model.fit(X_train, y_train.ravel())

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

---

---

## Model Performance

In this section, we will evaluate the model using the test data. 

We will:
1. Evaluate the model using the `X_test` and `y_test` data.
2. Use the X_test data to make predictions
3. Create a DataFrame of Real (y_test) vs predicted values. 
4. Plot the Real vs predicted values as a line chart


In [46]:
# Make some predictions
predicted = model.predict(X_test)

In [47]:
# Evaluating the model
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, predicted)))
print('R-squared :', metrics.r2_score(y_test, predicted))

Root Mean Squared Error: 0.2466110859515458
R-squared : 0.4470051306651013


In [48]:
# Recover the original prices instead of the scaled version
predicted_prices = y_test_scaler.inverse_transform(predicted.reshape(-1, 1))
real_prices = y_test_scaler.inverse_transform(y_test.reshape(-1, 1))

In [49]:
# Create a DataFrame of Real and Predicted values
crypto = pd.DataFrame({
    "Real": real_prices.ravel(),
    "Predicted": predicted_prices.ravel()
}, index = xgbr_df.index[-len(real_prices): ]) 
xgbr_df.head()

Unnamed: 0_level_0,Close,title_sent,Volume,Pct_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-02-11,0.144847,0.0,776730600.0,-0.046363
2022-02-12,0.144405,0.0,602699400.0,-0.003051
2022-02-13,0.148948,0.0,1581065000.0,0.03146
2022-02-14,0.146003,0.0,898042700.0,-0.019772
2022-02-15,0.151761,1.0,674961500.0,0.039438


In [50]:
# Plot the real vs predicted values as a line chart
crypto.hvplot(title = "Real vs Predicted values of Dogecoin")