In [13]:
import os
import pandas as pd
import numpy as np
import spacy
import pathlib
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [10]:
# BASE_PATH = '/content/drive/MyDrive/Cryptofuture'
BASE_PATH = './api/analysis'

In [4]:
# Model

class CnnModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.vocab_size = 6002
        self.embed_size = 100
        self.num_filters = 100
        self.filter_sizes = [3, 4, 5]
        self.output_classes = 2
        self.dropout = 0.8

        # Embedding layer
        self.embedding = nn.Embedding(self.vocab_size, self.embed_size)

        # Convolutional layer
        self.convs = nn.ModuleList([
                                    nn.Conv2d(
                                        in_channels=1, 
                                        out_channels=self.num_filters,
                                        kernel_size=(fs, self.embed_size)) 
                                    for fs in self.filter_sizes
        ])

        # Fully connected layer
        self.fc = nn.Linear(len(self.filter_sizes) * self.num_filters, self.output_classes)

        # Dropout layer
        self.dropout = nn.Dropout(self.dropout)

    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)

        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]

        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

    
model = CnnModel().to(device)
model.load_state_dict(torch.load(os.path.join(BASE_PATH, 'cnn_model.pt'), map_location=device))

<All keys matched successfully>

In [6]:
with open(os.path.join(BASE_PATH, 'vocab.pickle'), 'rb') as t:
  text = pickle.load(t)

nlp = spacy.load("en_core_web_sm")

def positive_sentiment(tweet, model=model, min_length=10):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)]
    if len(tokenized) < min_length:
        tokenized += ["<pad>"] * (min_length - len(tokenized))
    indexed = [text[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.softmax(model(tensor, len(tensor)), dim=1)
    prediction = torch.squeeze(prediction)
    return prediction[1].item()


def negative_sentiment(tweet, model=model, min_length=10):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)]
    if len(tokenized) < min_length:
        tokenized += ["<pad>"] * (min_length - len(tokenized))
    indexed = [text[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.softmax(model(tensor, len(tensor)), dim=1)
    prediction = torch.squeeze(prediction)
    return prediction[0].item()

In [14]:
tweets_dataset = os.path.join(BASE_PATH, 'tweets.csv')

df = pd.read_csv(tweets_dataset)

tweets_df_dropped = df[['date', 'tweet', 'likes_count', 'retweets_count']]
# tweets_df_dropped

Unnamed: 0,date,tweet,likes_count,retweets_count
0,2021-05-19,#bitcoin Miami conf. https://t.co/rbAit475V3,70,6
1,2021-05-19,Bitcoin soundtrack by Raça Negra,66,9
2,2021-05-19,When you’ve officially run out of fiat to buy ...,133,18
3,2021-05-19,#bitcoin REMINDER 👇 During 2015-2017 $BTC bul...,118,29
4,2021-05-19,LIKE IF YOU HELD STRONG!! RETWEET IF YOU BOUG...,104,31
...,...,...,...,...
10341,2021-05-15,"Days after sending Dogecoin sliding, Musk anno...",34,16
10342,2021-05-15,#Coin olarak yarın akşam 8 adet yazacağım heps...,726,18
10343,2021-05-15,Some advice for @elonmusk &amp; @Tesla. You s...,122,23
10344,2021-05-15,I told you to buy $LINK at $7. I told you $LTC...,169,29


In [17]:
tweets_df_dropped["pos_sentiment"] = tweets_df_dropped.tweet.apply(lambda x: positive_sentiment(x))
tweets_df_dropped["neg_sentiment"] = tweets_df_dropped.tweet.apply(lambda x: negative_sentiment(x))

# tweets_df_dropped

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,date,tweet,likes_count,retweets_count,pos_sentiment,neg_sentiment
0,2021-05-19,#bitcoin Miami conf. https://t.co/rbAit475V3,70,6,0.641866,0.358134
1,2021-05-19,Bitcoin soundtrack by Raça Negra,66,9,0.842248,0.157752
2,2021-05-19,When you’ve officially run out of fiat to buy ...,133,18,0.457093,0.542907
3,2021-05-19,#bitcoin REMINDER 👇 During 2015-2017 $BTC bul...,118,29,0.258801,0.741199
4,2021-05-19,LIKE IF YOU HELD STRONG!! RETWEET IF YOU BOUG...,104,31,0.804997,0.195003
...,...,...,...,...,...,...
10341,2021-05-15,"Days after sending Dogecoin sliding, Musk anno...",34,16,0.404322,0.595678
10342,2021-05-15,#Coin olarak yarın akşam 8 adet yazacağım heps...,726,18,0.555643,0.444357
10343,2021-05-15,Some advice for @elonmusk &amp; @Tesla. You s...,122,23,0.706393,0.293607
10344,2021-05-15,I told you to buy $LINK at $7. I told you $LTC...,169,29,0.761905,0.238095


In [20]:
tweets_df_dropped.to_csv(os.path.join(BASE_PATH, 'tweets_sentiments.csv'), index=False)

In [18]:
gp = tweets_df_dropped.groupby(tweets_df_dropped.date)

# len(gp), len(tweets_df_dropped.date.unique())

(5, 5)

In [19]:
final = []

column_names = [
                'date', 'pos_likes_count', 'neg_likes_count', 'pos_retweets_count', 
                'neg_retweets_count', 'sum_pos_sen', 
                'sum_neg_sen', 'mean_pos_sen', 'mean_neg_sen',
                'median_pos_sen', 'median_neg_sen', 'std_pos_sen', 'std_neg_sen',
                'skew_pos_sen', 'skew_neg_sen', 'kurtosis_pos_sen', 'kurtosis_neg_sen',
                'tweet_vol'
]

for date in tweets_df_dropped.date.unique()[::-1]:

    gp_df = gp.get_group(date)

    pos_gp_df = gp_df[gp_df['pos_sentiment'] > gp_df['neg_sentiment']]
    neg_gp_df = gp_df[gp_df['neg_sentiment'] > gp_df['pos_sentiment']]

    date = date

    pos_likes_count = pos_gp_df.likes_count.sum()
    neg_likes_count = neg_gp_df.likes_count.sum()

    pos_retweets_count = pos_gp_df.retweets_count.sum()
    neg_retweets_count = neg_gp_df.retweets_count.sum()

    sum_pos_sen = pos_gp_df.pos_sentiment.sum()
    sum_neg_sen = neg_gp_df.neg_sentiment.sum()

    mean_pos_sen = pos_gp_df.pos_sentiment.mean()
    mean_neg_sen = neg_gp_df.neg_sentiment.mean()

    median_pos_sen = pos_gp_df.pos_sentiment.median()
    median_neg_sen = neg_gp_df.neg_sentiment.median()

    std_pos_sen = pos_gp_df.pos_sentiment.std()
    std_neg_sen = neg_gp_df.neg_sentiment.std()

    skew_pos_sen = pos_gp_df.pos_sentiment.skew()
    skew_neg_sen = neg_gp_df.neg_sentiment.skew()

    kurtosis_pos_sen = pos_gp_df.pos_sentiment.kurtosis()
    kurtosis_neg_sen = neg_gp_df.neg_sentiment.kurtosis()

    tweet_vol = len(gp_df)

    final.append([
                  date, pos_likes_count, neg_likes_count, pos_retweets_count, 
                  neg_retweets_count, sum_pos_sen,
                  sum_neg_sen, mean_pos_sen, mean_neg_sen,
                  median_pos_sen, median_neg_sen, std_pos_sen, std_neg_sen,
                  skew_pos_sen, skew_neg_sen, kurtosis_pos_sen, kurtosis_neg_sen,
                  tweet_vol])

df = pd.DataFrame(final, columns=column_names)

# df

Unnamed: 0,date,pos_likes_count,neg_likes_count,pos_retweets_count,neg_retweets_count,sum_pos_sen,sum_neg_sen,mean_pos_sen,mean_neg_sen,median_pos_sen,median_neg_sen,std_pos_sen,std_neg_sen,skew_pos_sen,skew_neg_sen,kurtosis_pos_sen,kurtosis_neg_sen,tweet_vol
0,2021-05-15,85233,28983,16647,4403,102.970406,36.584523,0.730287,0.665173,0.738183,0.667719,0.135558,0.109282,0.011508,0.480412,-1.201264,-0.520526,196
1,2021-05-16,522563,332178,119287,41952,687.413068,326.498656,0.723593,0.690272,0.718551,0.674827,0.132703,0.128817,0.168048,0.435201,-1.037772,-0.82312,1423
2,2021-05-17,1147992,814476,189001,106938,1552.221224,813.396563,0.720288,0.688153,0.713019,0.672298,0.125332,0.127309,0.171279,0.401858,-0.994159,-0.90058,3337
3,2021-05-18,585136,269283,112971,39374,978.543638,430.645039,0.725923,0.685741,0.723166,0.670079,0.129661,0.125301,0.158011,0.41873,-1.001836,-0.820806,1976
4,2021-05-19,851546,465392,156906,61359,1586.590648,814.883092,0.705779,0.698871,0.697125,0.687135,0.120695,0.129969,0.304487,0.301695,-0.807483,-0.966413,3414


In [22]:
prices_df = pd.read_csv(os.path.join(BASE_PATH, 'prices.csv'))

prices_df

Unnamed: 0,DateTime,Price Lag 0,Price Lag 1,Price Lag 2,Price Lag 3,Price Lag 4,Price Lag 5,Price Lag 6
0,2021-05-19 00:00:00,37421.38,42886.02,43552.14,46490.71,46775.29,49887.96,49700.6
1,2021-05-18 00:00:00,42886.02,43552.14,46490.71,46775.29,49887.96,49700.6,49504.08
2,2021-05-17 00:00:00,43552.14,46490.71,46775.29,49887.96,49700.6,49504.08,56747.52
3,2021-05-16 00:00:00,46490.71,46775.29,49887.96,49700.6,49504.08,56747.52,55870.01
4,2021-05-15 00:00:00,46775.29,49887.96,49700.6,49504.08,56747.52,55870.01,58294.18
5,2021-05-14 00:00:00,49887.96,49700.6,49504.08,56747.52,55870.01,58294.18,58943.13
6,2021-05-13 00:00:00,49700.6,49504.08,56747.52,55870.01,58294.18,58943.13,57374.33
7,2021-05-12 00:00:00,49504.08,56747.52,55870.01,58294.18,58943.13,57374.33,56441.94
8,2021-05-11 00:00:00,56747.52,55870.01,58294.18,58943.13,57374.33,56441.94,57506.88
9,2021-05-10 00:00:00,55870.01,58294.18,58943.13,57374.33,56441.94,57506.88,53244.07


In [30]:
prices_df_subset = prices_df.loc[0: 4].reset_index(drop=True)

prices_df_subset

Unnamed: 0,DateTime,Price Lag 0,Price Lag 1,Price Lag 2,Price Lag 3,Price Lag 4,Price Lag 5,Price Lag 6
0,2021-05-19 00:00:00,37421.38,42886.02,43552.14,46490.71,46775.29,49887.96,49700.6
1,2021-05-18 00:00:00,42886.02,43552.14,46490.71,46775.29,49887.96,49700.6,49504.08
2,2021-05-17 00:00:00,43552.14,46490.71,46775.29,49887.96,49700.6,49504.08,56747.52
3,2021-05-16 00:00:00,46490.71,46775.29,49887.96,49700.6,49504.08,56747.52,55870.01
4,2021-05-15 00:00:00,46775.29,49887.96,49700.6,49504.08,56747.52,55870.01,58294.18


In [33]:
df_subset = df[::-1].reset_index(drop=True)

df_subset

Unnamed: 0,date,pos_likes_count,neg_likes_count,pos_retweets_count,neg_retweets_count,sum_pos_sen,sum_neg_sen,mean_pos_sen,mean_neg_sen,median_pos_sen,median_neg_sen,std_pos_sen,std_neg_sen,skew_pos_sen,skew_neg_sen,kurtosis_pos_sen,kurtosis_neg_sen,tweet_vol
0,2021-05-19,851546,465392,156906,61359,1586.590648,814.883092,0.705779,0.698871,0.697125,0.687135,0.120695,0.129969,0.304487,0.301695,-0.807483,-0.966413,3414
1,2021-05-18,585136,269283,112971,39374,978.543638,430.645039,0.725923,0.685741,0.723166,0.670079,0.129661,0.125301,0.158011,0.41873,-1.001836,-0.820806,1976
2,2021-05-17,1147992,814476,189001,106938,1552.221224,813.396563,0.720288,0.688153,0.713019,0.672298,0.125332,0.127309,0.171279,0.401858,-0.994159,-0.90058,3337
3,2021-05-16,522563,332178,119287,41952,687.413068,326.498656,0.723593,0.690272,0.718551,0.674827,0.132703,0.128817,0.168048,0.435201,-1.037772,-0.82312,1423
4,2021-05-15,85233,28983,16647,4403,102.970406,36.584523,0.730287,0.665173,0.738183,0.667719,0.135558,0.109282,0.011508,0.480412,-1.201264,-0.520526,196


In [34]:
final_df = pd.concat([df_subset, prices_df_subset], axis=1)

final_df

Unnamed: 0,date,pos_likes_count,neg_likes_count,pos_retweets_count,neg_retweets_count,sum_pos_sen,sum_neg_sen,mean_pos_sen,mean_neg_sen,median_pos_sen,median_neg_sen,std_pos_sen,std_neg_sen,skew_pos_sen,skew_neg_sen,kurtosis_pos_sen,kurtosis_neg_sen,tweet_vol,DateTime,Price Lag 0,Price Lag 1,Price Lag 2,Price Lag 3,Price Lag 4,Price Lag 5,Price Lag 6
0,2021-05-19,851546,465392,156906,61359,1586.590648,814.883092,0.705779,0.698871,0.697125,0.687135,0.120695,0.129969,0.304487,0.301695,-0.807483,-0.966413,3414,2021-05-19 00:00:00,37421.38,42886.02,43552.14,46490.71,46775.29,49887.96,49700.6
1,2021-05-18,585136,269283,112971,39374,978.543638,430.645039,0.725923,0.685741,0.723166,0.670079,0.129661,0.125301,0.158011,0.41873,-1.001836,-0.820806,1976,2021-05-18 00:00:00,42886.02,43552.14,46490.71,46775.29,49887.96,49700.6,49504.08
2,2021-05-17,1147992,814476,189001,106938,1552.221224,813.396563,0.720288,0.688153,0.713019,0.672298,0.125332,0.127309,0.171279,0.401858,-0.994159,-0.90058,3337,2021-05-17 00:00:00,43552.14,46490.71,46775.29,49887.96,49700.6,49504.08,56747.52
3,2021-05-16,522563,332178,119287,41952,687.413068,326.498656,0.723593,0.690272,0.718551,0.674827,0.132703,0.128817,0.168048,0.435201,-1.037772,-0.82312,1423,2021-05-16 00:00:00,46490.71,46775.29,49887.96,49700.6,49504.08,56747.52,55870.01
4,2021-05-15,85233,28983,16647,4403,102.970406,36.584523,0.730287,0.665173,0.738183,0.667719,0.135558,0.109282,0.011508,0.480412,-1.201264,-0.520526,196,2021-05-15 00:00:00,46775.29,49887.96,49700.6,49504.08,56747.52,55870.01,58294.18


In [39]:
final_df.drop(['DateTime'], axis=1)

Unnamed: 0,date,pos_likes_count,neg_likes_count,pos_retweets_count,neg_retweets_count,sum_pos_sen,sum_neg_sen,mean_pos_sen,mean_neg_sen,median_pos_sen,median_neg_sen,std_pos_sen,std_neg_sen,skew_pos_sen,skew_neg_sen,kurtosis_pos_sen,kurtosis_neg_sen,tweet_vol,Price Lag 0,Price Lag 1,Price Lag 2,Price Lag 3,Price Lag 4,Price Lag 5,Price Lag 6
0,2021-05-19,851546,465392,156906,61359,1586.590648,814.883092,0.705779,0.698871,0.697125,0.687135,0.120695,0.129969,0.304487,0.301695,-0.807483,-0.966413,3414,37421.38,42886.02,43552.14,46490.71,46775.29,49887.96,49700.6
1,2021-05-18,585136,269283,112971,39374,978.543638,430.645039,0.725923,0.685741,0.723166,0.670079,0.129661,0.125301,0.158011,0.41873,-1.001836,-0.820806,1976,42886.02,43552.14,46490.71,46775.29,49887.96,49700.6,49504.08
2,2021-05-17,1147992,814476,189001,106938,1552.221224,813.396563,0.720288,0.688153,0.713019,0.672298,0.125332,0.127309,0.171279,0.401858,-0.994159,-0.90058,3337,43552.14,46490.71,46775.29,49887.96,49700.6,49504.08,56747.52
3,2021-05-16,522563,332178,119287,41952,687.413068,326.498656,0.723593,0.690272,0.718551,0.674827,0.132703,0.128817,0.168048,0.435201,-1.037772,-0.82312,1423,46490.71,46775.29,49887.96,49700.6,49504.08,56747.52,55870.01
4,2021-05-15,85233,28983,16647,4403,102.970406,36.584523,0.730287,0.665173,0.738183,0.667719,0.135558,0.109282,0.011508,0.480412,-1.201264,-0.520526,196,46775.29,49887.96,49700.6,49504.08,56747.52,55870.01,58294.18


In [40]:
final_df['zscore_sum_pos_sen'] = (final_df['sum_pos_sen'] - final_df['sum_pos_sen'].mean()) / final_df['sum_pos_sen'].std(ddof=0) 
final_df['zscore_sum_neg_sen'] = (final_df['sum_neg_sen'] - final_df['sum_neg_sen'].mean()) / final_df['sum_neg_sen'].std(ddof=0)

final_df['zscore_mean_pos_sen'] = (final_df['mean_pos_sen'] - final_df['mean_pos_sen'].mean()) / final_df['mean_pos_sen'].std(ddof=0) 
final_df['zscore_mean_neg_sen'] = (final_df['mean_neg_sen'] - final_df['mean_neg_sen'].mean()) / final_df['mean_neg_sen'].std(ddof=0)

final_df['zscore_median_pos_sen'] = (final_df['median_pos_sen'] - final_df['median_pos_sen'].mean()) / final_df['median_pos_sen'].std(ddof=0) 
final_df['zscore_median_neg_sen'] = (final_df['median_neg_sen'] - final_df['median_neg_sen'].mean()) / final_df['median_neg_sen'].std(ddof=0)

final_df['zscore_std_pos_sen'] = (final_df['std_pos_sen'] - final_df['std_pos_sen'].mean()) / final_df['std_pos_sen'].std(ddof=0) 
final_df['zscore_std_neg_sen'] = (final_df['std_neg_sen'] - final_df['std_neg_sen'].mean()) / final_df['std_neg_sen'].std(ddof=0)

final_df['zscore_skew_pos_sen'] = (final_df['skew_pos_sen'] - final_df['skew_pos_sen'].mean()) / final_df['skew_pos_sen'].std(ddof=0) 
final_df['zscore_skew_neg_sen'] = (final_df['skew_neg_sen'] - final_df['skew_neg_sen'].mean()) / final_df['skew_neg_sen'].std(ddof=0)

final_df['zscore_kurtosis_pos_sen'] = (final_df['kurtosis_pos_sen'] - final_df['kurtosis_pos_sen'].mean()) / final_df['kurtosis_pos_sen'].std(ddof=0) 
final_df['zscore_kurtosis_neg_sen'] = (final_df['kurtosis_neg_sen'] - final_df['kurtosis_neg_sen'].mean()) / final_df['kurtosis_neg_sen'].std(ddof=0)

final_df

Unnamed: 0,date,pos_likes_count,neg_likes_count,pos_retweets_count,neg_retweets_count,sum_pos_sen,sum_neg_sen,mean_pos_sen,mean_neg_sen,median_pos_sen,median_neg_sen,std_pos_sen,std_neg_sen,skew_pos_sen,skew_neg_sen,kurtosis_pos_sen,kurtosis_neg_sen,tweet_vol,DateTime,Price Lag 0,Price Lag 1,Price Lag 2,Price Lag 3,Price Lag 4,Price Lag 5,Price Lag 6,zscore_sum_pos_sen,zscore_sum_neg_sen,zscore_mean_pos_sen,zscore_mean_neg_sen,zscore_median_pos_sen,zscore_median_neg_sen,zscore_std_pos_sen,zscore_std_neg_sen,zscore_skew_pos_sen,zscore_skew_neg_sen,zscore_kurtosis_pos_sen,zscore_kurtosis_neg_sen
0,2021-05-19,851546,465392,156906,61359,1586.590648,814.883092,0.705779,0.698871,0.697125,0.687135,0.120695,0.129969,0.304487,0.301695,-0.807483,-0.966413,3414,2021-05-19 00:00:00,37421.38,42886.02,43552.14,46490.71,46775.29,49887.96,49700.6,1.08663,1.106759,-1.842068,1.186335,-1.560998,1.875622,-1.53328,0.76865,1.527753,-1.793045,1.60255,-1.048171
1,2021-05-18,585136,269283,112971,39374,978.543638,430.645039,0.725923,0.685741,0.723166,0.670079,0.129661,0.125301,0.158011,0.41873,-1.001836,-0.820806,1976,2021-05-18 00:00:00,42886.02,43552.14,46490.71,46775.29,49887.96,49700.6,49504.08,-0.005395,-0.180027,0.568211,0.008852,0.385497,-0.638724,0.164958,0.153561,-0.050158,0.188824,0.053149,-0.095032
2,2021-05-17,1147992,814476,189001,106938,1552.221224,813.396563,0.720288,0.688153,0.713019,0.672298,0.125332,0.127309,0.171279,0.401858,-0.994159,-0.90058,3337,2021-05-17 00:00:00,43552.14,46490.71,46775.29,49887.96,49700.6,49504.08,56747.52,1.024904,1.10178,-0.105954,0.225179,-0.372975,-0.311514,-0.654918,0.418091,0.092773,-0.096882,0.11435,-0.617233
3,2021-05-16,522563,332178,119287,41952,687.413068,326.498656,0.723593,0.690272,0.718551,0.674827,0.132703,0.128817,0.168048,0.435201,-1.037772,-0.82312,1423,2021-05-16 00:00:00,46490.71,46775.29,49887.96,49700.6,49504.08,56747.52,55870.01,-0.528253,-0.528805,0.289433,0.415229,0.040523,0.06117,0.741273,0.616821,0.05797,0.467752,-0.233337,-0.110176
4,2021-05-15,85233,28983,16647,4403,102.970406,36.584523,0.730287,0.665173,0.738183,0.667719,0.135558,0.109282,0.011508,0.480412,-1.201264,-0.520526,196,2021-05-15 00:00:00,46775.29,49887.96,49700.6,49504.08,56747.52,55870.01,58294.18,-1.577886,-1.499707,1.090378,-1.835595,1.507954,-0.986555,1.281967,-1.957122,-1.628339,1.233352,-1.536713,1.870612


In [41]:
final_df = final_df[['Price Lag 0', 'Price Lag 1', 'Price Lag 2', 'Price Lag 3', 
                   'Price Lag 4', 'Price Lag 5', 'Price Lag 6', 'zscore_sum_pos_sen',
                   'zscore_sum_neg_sen', 'tweet_vol', 'pos_likes_count',
               'pos_retweets_count', 'neg_likes_count', 'neg_retweets_count']]

final_df

Unnamed: 0,Price Lag 0,Price Lag 1,Price Lag 2,Price Lag 3,Price Lag 4,Price Lag 5,Price Lag 6,zscore_sum_pos_sen,zscore_sum_neg_sen,tweet_vol,pos_likes_count,pos_retweets_count,neg_likes_count,neg_retweets_count
0,37421.38,42886.02,43552.14,46490.71,46775.29,49887.96,49700.6,1.08663,1.106759,3414,851546,156906,465392,61359
1,42886.02,43552.14,46490.71,46775.29,49887.96,49700.6,49504.08,-0.005395,-0.180027,1976,585136,112971,269283,39374
2,43552.14,46490.71,46775.29,49887.96,49700.6,49504.08,56747.52,1.024904,1.10178,3337,1147992,189001,814476,106938
3,46490.71,46775.29,49887.96,49700.6,49504.08,56747.52,55870.01,-0.528253,-0.528805,1423,522563,119287,332178,41952
4,46775.29,49887.96,49700.6,49504.08,56747.52,55870.01,58294.18,-1.577886,-1.499707,196,85233,16647,28983,4403


In [43]:
final_df.to_csv(os.path.join(BASE_PATH, 'final_dataset_time_series.csv'), index=False)

#Time series Prediction

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import tensorflow as tf
from keras.preprocessing.sequence import TimeseriesGenerator

In [48]:
df = pd.read_csv(os.path.join(BASE_PATH, 'final_dataset_time_series.csv'))

input_df = df[::-1].reset_index(drop=True)
# input_df

Unnamed: 0,Price Lag 0,Price Lag 1,Price Lag 2,Price Lag 3,Price Lag 4,Price Lag 5,Price Lag 6,zscore_sum_pos_sen,zscore_sum_neg_sen,tweet_vol,pos_likes_count,pos_retweets_count,neg_likes_count,neg_retweets_count
0,46775.29,49887.96,49700.6,49504.08,56747.52,55870.01,58294.18,-1.577886,-1.499707,196,85233,16647,28983,4403
1,46490.71,46775.29,49887.96,49700.6,49504.08,56747.52,55870.01,-0.528253,-0.528805,1423,522563,119287,332178,41952
2,43552.14,46490.71,46775.29,49887.96,49700.6,49504.08,56747.52,1.024904,1.10178,3337,1147992,189001,814476,106938
3,42886.02,43552.14,46490.71,46775.29,49887.96,49700.6,49504.08,-0.005395,-0.180027,1976,585136,112971,269283,39374
4,37421.38,42886.02,43552.14,46490.71,46775.29,49887.96,49700.6,1.08663,1.106759,3414,851546,156906,465392,61359


In [49]:
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(input_df)

# scaled_data

array([[1.        , 1.        , 0.97042845, 0.88700272, 1.        ,
        0.87885452, 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.96957636, 0.55545606, 1.        , 0.94484951, 0.2736389 ,
        1.        , 0.72421588, 0.39393009, 0.37249732, 0.38129273,
        0.4115044 , 0.59551853, 0.38599326, 0.36620666],
       [0.65542217, 0.51481304, 0.50871868, 1.        , 0.29334562,
        0.        , 0.82404523, 0.97683408, 0.99809003, 0.97607209,
        1.        , 1.        , 1.        , 1.        ],
       [0.58420917, 0.09513363, 0.46380263, 0.08376775, 0.3121338 ,
        0.02713076, 0.        , 0.59015994, 0.50631021, 0.5531386 ,
        0.47038228, 0.55887302, 0.30592252, 0.34106403],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.05299692, 0.02235697, 1.        , 1.        , 1.        ,
        0.72106   , 0.81378442, 0.55558611, 0.55547862]])

In [50]:
features = scaled_data
target = scaled_data[:, 0]

# target

array([1.        , 0.96957636, 0.65542217, 0.58420917, 0.        ])

In [51]:
model = tf.keras.models.load_model(os.path.join(BASE_PATH, 'model.h5'))

model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_28 (LSTM)               (None, 3, 256)            277504    
_________________________________________________________________
leaky_re_lu_19 (LeakyReLU)   (None, 3, 256)            0         
_________________________________________________________________
lstm_29 (LSTM)               (None, 3, 128)            197120    
_________________________________________________________________
leaky_re_lu_20 (LeakyReLU)   (None, 3, 128)            0         
_________________________________________________________________
dropout_18 (Dropout)         (None, 3, 128)            0         
_________________________________________________________________
lstm_30 (LSTM)               (None, 64)                49408     
_________________________________________________________________
dropout_19 (Dropout)         (None, 64)               

In [52]:
all_generator = TimeseriesGenerator(
    features, target, length=3, sampling_rate=1, 
)

In [53]:
# all_generator[0][1]

array([0.58420917, 0.        ])

In [65]:
predictions = model.predict(all_generator)

predictions

array([[0.8101212],
       [0.698583 ]], dtype=float32)

In [67]:
df_pred = pd.concat(
    [pd.DataFrame(predictions), 
    pd.DataFrame(features[:, 1:][3:])], axis=1)

df_pred

Unnamed: 0,0,0.1,1,2,3,4,5,6,7,8,9,10,11,12
0,0.810121,0.095134,0.463803,0.083768,0.312134,0.027131,0.0,0.59016,0.50631,0.553139,0.470382,0.558873,0.305923,0.341064
1,0.698583,0.0,0.0,0.0,0.0,0.052997,0.022357,1.0,1.0,1.0,0.72106,0.813784,0.555586,0.555479


In [68]:
rev_trans = scaler.inverse_transform(df_pred)

rev_trans

array([[ 4.49991806e+04,  4.35521400e+04,  4.64907100e+04,
         4.67752900e+04,  4.98879600e+04,  4.97006000e+04,
         4.95040800e+04, -5.39533667e-03, -1.80026740e-01,
         1.97600000e+03,  5.85136000e+05,  1.12971000e+05,
         2.69283000e+05,  3.93740000e+04],
       [ 4.39558626e+04,  4.28860200e+04,  4.35521400e+04,
         4.64907100e+04,  4.67752900e+04,  4.98879600e+04,
         4.97006000e+04,  1.08663033e+00,  1.10675863e+00,
         3.41400000e+03,  8.51546000e+05,  1.56906000e+05,
         4.65392000e+05,  6.13590000e+04]])

In [79]:
df_final= input_df[predictions.shape[0]*-1:]
df_final['prediction'] = rev_trans[:, 0]

df_final[['prediction', 'Price Lag 0']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,prediction,Price Lag 0
3,44999.180594,42886.02
4,43955.862574,37421.38
