In [33]:
import numpy as np
import pandas as pd
import datetime


In [34]:
# Per-exchange ticker symbols, keyed by exchange name and then by a generic
# asset name. 'dollar' is the quote-currency symbol used by that exchange.
exchanges = {
    'binance': {
        'bitcoin': 'BTC', 'ethereum': 'ETH', 'litecoin': 'LTC',
        'solana': 'SOL', 'ripple': 'XRP', 'dollar': 'USDT',
    },
    # The 'ripple' entry is deliberately left out for coinbase
    # (it was disabled in the original mapping).
    'coinbase': {
        'bitcoin': 'BTC', 'ethereum': 'ETH', 'litecoin': 'LTC',
        'solana': 'SOL', 'dollar': 'USD',
    },
    'ftx': {
        'bitcoin': 'BTC', 'ethereum': 'ETH', 'litecoin': 'LTC',
        'solana': 'SOL', 'ripple': 'XRP', 'dollar': 'USD',
    },
    # bitmex and kraken use the 'XBT' symbol for bitcoin.
    'bitmex': {
        'bitcoin': 'XBT', 'ethereum': 'ETH', 'litecoin': 'LTC',
        'solana': 'SOL', 'ripple': 'XRP', 'dollar': 'USD',
    },
    'kraken': {
        'bitcoin': 'XBT', 'ethereum': 'ETH', 'litecoin': 'LTC',
        'solana': 'SOL', 'ripple': 'XRP', 'dollar': 'USD',
    },
}

In [35]:
# Study window: `days` consecutive calendar days starting at `date`,
# rendered as 'YYYY-MM-DD' strings (used to build the data file names).
days = 6
date = datetime.datetime(2022, 2, 28)
dates = []
for day_offset in range(days):
    current_day = date + datetime.timedelta(days=day_offset)
    dates.append(current_day.strftime('%Y-%m-%d'))

## Load Data

### We use Binance order-book data to measure price performance (returns)

In [36]:
exch = 'binance'
asset = 'ethereum'

# Resampling frequency applied to the raw order-book snapshots.
fq = '10min'

# Load one order-book snapshot file per day, keep the first snapshot of every
# `fq` bucket, and concatenate all days into a single frame.
path_prefix = 'tardis_raw.nosync/' + exch + '/' + exch + '_book_snapshot_5_'
base_symbol = exchanges[exch][asset]
quote_symbol = exchanges[exch]['dollar']

daily_frames = []
for date_str in dates:
    try:
        path = path_prefix + date_str + '_' + base_symbol + quote_symbol + '.csv.gz'
        ob = pd.read_csv(path, compression='gzip')
    except FileNotFoundError as e:
        # No file for the exchange's own quote symbol: fall back to the USDT pair.
        print(e)
        path = path_prefix + date_str + '_' + base_symbol + 'USDT' + '.csv.gz'
        ob = pd.read_csv(path, compression='gzip')

    # sort_values returns a new frame; the result must be kept, otherwise
    # .first() below depends on the row order of the file.
    ob = ob.sort_values(by='timestamp', kind='stable')
    ob['datetime'] = pd.to_datetime(ob['timestamp'], unit='us')

    # Resample on a datetime index while keeping 'datetime' as a regular column:
    # later cells read ob_fq['datetime'], and resample(on=...) does not reliably
    # return the key column across pandas versions.
    resampled = ob.set_index('datetime', drop=False).resample(fq, label='right').first()
    daily_frames.append(resampled)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame on every iteration.
ob_fq = pd.concat(daily_frames) if daily_frames else pd.DataFrame()



In [37]:
# Compute returns from the mid price (average of best ask and best bid).
ob_fq['mid_price'] = (ob_fq['asks[0].price'] + ob_fq['bids[0].price']) / 2
ob_fq['returns'] = ob_fq['mid_price'].pct_change()

# Binary target: 1 for a strictly positive return, 0 otherwise.
# Vectorised comparison instead of a per-element Python lambda.
# Note: NaN returns (e.g. the very first row) compare False and are labelled 0.
ob_fq['binary_returns'] = ob_fq['returns'].gt(0).astype('int64')

## Load Tweets

In [38]:
# Read the pre-processed tweets, parse their timestamp into a 'datetime'
# column and order the rows chronologically.
tweets = (
    pd.read_csv('tweets_stemmed_hashtags_no_stopwords.csv')
    .assign(datetime=lambda raw: pd.to_datetime(raw['timestamp'], infer_datetime_format=True))
    .sort_values(by='datetime')
)

## Resample tweets with time frequency

In [39]:
# For every order-book period (bar i, bar i+1], join the text of all tweets
# posted in that window into one document, and stamp it with the datetime of a
# later bar so it can be merged with that bar's return.
bar_times = ob_fq['datetime']
n_bars = len(bar_times)

period_rows = []
for i in range(n_bars - 1):
    # get all tweets between 2 timestamps
    mask = (tweets['datetime'] > bar_times.iloc[i]) & (tweets['datetime'] <= bar_times.iloc[i + 1])
    tweets_filtered = tweets[mask]
    if len(tweets_filtered) > 0:
        # Do the tweets of this period help predict the performance of the NEXT
        # period? -> label them with bar i+2. When bar i+2 does not exist (last
        # period) fall back to bar i+1; an explicit clamp replaces the former
        # bare `except:`, which would also have hidden unrelated errors.
        # NOTE(review): for the last period this label equals the one given to
        # the previous period, so 'datetime' can appear twice in tweets_fq.
        dt = bar_times.iloc[min(i + 2, n_bars - 1)]
        tw = " ".join(tweets_filtered['tweet_stemmer_hashtags_no_stopwords'])
        period_rows.append([dt, tw])

tweets_fq = pd.DataFrame(period_rows, columns=['datetime', 'tweets'])



## Train Test split

In [40]:
from sklearn.model_selection import train_test_split

# Discard the existing indexes (in place) before joining tweets and order-book
# rows on their shared 'datetime' column.
for frame in (tweets_fq, ob_fq):
    frame.reset_index(drop=True, inplace=True)
full_data = tweets_fq.merge(ob_fq, on='datetime')

# Target is the binary return; every other column stays on the feature side.
target = full_data["binary_returns"]
predictors = full_data.drop(columns=['binary_returns'])
df_train, df_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.30,
    random_state=203,
)
n_train = len(y_train)
n_test = len(y_test)

## Vectorizing

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training tweets only, then encode those same
# tweets as a term-count matrix.
train_corpus = df_train["tweets"]
vectorizer = CountVectorizer()
vect = vectorizer.fit(train_corpus)
x_train = vect.transform(train_corpus)

## Logistic Regression

In [42]:
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression

# Logistic regression without a penalty term, fitted on the training set.
# NOTE(review): newer scikit-learn releases expect penalty=None instead of the
# string "none" -- confirm against the installed version.
clf = LogisticRegression(penalty="none")
model = clf.fit(X=x_train, y=y_train)

# In-sample predictions, plus out-of-fold predictions from 5-fold
# cross-validation over the same training data.
pred_train = model.predict(X=x_train)
pred_val = cross_val_predict(estimator=clf, X=x_train, y=y_train, cv=5)

## Accuracy

In [43]:
from sklearn.metrics import accuracy_score, auc


def _report_accuracy(header, y_true, y_pred):
    """Print a header, the confusion table and the accuracy; return (accuracy, table)."""
    print(header)
    accuracy = accuracy_score(y_true, y_pred)
    table = pd.crosstab(y_true, y_pred)
    print(table)
    print("Accuracy = %s" % accuracy)
    return accuracy, table


# In-sample fit versus out-of-fold (cross-validated) predictions.
train_acc, cross_train = _report_accuracy("--- train dataset results :", y_train, pred_train)
val_acc, cross_val = _report_accuracy("\n--- validation set results :", y_train, pred_val)

--- train dataset results :
col_0            0   1
binary_returns        
0               42   0
1                0  25
Accuracy = 1.0

--- validation set results :
col_0            0   1
binary_returns        
0               31  11
1               18   7
Accuracy = 0.5671641791044776


## Top and Flop words

In [44]:
def get_coeffs(model, vect):
    """
    Returns words / n_grams weights in ascending order of weight.
    (param) model : trained scikit learn model exposing a 2-D ``coef_``;
                    only its first row is used.
    (param) vect : used count vectorizer (already fitted).
    return: DataFrame with columns 'word' and 'coefficient', sorted by
            ascending coefficient (ties broken by word).
    """
    # get_feature_names() was removed from scikit-learn vectorizers in favour
    # of get_feature_names_out(); support whichever one is available.
    if hasattr(vect, "get_feature_names_out"):
        words = list(vect.get_feature_names_out())
    else:
        words = list(vect.get_feature_names())
    coeffs = model.coef_.tolist()[0]
    coeff_df = pd.DataFrame({'word' : words,
                        'coefficient' : coeffs})
    # Sort by weight first, as documented; sorting by 'word' first would
    # return a purely alphabetical listing.
    coeff_df = coeff_df.sort_values(['coefficient', 'word'])
    return coeff_df

# Rank the vocabulary from the most positive to the most negative weight and
# list the 40 words at each end of that ranking.
df_coeff = get_coeffs(model=model, vect=vect)
df_sort = df_coeff.sort_values(by=["coefficient", "word"], ascending=False)
ranked_words = df_sort["word"]
top_40_words = ranked_words.head(40).tolist()
flop_40_words = ranked_words.tail(40).tolist()
for word_list in (top_40_words, flop_40_words):
    print(word_list)

['rbxsamurai', 'eth', 'blockchain', 'cryptocurrency', 'btc', 'tax', 'space', 'web3', 'cryptocurr', 'rbxs', 'xrp', 'rbx', 'gem', 'kibakrew', 'altseason', 'everrisev3', 'everriseedu', 'vektor', 'join', 'nft', 'bullish', 'digitalart', 'one', 'btt', 'matic', 'twitter', 'defi', 'take', 'metavers', 'gemanaliz', 'shibnobi_dojoswap', 'ama', 'market', 'kiba', 'nftdrop', 'cryptonews', 'blocvault', 'see', 'metaverse', 'saintpatricksday']
['mri', 'f1', 'cro', 'whale', 'stock', 'platform', 'guy', 'messag', 'pitbull', 'way', 'think', 'check', 'staking', 'exchang', 'safemoon', 'forex', 'kishu', 'ripple', 'ani', 'prepar', 'shib', 'year', 'kasta', 'thi', 'binanc', 'biggest', 'altcoin', 'get', 'good', 'best', 'nftartist', 'pump', 'coin', 'shibnobi', 'saitama', 'bitcoin', 'trade', 'shinja', 'crypto', 'floki']


