In [4]:
import psycopg2
import matplotlib
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import re

In [5]:
# Let pandas render up to 200 rows before truncating displayed frames.
pd.options.display.max_rows = 200

In [18]:
# Load the per-(url, strategy, symbol) performance rows and the news articles.
raw_perf_df = pd.read_csv('../data/performances.csv')
news_df = pd.read_csv('../data/news.csv')

# Attach the news columns to every performance row sharing the same url.
# DataFrame.join defaults to a left join, so performance rows without a
# matching news url are kept (their news columns are NaN).
perf_df = raw_perf_df.join(news_df.set_index('url'), on='url')
perf_df.head(10)

                                                 url strategy     symbol  \
0  https://binance.zendesk.com/hc/en-us/articles/...  highest    BNBUSDT   
1  https://binance.zendesk.com/hc/en-us/articles/...  highest    ONEUSDT   
2  https://binance.zendesk.com/hc/en-us/articles/...  highest  THETAUSDT   
3  https://binance.zendesk.com/hc/en-us/articles/...  highest    LSKUSDT   
4  https://binance.zendesk.com/hc/en-us/articles/...  highest    XMRUSDT   

   performance           extractor  
0     0.506597             onlyBnb  
1     0.232900  relatedAgainstUsdt  
2     0.438600  relatedAgainstUsdt  
3     0.165701  relatedAgainstUsdt  
4    -0.004999  relatedAgainstUsdt  
    id                                              title  \
0  324             Notice of Temporary System Maintenance   
1  325  Binance Futures Will Launch QTUM/USDT Perpetua...   
2  326  Binance Adds Additional Trading Pairs for STRA...   
3  327                 IOTX Competition Has Now Concluded   
4  328  Binance 

Unnamed: 0,url,strategy,symbol,performance,extractor,id,title,time,content
0,https://binance.zendesk.com/hc/en-us/articles/...,highest,BNBUSDT,0.506597,onlyBnb,142,Binance Lists Second BEP2 Community Listing Pr...,2019-08-23 13:41:05+00,"Fellow Binancians,\nFor the second month of th..."
1,https://binance.zendesk.com/hc/en-us/articles/...,highest,ONEUSDT,0.2329,relatedAgainstUsdt,250,Binance Has Distributed the Second Batch of St...,2020-04-20 11:29:57+00,"Fellow Binancians,\nBinance has completed the ..."
2,https://binance.zendesk.com/hc/en-us/articles/...,highest,THETAUSDT,0.4386,relatedAgainstUsdt,250,Binance Has Distributed the Second Batch of St...,2020-04-20 11:29:57+00,"Fellow Binancians,\nBinance has completed the ..."
3,https://binance.zendesk.com/hc/en-us/articles/...,highest,LSKUSDT,0.165701,relatedAgainstUsdt,250,Binance Has Distributed the Second Batch of St...,2020-04-20 11:29:57+00,"Fellow Binancians,\nBinance has completed the ..."
4,https://binance.zendesk.com/hc/en-us/articles/...,highest,XMRUSDT,-0.004999,relatedAgainstUsdt,258,Binance Savings Adds IOST and XMR to Flexible ...,2020-04-16 09:26:56+00,"Fellow Binancians,\nBinance Savings has added ..."
5,https://binance.zendesk.com/hc/en-us/articles/...,highest,BTCUSDT,4.33717,relatedAgainstUsdt,259,Binance Will List Rupiah Token (IDRT),2020-04-16 06:09:44+00,"Fellow Binancians,\nBinance will list Rupiah T..."
6,https://binance.zendesk.com/hc/en-us/articles/...,highest,BUSDUSDT,-0.179978,relatedAgainstUsdt,259,Binance Will List Rupiah Token (IDRT),2020-04-16 06:09:44+00,"Fellow Binancians,\nBinance will list Rupiah T..."
7,https://binance.zendesk.com/hc/en-us/articles/...,highest,BNBUSDT,6.273633,relatedAgainstUsdt,210,Introducing the Cartesi (CTSI) Token Sale on B...,2020-04-13 09:53:13+00,"Fellow Binancians,\nBinance is excited to anno..."
8,https://binance.zendesk.com/hc/en-us/articles/...,highest,TROYUSDT,-0.2,relatedAgainstUsdt,250,Binance Has Distributed the Second Batch of St...,2020-04-20 11:29:57+00,"Fellow Binancians,\nBinance has completed the ..."
9,https://binance.zendesk.com/hc/en-us/articles/...,highest,CTSIUSDT,,relatedAgainstUsdt,213,"Play Cartesi’s Creepts Game to Enter a $50,000...",2020-04-16 01:56:22+00,With Cartesi being announced as the next proje...


In [19]:
# Words dropped from titles because they carry no signal on their own.
# The empty string is kept in the list for callers that split on single spaces.
meaningless_words = ['to', 'and', 'will', 'of', 'the', 'for', 'on', '', 'by', 'a', 'an', 'as', 'in', 'be', 'has', 'have']


def format_title(title):
    """Return the distinct meaningful words of a title.

    The title is lower-cased, every run of characters other than ASCII
    letters, digits and spaces is replaced by a space, and the result is
    split into words. Duplicates and entries of ``meaningless_words`` are
    removed.

    Args:
        title: The title string to tokenize.

    Returns:
        A list of unique words, in order of first appearance in the title.
    """
    clean_title = re.sub('[^A-Za-z0-9 ]+', ' ', title.lower())
    # split() without an argument collapses runs of spaces, so no empty
    # tokens are produced and no separate whitespace clean-up is needed.
    words = clean_title.split()
    # dict.fromkeys de-duplicates while keeping first-seen order, which makes
    # the result reproducible between runs (set ordering of strings is not).
    uniq_words = dict.fromkeys(words)
    return [word for word in uniq_words if word not in meaningless_words]

In [20]:
def mean_5_bests(series):
    """Return the average of the 5 largest elements of ``series``.

    Args:
        series: A sized iterable of numbers (e.g. a list or a pandas Series).

    Returns:
        The mean of the five largest values, the mean of all values when
        there are fewer than five, or ``None`` when ``series`` is empty.
    """
    if len(series) == 0:
        # Python has no `null`; None marks "no data to average".
        return None
    bests = sorted(series)[-5:]
    return sum(bests) / len(bests)


assert mean_5_bests([5, 3, 4, 2, 1, 3, 3, 3, 3]) == (5+4+3+3+3)/5
assert mean_5_bests([]) is None

In [21]:
# New frame: perf_df plus a 'words' column holding each title's word list.
word_df = perf_df.assign(words=perf_df['title'].apply(format_title))

In [22]:
# One row per title word, then aggregate per (word, extractor, strategy, symbol):
# how many rows carried the word and their mean performance, worst first.
words_exploded_df = (
    word_df
    .explode('words')
    .groupby(['words', 'extractor', 'strategy', 'symbol'])
    .agg(
        count=('words', 'count'),
        computed_perf=('performance', 'mean'),
    )
    .sort_values(by=['computed_perf'], ascending=True)
)

In [23]:
# words_exploded_df is sorted ascending by computed_perf, so this shows the
# 10 rows with the lowest mean performance.
words_exploded_df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,computed_perf
words,extractor,strategy,symbol,Unnamed: 4_level_1,Unnamed: 5_level_1
trading,relatedAgainstUsdt,follower10,DENTUSDT,2,-11.803651
pairs,relatedAgainstUsdt,follower10,DENTUSDT,2,-11.803651
removal,relatedAgainstUsdt,follower10,DENTUSDT,1,-11.803651
notice,relatedAgainstUsdt,follower10,DENTUSDT,1,-11.803651
notice,relatedAgainstUsdt,follower10,KEYUSDT,1,-11.734025
03,relatedAgainstUsdt,follower10,KEYUSDT,1,-11.734025
removal,relatedAgainstUsdt,follower10,KEYUSDT,1,-11.734025
04,relatedAgainstUsdt,follower10,KEYUSDT,1,-11.734025
trading,relatedAgainstUsdt,follower10,KEYUSDT,2,-11.734025
2020,relatedAgainstUsdt,follower10,KEYUSDT,1,-11.734025


In [20]:
# Turn the group keys back into ordinary columns so they can be filtered on.
array = words_exploded_df.reset_index()

# Row masks: 'charly_' strategies, the relatedAgainstUsdt extractor, and
# words seen on more than two rows.
strategyFilter = array['strategy'].str.startswith('charly_')
extractorFilter = array['extractor'].eq('relatedAgainstUsdt')
countFilter = array['count'].gt(2)

# Re-aggregate the surviving rows on the same keys and rank best first.
filtered_array = (
    array.loc[strategyFilter & extractorFilter & countFilter]
    .groupby(['words', 'extractor', 'strategy', 'symbol'])
    .agg(
        count=('count', 'sum'),
        computed_perf=('computed_perf', 'mean'),
    )
    .sort_values(by=['computed_perf'], ascending=False)
)
filtered_array.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,computed_perf
words,extractor,strategy,symbol,Unnamed: 4_level_1,Unnamed: 5_level_1
binance,relatedAgainstUsdt,charly_S_30W15L5,TFUELUSDT,3,4.498974
batch,relatedAgainstUsdt,charly_S_10W15L5,TFUELUSDT,3,3.642558
rewards,relatedAgainstUsdt,charly_S_10W15L5,TFUELUSDT,3,3.642558
staking,relatedAgainstUsdt,charly_S_10W15L5,TFUELUSDT,3,3.642558
2020,relatedAgainstUsdt,charly_S_10W15L5,TFUELUSDT,3,3.642558
distributed,relatedAgainstUsdt,charly_S_10W15L5,TFUELUSDT,3,3.642558
second,relatedAgainstUsdt,charly_S_10W15L5,TFUELUSDT,3,3.642558
binance,relatedAgainstUsdt,charly_S_10W15L5,TFUELUSDT,4,3.493854
batch,relatedAgainstUsdt,charly_S_10W15L5,THETAUSDT,3,3.482438
second,relatedAgainstUsdt,charly_S_10W15L5,THETAUSDT,3,3.482438


In [76]:
#filtered_array.plot(x='words', y='computed_perf')

In [15]:
hot_words = ['listing', 'list', 'trading']
# True for rows whose cleaned title contains at least one hot word.
wordFilter = perf_df['title'].apply(
    lambda title: not set(hot_words).isdisjoint(format_title(title))
)

# NOTE(review): strategy names shown in this notebook's earlier output look
# like 'charly_S_30W15L5' (underscore after the S). The saved output of this
# cell has count 0.0, possibly because this literal matches no strategy.
# Confirm the intended name.
strategyFilter = perf_df['strategy'].eq('charly_S30W5L5')
extractorFilter = perf_df['extractor'].eq('relatedAgainstUsdt')

perf_ser = perf_df.loc[strategyFilter & extractorFilter & wordFilter, 'performance']
print(perf_ser.describe())
# Treat each performance as a percentage and compound them into one factor.
(1 + perf_ser / 100).prod()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: performance, dtype: float64


1.0