# Тест соединения с базой

In [189]:
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import metrics
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; the same helpers live in sklearn.model_selection.  Fall back to the
# old module only on installations that predate model_selection.
try:
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import cross_val_predict
except ImportError:
    from sklearn.cross_validation import cross_val_score
    from sklearn.cross_validation import cross_val_predict
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
from alex.database import Database

In [190]:
import os

# Database URL for SQLAlchemy.  It can be overridden through the
# ARBITRON_DB_URL environment variable so that credentials do not have to be
# edited in (or committed with) the notebook.
# NOTE(review): the fallback below still embeds a user name and password in
# source; move it to the environment / a secrets store and rotate the password.
connection_string = os.environ.get(
    "ARBITRON_DB_URL",
    "mssql+pymssql://Alex:cawa1728@10.7.0.19/Arbitron",
)
engine = create_engine(connection_string)
# Module-level connection used by query().
connection = engine.connect()

In [191]:
def query(sql):
    """
    Run *sql* over the module-level ``connection`` and return the result as a
    pandas DataFrame.

    Any exception raised while running the query is printed instead of being
    propagated; in that case the function returns None.
    """
    try:
        # execute the SQL query and load the result into a pandas DataFrame
        result = pd.read_sql_query(sql, connection)
    except Exception as err:
        print(err)
        return None
    else:
        return result

In [192]:
# Market under study: exchange name and trading pair interpolated into the SQL filters.
exchange, pair = "binance", "ETH/USDT"

## Order book

In [193]:
#sql_orderbook = f"select * from v_order_book with (snapshot) where exchange='{exchange}' and pair='{pair}'"
#df_orderbook = query(sql_orderbook)

In [194]:
# Project-local database helper (imported from alex.database); used below for
# the order-book query.
db = Database()

In [195]:
# Order-book snapshots for the selected exchange/pair, fetched through the
# project Database helper.
# NOTE(review): 1440 is presumably a look-back window in minutes (24 h) —
# confirm against the definition of tvf_get_order_book_snapshots.
sql_orderbook = (
    "select * from tvf_get_order_book_snapshots(1440) "
    f"where exchange='{exchange}' and pair='{pair}'"
)
df_orderbook = db.query(sql_orderbook)

In [196]:
# Preview the latest snapshot rows.  sort_values() returns a new frame that is
# not assigned, so df_orderbook itself keeps its original row order.
df_orderbook.sort_values('dt').tail()

Unnamed: 0,dt,exchange,pair,bid_ask,price,amount,volume
61800,2018-09-30 02:40:19,binance,ETH/USDT,bid,229.559998,0.86969,199.646042
61799,2018-09-30 02:40:19,binance,ETH/USDT,bid,231.009995,40.78178,9420.999023
61798,2018-09-30 02:40:19,binance,ETH/USDT,bid,229.300003,8.74042,2004.178467
61808,2018-09-30 02:40:19,binance,ETH/USDT,bid,229.630005,0.09473,21.75285
61733,2018-09-30 02:40:19,binance,ETH/USDT,ask,235.429993,4.10987,967.58667


## History

In [197]:
# Trade history for the selected exchange/pair, restricted to rows after 2018-09-17.
sql_history = (
    "select * from v_history with (snapshot) "
    f"where exchange='{exchange}' and pair='{pair}' "
    "and dt>'2018-09-17'"
)

In [198]:
# Load the trade history.  query() prints the error and returns None when the
# query fails, so check the cell output before running the cells below.
df_history = query(sql_history)

In [199]:
# Index trades by timestamp so the frame can be resampled and sliced by time.
df_history.set_index('dt', inplace=True)

In [200]:
# Group trades into 1-minute bins ('T' is the minute frequency alias, not
# hourly) and take the mean trade price per bin.
df = df_history[['price']].resample('T').mean()

In [201]:
# Index order-book rows by timestamp so the frame can be resampled and sliced by time.
df_orderbook.set_index('dt', inplace=True)

In [202]:
# Per-minute totals of order-book amount and volume.  No bid/ask filter is
# applied, so both sides of the book are summed together.
df_order_grouped = df_orderbook[['amount', 'volume']].resample('T').sum()
# Amount-weighted average order-book price for each minute.
df_order_grouped['price_orderbook'] = (
    df_order_grouped['volume'] / df_order_grouped['amount']
)

In [203]:
# Drop rows containing NaN from both per-minute frames, in place.
# NOTE(review): presumably these are the minutes with no source rows (mean of
# an empty bin, or 0/0 for price_orderbook) — confirm.
df_order_grouped.dropna(inplace=True)
df.dropna(inplace=True)

In [204]:
# Join the per-minute trade prices with the per-minute order-book aggregates
# on 'dt' (merge defaults to an inner join, so only minutes present in both
# frames are kept).
# NOTE(review): the two frames share no column names, so `suffixes` has no
# effect here — the trade price column stays 'price', not 'price_history'.
df_merged = df.merge(df_order_grouped, on='dt', suffixes=('_history', '_orderbook'))
#df_merged['price_orderbook'] = 

In [205]:
df_merged.head()

Unnamed: 0_level_0,price,amount,volume,price_orderbook
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-09-29 03:24:00,219.571359,2286.503137,500993.3,219.108966
2018-09-29 06:11:00,216.780394,4498.562605,972097.1,216.0906
2018-09-29 06:31:00,216.228906,8958.765456,1933742.0,215.849203
2018-09-29 06:58:00,216.904489,3757.690953,812599.5,216.249682
2018-09-29 08:34:00,217.333154,3439.34093,745338.0,216.709532


In [207]:
# Add row-over-row percentage changes of the trade price and of the
# order-book price (the rolling means are added in a later cell).
df_merged['price_history_pct'] = df_merged.price.pct_change()
df_merged['price_orderbook_pct'] = df_merged.price_orderbook.pct_change()
df_merged.head()

Unnamed: 0_level_0,price,amount,volume,price_orderbook,price_history_pct,price_orderbook_pct
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-09-29 03:24:00,219.571359,2286.503137,500993.3,219.108966,,
2018-09-29 06:11:00,216.780394,4498.562605,972097.1,216.0906,-0.012711,-0.013776
2018-09-29 06:31:00,216.228906,8958.765456,1933742.0,215.849203,-0.002544,-0.001117
2018-09-29 06:58:00,216.904489,3757.690953,812599.5,216.249682,0.003124,0.001855
2018-09-29 08:34:00,217.333154,3439.34093,745338.0,216.709532,0.001976,0.002126


In [208]:
# Forward-fill NaNs with the last valid observation.  DataFrame.ffill() is the
# supported spelling; fillna(method='ffill') is deprecated in recent pandas.
# The first row's pct-change NaNs have no earlier value and remain NaN.
df_merged = df_merged.ffill()

In [211]:
# Add a binary column: 1 if the price is above the previous row's price, else 0.
df_merged['isup'] = np.where(df_merged.price > df_merged.price.shift(1), 1, 0)

# Add rolling means.
# NOTE(review): `window` counts rows, not minutes; df_merged only holds the
# minutes present in both sources, so a window can span more than 10 minutes.
# NOTE(review): the original comment described window=11 as "1 minute
# earlier"; a wider window does not shift the series — confirm the intent
# (possibly shift(1) was meant).
df_merged['price_ma_hist'] = df_merged.price.rolling(window=10).mean() # trade-history price: mean over the last 10 rows
df_merged['price_ma_ord'] = df_merged.price_orderbook.rolling(window=11).mean() # order-book price: mean over the last 11 rows

In [212]:
# Drop the leading rows that still contain NaN (rolling-mean warm-up and the
# first pct-change row), then preview the result.
df_merged.dropna(inplace=True)
df_merged.head()

Unnamed: 0_level_0,price,amount,volume,price_orderbook,price_history_pct,price_orderbook_pct,isup,price_ma_hist,price_ma_ord
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-09-29 10:58:00,219.982219,6354.725779,1391515.0,218.973266,0.010949,0.009542,1,217.313064,216.792457
2018-09-29 11:01:00,223.201661,8191.498607,1820521.0,222.245158,0.014635,0.014942,1,217.95519,217.077565
2018-09-29 11:08:00,224.199608,14020.4411,3136686.0,223.722381,0.004471,0.006647,1,218.752261,217.771363
2018-09-29 11:22:00,226.051012,3852.24747,870394.5,225.944595,0.008258,0.009933,1,219.666913,218.689126
2018-09-29 11:26:00,226.181034,4129.773055,933350.7,226.005314,0.000575,0.000269,1,220.551701,219.576002


In [None]:
# Plot the trade price against both rolling means for 2018-09-29 11:00-22:00.
# The merged frame names the trade price column 'price' (the merge suffixes
# were never applied because no column names clashed), so selecting
# 'price_history' raised KeyError.  Rows are selected with .loc, the
# documented label-based slicer.
df_merged.loc[
    '2018-09-29 11:00':'2018-09-29 22:00',
    ['price', 'price_ma_hist', 'price_ma_ord'],
].plot(figsize=(16, 7), grid=True)

### Логистическая регрессия. Независ.переменные - скользящие цены

In [None]:
# Independent variables (features) for the first model: both rolling means.
feature_cols = [
    'price_ma_hist',
    'price_ma_ord',
]

In [None]:
# Feature matrix for the first model; display its (rows, columns) shape.
X = df_merged[feature_cols]
X.shape

In [None]:
# Dependent variable (target): the binary 'isup' column.
y = df_merged.isup
y.shape

In [None]:
# Create the logistic-regression estimator (default hyper-parameters).
logreg = LogisticRegression()
# Fitting on the full data set is disabled; cross-validation is used instead.
#logreg.fit(X, y)

In [None]:
# разбиение данных на тренировочную и тестовую части
#from sklearn.cross_validation import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [None]:
# обучение 
#logreg.fit(X_train, y_train)

In [None]:
# предсказание
#y_pred = logreg.predict(X_test)
#print(metrics.accuracy_score(y_test, y_pred))

In [None]:
# 20-fold cross-validated accuracy of the logistic regression on the
# rolling-mean features; prints one score per fold.
accuracy = cross_val_score(logreg, X, y, cv=20, scoring='accuracy')
print(accuracy)

In [None]:
print(accuracy.mean())

In [None]:
plt.hist(df_merged.price_history_pct.dropna())

### Логистическая регрессия, переменные - процентные изменения цен history и orderbook

In [None]:
# Second model: features are the percentage changes of the trade price and of
# the order-book price; 10-fold cross-validated accuracy.
# NOTE(review): 'isup' is 1 exactly when price > previous price, i.e. when
# price_history_pct is positive, so the target is a function of the first
# feature (target leakage).  To predict direction, the target likely needs to
# refer to the NEXT row (e.g. isup.shift(-1)) — confirm the intent.
X2 = df_merged[['price_history_pct', 'price_orderbook_pct']]
y2 = df_merged.isup
accuracy2 = cross_val_score(LogisticRegression(), X2, y2, cv=10, scoring='accuracy')
print(accuracy2)
print()
print('средняя точность по папкам', accuracy2.mean())

### Логистическая регрессия, переменные - проц.изменения скользящих средних цен

In [None]:
# Row-over-row percentage change of each rolling mean.
df_merged['price_ma_hist_pct'] = df_merged.price_ma_hist.pct_change()
df_merged['price_ma_ord_pct'] = df_merged.price_ma_ord.pct_change()

# Third model: fit on the pct-change columns this section is about.  The
# original selected the raw rolling means, which merely repeated the features
# of the first model.  pct_change() leaves NaN in the first row, which
# LogisticRegression rejects, so NaN rows are dropped from the model inputs
# only (df_merged itself is left untouched).
model3_cols = ['price_ma_hist_pct', 'price_ma_ord_pct']
model3_data = df_merged[model3_cols + ['isup']].dropna()
X3 = model3_data[model3_cols]
y3 = model3_data.isup
accuracy3 = cross_val_score(LogisticRegression(), X3, y3, cv=10, scoring='accuracy')
print(accuracy3)
print()
print('средняя точность по папкам', accuracy3.mean())

### Логистическая регрессия, 1 переменная - скользящая средняя цена history

In [None]:
# Fourth model: a single feature, the rolling mean of the trade-history price;
# 10-fold cross-validated accuracy.
X4, y4 = df_merged[['price_ma_hist']], df_merged['isup']
accuracy4 = cross_val_score(
    LogisticRegression(),
    X4,
    y4,
    cv=10,
    scoring='accuracy',
)
print(accuracy4, end='\n\n')
print('средняя точность по папкам', accuracy4.mean())

In [None]:
# Cross-validated (10-fold) class predictions for the third model's inputs.
cross_val_predict(LogisticRegression(), X3, y3, cv=10)

In [None]:
# Plot the percentage changes of both rolling means over the whole merged range.
df_merged[['price_ma_hist_pct', 'price_ma_ord_pct']].plot(figsize=(17,10), grid=True)

In [None]:
# Zoom in on rows 60-69 of the two rolling means.  The columns are selected by
# name instead of by position (7:9), so adding or reordering columns in
# df_merged cannot silently change what is plotted.
df_merged[['price_ma_hist', 'price_ma_ord']].iloc[60:70].plot(figsize=(14, 8))
# With no arguments this only returns the current tick locations and labels.
plt.xticks()

In [None]:
# Per-minute mean trade price for 2018-09-29 16:00-17:00, plotted as
# percentage change from the previous minute.
(
    df_history['2018-09-29 16:00':'2018-09-29 17:00']
    .resample('T')['price']
    .mean()
    .pct_change()
    .plot(figsize=(14, 5))
)

In [None]:
#df_history['2018-09-29 10:00':'2018-09-29 23:00'].resample('T').price.mean().pct_change().plot(figsize=(14,8))
#plt.plot(x,y,label='цена история')

# Per-minute median order-book volume for 2018-09-29 16:00-17:00, plotted as
# percentage change from the previous minute.
(
    df_orderbook['2018-09-29 16:00':'2018-09-29 17:00']
    .resample('T')['volume']
    .median()
    .pct_change()
    .plot(figsize=(14, 5))
)

In [None]:
# Per-minute percentage-change series for 2018-09-29 00:00-22:00:
# median order-book volume and mean trade price.
orderbook_array = (
    df_orderbook['2018-09-29 0:00':'2018-09-29 22:00']
    .resample('T')['volume']
    .median()
    .pct_change()
)
history_array = (
    df_history['2018-09-29 0:00':'2018-09-29 22:00']
    .resample('T')['price']
    .mean()
    .pct_change()
)

In [None]:
# Correlation between the per-minute price change and the per-minute change
# in median order-book volume.
history_array.corr(orderbook_array)

In [None]:
len(orderbook_array), len(history_array)

In [None]:
# Export the order-book volume change series to an Excel file in the working directory.
orderbook_array.to_excel('order_vol.xlsx')

In [None]:
df_merged['order_volume_pct'] = 