In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn import set_config; set_config(display='diagram')

# Relative path of the candle CSV loaded by get_data_without_headers below.
BASE_DATA_PATH = "../../CryptoBotPrueba/ETHUSDT-1m-2022-05.csv"
# Column labels applied to that CSV, in file order (it is read with header=None,
# i.e. the file itself carries no header row).
COLUMN_NAMES = ["open_time","open","high","low", "close",
                         "volume", "close_time" ,"quote_asset_volume",
                         "number_of_trades", "taker_buy_base_asset_volume",
                         "taker_buy_quote_asset_volume", "ignore"]

def get_data_without_headers(path, columns):
    """Read a header-less CSV and label its columns.

    Args:
        path: Anything ``pandas.read_csv`` accepts as its first argument.
        columns: Column labels to assign, in file order.

    Returns:
        The parsed ``DataFrame`` with ``columns`` as its column labels.
    """
    # header=None tells pandas the first line is data, not column names.
    return pd.read_csv(path, names=columns, header=None)

# Load the CSV at BASE_DATA_PATH with COLUMN_NAMES as labels, then preview three rows.
df = get_data_without_headers(BASE_DATA_PATH,COLUMN_NAMES)
df.head(3)


Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore
0,1651363200000,2726.67,2729.36,2725.86,2728.04,628.9811,1651363259999,1715749.0,700,348.222,949877.534398,0
1,1651363260000,2728.05,2728.59,2726.51,2727.2,368.7584,1651363319999,1005854.0,484,279.6466,762819.463711,0
2,1651363320000,2727.21,2731.74,2727.2,2731.67,563.1505,1651363379999,1537359.0,705,364.8842,995979.439779,0


In [2]:
# Check column dtypes and non-null counts of the loaded frame.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44640 entries, 0 to 44639
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   open_time                     44640 non-null  int64  
 1   open                          44640 non-null  float64
 2   high                          44640 non-null  float64
 3   low                           44640 non-null  float64
 4   close                         44640 non-null  float64
 5   volume                        44640 non-null  float64
 6   close_time                    44640 non-null  int64  
 7   quote_asset_volume            44640 non-null  float64
 8   number_of_trades              44640 non-null  int64  
 9   taker_buy_base_asset_volume   44640 non-null  float64
 10  taker_buy_quote_asset_volume  44640 non-null  float64
 11  ignore                        44640 non-null  int64  
dtypes: float64(8), int64(4)
memory usage: 4.1 MB


In [3]:
def base_time_coversion(df):
    """Convert the ``open_time`` and ``close_time`` columns to datetimes.

    Both columns are interpreted as integer epoch timestamps in milliseconds
    and are overwritten in place with timezone-naive UTC datetimes.

    Args:
        df: DataFrame with ``open_time`` and ``close_time`` columns.

    Returns:
        The same ``df`` object, with both columns converted.
    """
    # Vectorised conversion: avoids a Python-level call per row and the
    # deprecated ``datetime.utcfromtimestamp``; also gives an empty frame a
    # proper datetime dtype.
    for column in ("open_time", "close_time"):
        df[column] = pd.to_datetime(df[column], unit="ms")
    return df

# Replace the epoch open_time/close_time columns with datetimes and preview the result.
df = base_time_coversion(df)
df.head(3)

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore
0,2022-05-01 00:00:00,2726.67,2729.36,2725.86,2728.04,628.9811,2022-05-01 00:00:59.999,1715749.0,700,348.222,949877.534398,0
1,2022-05-01 00:01:00,2728.05,2728.59,2726.51,2727.2,368.7584,2022-05-01 00:01:59.999,1005854.0,484,279.6466,762819.463711,0
2,2022-05-01 00:02:00,2727.21,2731.74,2727.2,2731.67,563.1505,2022-05-01 00:02:59.999,1537359.0,705,364.8842,995979.439779,0


In [4]:
# Feature Union in Pipeline


def define_target(df):
    """Add a binary ``target`` column derived from ``open`` and ``close``.

    ``target`` is 0 where ``open - close <= 0`` and 1 otherwise. The column is
    written onto ``df`` itself.

    Args:
        df: DataFrame with numeric ``open`` and ``close`` columns.

    Returns:
        The same ``df`` object, with an int64 ``target`` column added.
    """
    price_drop = df["open"] - df["close"]
    # Negate ``<= 0`` instead of testing ``> 0`` so that a NaN difference is
    # labelled 1 (NaN <= 0 is False); the explicit cast keeps the column int64
    # even when the frame is empty.
    df["target"] = (~price_drop.le(0)).astype("int64")
    return df

# Add the binary target column and preview it.
df = define_target(df)
df.head(3)

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore,target
0,2022-05-01 00:00:00,2726.67,2729.36,2725.86,2728.04,628.9811,2022-05-01 00:00:59.999,1715749.0,700,348.222,949877.534398,0,0
1,2022-05-01 00:01:00,2728.05,2728.59,2726.51,2727.2,368.7584,2022-05-01 00:01:59.999,1005854.0,484,279.6466,762819.463711,0,1
2,2022-05-01 00:02:00,2727.21,2731.74,2727.2,2731.67,563.1505,2022-05-01 00:02:59.999,1537359.0,705,364.8842,995979.439779,0,0


In [5]:
# Relative path of the fear & greed index CSV loaded by get_data_with_headers below.
FEAR_GREED_PATH = "../../CryptoBotPrueba/fear_greed_index.csv"

def get_data_with_headers(path, index_col=None):
    """Read a CSV whose first row holds the column names.

    Args:
        path: Anything ``pandas.read_csv`` accepts as its first argument.
        index_col: Optional column (position or label) to use as the row
            index. Pass ``0`` for files that were written together with their
            index, which otherwise shows up as a stray ``Unnamed: 0`` column.
            The default ``None`` keeps every column as data.

    Returns:
        The parsed ``DataFrame``.
    """
    return pd.read_csv(path, index_col=index_col)

# Load the CSV at FEAR_GREED_PATH (header row included) and preview three rows.
fg = get_data_with_headers(FEAR_GREED_PATH)
fg.head(3)

Unnamed: 0.1,Unnamed: 0,value,value_classification,timestamp
0,0,13,Extreme Fear,1654473600
1,1,10,Extreme Fear,1654387200
2,2,14,Extreme Fear,1654300800


In [6]:
def fg_time_conversion(fg):
    """Convert the ``timestamp`` column from epoch seconds to datetimes.

    Values are cast to int64 first (dropping any fractional part), then
    interpreted as seconds since the Unix epoch. The column is overwritten in
    place with timezone-naive UTC datetimes.

    Args:
        fg: DataFrame with a ``timestamp`` column of epoch seconds.

    Returns:
        The same ``fg`` object, with ``timestamp`` converted.
    """
    # Vectorised replacement for a per-row ``datetime.utcfromtimestamp`` call,
    # which is deprecated in recent Python versions.
    epoch_seconds = fg["timestamp"].astype("int64")
    fg["timestamp"] = pd.to_datetime(epoch_seconds, unit="s")
    return fg

# Convert the timestamp column to datetimes and preview the result.
fg = fg_time_conversion(fg)
fg.head(3)

Unnamed: 0.1,Unnamed: 0,value,value_classification,timestamp
0,0,13,Extreme Fear,2022-06-06
1,1,10,Extreme Fear,2022-06-05
2,2,14,Extreme Fear,2022-06-04


In [7]:

# def fg_time_to_string(df, fg):
#     df["close_time_str"] = df["close_time"].apply(lambda x: x.strftime("%Y-%m-%d"))
#     fg["close_time_str"] = fg["timestamp"].apply(lambda x: x.strftime("%Y-%m-%d"))
#     return (df, fg)

# df, fg = fg_time_to_string(df, fg)



In [8]:
# Note: .strftime converts datetimes to strings, e.g. .strftime('%Y-%m-%d %H:%M:%S')

def df_time_to_string(df):
    """Add a ``close_time_str`` column holding ``close_time`` as ``YYYY-MM-DD``.

    The new column is written onto ``df`` itself. Missing datetimes (NaT)
    produce a missing value instead of raising.

    Args:
        df: DataFrame with a datetime-like ``close_time`` column.

    Returns:
        The same ``df`` object, with ``close_time_str`` added.
    """
    # ``pd.to_datetime`` is a no-op for datetime64 columns and lets the
    # vectorised ``.dt`` accessor work on object columns of datetimes too.
    close_times = pd.to_datetime(df["close_time"])
    df["close_time_str"] = close_times.dt.strftime("%Y-%m-%d")
    return df

def fg_time_to_string(fg):
    """Add a ``close_time_str`` column holding ``timestamp`` as ``YYYY-MM-DD``.

    The new column is written onto ``fg`` itself. Missing datetimes (NaT)
    produce a missing value instead of raising.

    Args:
        fg: DataFrame with a datetime-like ``timestamp`` column.

    Returns:
        The same ``fg`` object, with ``close_time_str`` added.
    """
    # ``pd.to_datetime`` is a no-op for datetime64 columns and lets the
    # vectorised ``.dt`` accessor work on object columns of datetimes too.
    timestamps = pd.to_datetime(fg["timestamp"])
    fg["close_time_str"] = timestamps.dt.strftime("%Y-%m-%d")
    return fg

# Give both frames a close_time_str (YYYY-MM-DD) column; it is the key they are merged on later.
df = df_time_to_string(df)
fg = fg_time_to_string(fg)


In [9]:
# Preview df to confirm the close_time_str column was added.
df.head(3)

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore,target,close_time_str
0,2022-05-01 00:00:00,2726.67,2729.36,2725.86,2728.04,628.9811,2022-05-01 00:00:59.999,1715749.0,700,348.222,949877.534398,0,0,2022-05-01
1,2022-05-01 00:01:00,2728.05,2728.59,2726.51,2727.2,368.7584,2022-05-01 00:01:59.999,1005854.0,484,279.6466,762819.463711,0,1,2022-05-01
2,2022-05-01 00:02:00,2727.21,2731.74,2727.2,2731.67,563.1505,2022-05-01 00:02:59.999,1537359.0,705,364.8842,995979.439779,0,0,2022-05-01


In [10]:

# Preview fg to confirm the close_time_str column was added.
fg.head(3)

Unnamed: 0.1,Unnamed: 0,value,value_classification,timestamp,close_time_str
0,0,13,Extreme Fear,2022-06-06,2022-06-06
1,1,10,Extreme Fear,2022-06-05,2022-06-05
2,2,14,Extreme Fear,2022-06-04,2022-06-04


In [11]:
def merge_df_fg(df,fg):
    """Join ``df`` and ``fg`` on their shared ``close_time_str`` column.

    Args:
        df: Left DataFrame; must contain ``close_time_str``.
        fg: Right DataFrame; must contain ``close_time_str``.

    Returns:
        A new DataFrame with one row per matching (left, right) pair. Rows
        whose key appears in only one of the inputs are dropped (inner join).
    """
    return df.merge(right=fg, how="inner", on="close_time_str")

# Join the candle rows with the fear & greed rows that share the same close_time_str.
df_fg =  merge_df_fg(df,fg)
df_fg.head(3)

Unnamed: 0.1,open_time,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore,target,close_time_str,Unnamed: 0,value,value_classification,timestamp
0,2022-05-01 00:00:00,2726.67,2729.36,2725.86,2728.04,628.9811,2022-05-01 00:00:59.999,1715749.0,700,348.222,949877.534398,0,0,2022-05-01,36,22,Extreme Fear,2022-05-01
1,2022-05-01 00:01:00,2728.05,2728.59,2726.51,2727.2,368.7584,2022-05-01 00:01:59.999,1005854.0,484,279.6466,762819.463711,0,1,2022-05-01,36,22,Extreme Fear,2022-05-01
2,2022-05-01 00:02:00,2727.21,2731.74,2727.2,2731.67,563.1505,2022-05-01 00:02:59.999,1537359.0,705,364.8842,995979.439779,0,0,2022-05-01,36,22,Extreme Fear,2022-05-01


In [12]:
# Scaling numerical values

# Numeric columns to scale; passed to num_scaler in this cell.
COLUMNS_LIST = ["open", "high", "low", "close", "volume", "number_of_trades", "taker_buy_base_asset_volume"]

def num_scaler(df_fg, columns):
    """Robust-scale the given columns of ``df_fg`` in place.

    A ``RobustScaler`` is fitted on ``df_fg[columns]`` and those same columns
    are overwritten with the scaled values.

    Args:
        df_fg: DataFrame containing every column named in ``columns``.
        columns: List of column labels to scale.

    Returns:
        The same ``df_fg`` object, with ``columns`` replaced by scaled values.
    """
    r_scaler = RobustScaler()
    # Fit and transform the *same* column set taken from the ``columns``
    # argument; relying on a module-level list for the transform step would
    # silently ignore the caller's choice (or fail when the lists differ).
    df_fg[columns] = r_scaler.fit_transform(df_fg[columns])
    return df_fg

# Scale the COLUMNS_LIST columns of df_fg and preview the result.
df_fg = num_scaler(df_fg, COLUMNS_LIST)
df_fg.head(3)

Unnamed: 0.1,open_time,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore,target,close_time_str,Unnamed: 0,value,value_classification,timestamp
0,2022-05-01 00:00:00,1.206339,1.207481,1.208926,1.208827,0.651466,2022-05-01 00:00:59.999,1715749.0,0.531621,0.782311,949877.534398,0,0,2022-05-01,36,22,Extreme Fear,2022-05-01
1,2022-05-01 00:01:00,1.208751,1.206136,1.210063,1.207358,0.114019,2022-05-01 00:01:59.999,1005854.0,0.104743,0.505949,762819.463711,0,1,2022-05-01,36,22,Extreme Fear,2022-05-01
2,2022-05-01 00:02:00,1.207283,1.211638,1.21127,1.215173,0.515504,2022-05-01 00:02:59.999,1537359.0,0.541502,0.849461,995979.439779,0,0,2022-05-01,36,22,Extreme Fear,2022-05-01


In [13]:
# Encoding categorical values
from sklearn.preprocessing import LabelEncoder

def value_class_encoder(df_fg):
    """Replace the ``value_classification`` labels with integer codes.

    A fresh ``LabelEncoder`` is fitted on the column and the column is
    overwritten in place with the encoded values.

    Args:
        df_fg: DataFrame with a ``value_classification`` column.

    Returns:
        The same ``df_fg`` object, with the column encoded.
    """
    column = 'value_classification'
    encoder = LabelEncoder()
    # Fitting and transforming in one call is equivalent to fit() then transform().
    df_fg[column] = encoder.fit_transform(df_fg[column])
    return df_fg

# Replace the value_classification labels with integer codes and preview the result.
df_fg = value_class_encoder(df_fg)
df_fg.head(3)



Unnamed: 0.1,open_time,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore,target,close_time_str,Unnamed: 0,value,value_classification,timestamp
0,2022-05-01 00:00:00,1.206339,1.207481,1.208926,1.208827,0.651466,2022-05-01 00:00:59.999,1715749.0,0.531621,0.782311,949877.534398,0,0,2022-05-01,36,22,0,2022-05-01
1,2022-05-01 00:01:00,1.208751,1.206136,1.210063,1.207358,0.114019,2022-05-01 00:01:59.999,1005854.0,0.104743,0.505949,762819.463711,0,1,2022-05-01,36,22,0,2022-05-01
2,2022-05-01 00:02:00,1.207283,1.211638,1.21127,1.215173,0.515504,2022-05-01 00:02:59.999,1537359.0,0.541502,0.849461,995979.439779,0,0,2022-05-01,36,22,0,2022-05-01


In [14]:
def split_data(df):
    """Split a frame into a feature frame and a one-column target frame.

    Args:
        df: DataFrame containing the feature columns listed below plus
            ``target``.

    Returns:
        Tuple ``(X, y)`` where ``X`` holds the selected feature columns and
        ``y`` is a DataFrame with the single column ``target``.
    """
    feature_names = [
        "open_time",
        "open",
        "high",
        "low",
        "close",
        "volume",
        "close_time",
        "quote_asset_volume",
        "number_of_trades",
        "taker_buy_base_asset_volume",
        "taker_buy_quote_asset_volume",
        "value",
        "value_classification",
    ]
    features = df[feature_names]
    # Keep the target two-dimensional (a DataFrame, not a Series).
    labels = df["target"].to_frame()
    return features, labels

# Split the merged frame into the feature frame X and the target frame y.
X,y = split_data(df_fg)


In [15]:
# Preview the feature frame.
X.head(3)

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,value,value_classification
0,2022-05-01 00:00:00,1.206339,1.207481,1.208926,1.208827,0.651466,2022-05-01 00:00:59.999,1715749.0,0.531621,0.782311,949877.534398,22,0
1,2022-05-01 00:01:00,1.208751,1.206136,1.210063,1.207358,0.114019,2022-05-01 00:01:59.999,1005854.0,0.104743,0.505949,762819.463711,22,0
2,2022-05-01 00:02:00,1.207283,1.211638,1.21127,1.215173,0.515504,2022-05-01 00:02:59.999,1537359.0,0.541502,0.849461,995979.439779,22,0


In [16]:
# Preview the target frame.
y.head(3)

Unnamed: 0,target
0,0
1,1
2,0


In [17]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn import set_config; set_config(display='diagram')


# Scale the numerical feature columns (no imputation step is applied here).
num_transformer = RobustScaler()

# Encode the categorical *feature* column as integer codes.
# LabelEncoder is meant for a 1-D target: its fit/transform take a single `y`
# argument, so it fails when ColumnTransformer calls it with (X, y).
# OrdinalEncoder is the feature-side equivalent and accepts 2-D column input.
target_transformer = OrdinalEncoder()

# Transform date:
# TODO: not implemented yet.


# Apply the scaler and the encoder to their respective columns side by side;
# every other column is passed through unchanged.
preprocessor = ColumnTransformer([
    ('num_tr', num_transformer, COLUMNS_LIST),
    ('cat_tr', target_transformer, ['value_classification'])],
    remainder='passthrough')

preprocessor

In [18]:
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer


def _target_from_prices(frame):
    """Return a one-column frame: 0 where open - close <= 0, else 1."""
    price_drop = frame["open"] - frame["close"]
    return (~price_drop.le(0)).astype("int64").to_frame("target")


def _close_date_string(frame):
    """Return a one-column frame with ``close_time`` formatted as YYYY-MM-DD."""
    return frame["close_time"].dt.strftime("%Y-%m-%d").to_frame("close_time_str")


# FunctionTransformer must be given a *callable* that receives the data at
# fit/transform time. Passing an already-computed Series (as before) raises as
# soon as the transformer is fitted. The helpers return 2-D frames because
# FeatureUnion stacks the transformer outputs column-wise.
target = FunctionTransformer(_target_from_prices)

close_time_str = FunctionTransformer(_close_date_string)


# NOTE(review): including the target among the union's outputs puts the label
# into the feature matrix — confirm this is intended before training on it.
union = FeatureUnion([
    ("preprocess", preprocessor), # scaled, encoded and passed-through columns
    ("target", target), # new column "target"
    ("close_time_str", close_time_str) # new column "close_time_str"
])
union

In [20]:
!pip install yfinance

Collecting yfinance
  Downloading yfinance-0.1.70-py2.py3-none-any.whl (26 kB)
Collecting multitasking>=0.0.7
  Downloading multitasking-0.0.10.tar.gz (8.2 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: multitasking
  Building wheel for multitasking (setup.py) ... [?25ldone
[?25h  Created wheel for multitasking: filename=multitasking-0.0.10-py3-none-any.whl size=8488 sha256=bb05badf77bada8d9792cd644708b59fe8f43ae7124a502145a0a1dad9405d66
  Stored in directory: /home/bajiks/.cache/pip/wheels/21/c9/66/b41c847de65c7985db52ec21d59996841598b8b0e93f2b9500
Successfully built multitasking
Installing collected packages: multitasking, yfinance
Successfully installed multitasking-0.0.10 yfinance-0.1.70


In [21]:
from cryptobot.yahoo_market_data import get_yahoo_data


In [27]:
# Fetch data through the project helper; presumably the arguments are
# (ticker, start date, end date) — confirm against cryptobot.yahoo_market_data.
yd = get_yahoo_data("^TNX", "2020-08-05", "2020-09-08")

In [30]:
# Check column dtypes and non-null counts of the returned frame.
yd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       23 non-null     datetime64[ns]
 1   Open       23 non-null     float64       
 2   High       23 non-null     float64       
 3   Low        23 non-null     float64       
 4   Close      23 non-null     float64       
 5   Volume     23 non-null     int64         
 6   timestamp  23 non-null     float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 1.4 KB
