# Baseline

This notebook builds a basic baseline model using only a few features and minimal preprocessing.

In [1]:
def read_in_data(path_bitcoin_df = '../data/raw/1_training_data_sets/1_bitcoin_price_data_set.csv',
                path_training_df = '../data/raw/1_training_data_sets/1_training_data.csv'):
    """Load the bitcoin price table and the training table from CSV.

    Both files are read with ISO-8859-1 encoding and the shape of each
    resulting frame is printed.

    Parameters
    ----------
    path_bitcoin_df : str
        Path of the bitcoin price CSV file.
    path_training_df : str
        Path of the training data CSV file.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_bitcoin, df)``, in that order.
    """
    # Read from the arguments; the paths used to be hard-coded here, which
    # silently ignored whatever the caller passed in.
    df_bitcoin = pd.read_csv(path_bitcoin_df, encoding="ISO-8859-1")
    df = pd.read_csv(path_training_df, encoding="ISO-8859-1")

    print("Shape of df_bitcoin: {}".format(df_bitcoin.shape))
    print("Shape of df: {}".format(df.shape))
    return df_bitcoin, df

In [2]:
import pandas as pd
import numpy as np 
from datetime import datetime

In [3]:
# Load both raw tables using the default paths of read_in_data().
df_bitcoin, df = read_in_data()

Shape of df_bitcoin: (2355, 1)
Shape of df: (4757, 91)


In [4]:
# Drop every row that contains at least one NA -> just for the baseline
df.dropna(inplace=True)
# Fill the placeholder via str.format; the shape used to be passed as a second
# print() argument, which left the literal "{}" in the printed text.
print("Shape after cleaning: {}".format(df.shape))

Shape after cleaning: {} (34, 91)


Too few rows are left -> pick features first

## Pick basic features

In [5]:
# Restrict the baseline to five hand-picked columns.
df_features = df[
    ['categories_0', 'transaction_count', 'holder_count', 'timestamp', 'country_origin']
]

In [6]:
# Preview the first rows of the selected feature columns.
df_features.head()

Unnamed: 0,categories_0,transaction_count,holder_count,timestamp,country_origin
2,Protocol,22730.0,13914.0,Sep-25-2019 11:58:48 PM,SI
85,Business Services,52659.0,32556.0,Sep-26-2019 12:17:20 PM,SC
585,Cryptocurrency,1407.0,545.0,Sep-24-2019 08:09:22 AM,GB
649,Communication,2018.0,512.0,Sep-26-2019 02:00:39 PM,MU
671,Business Platform,12341.0,3202.0,Sep-26-2019 01:24:41 PM,SG


In [7]:
# Report how many values are missing in each selected column.
for col, na_count in df_features.isna().sum().items():
    print("Column {} has {} NA values".format(col, na_count))

Column categories_0 has 0 NA values
Column transaction_count has 0 NA values
Column holder_count has 0 NA values
Column timestamp has 0 NA values
Column country_origin has 0 NA values


In [8]:
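# Planned handling per feature:
# - categories_0: one-hot encode, with an extra column for NA
# - transaction_count: fill NA with the column mean
# - holder_count: fill NA with the column mean
# - timestamp: remove rows where it is missing
# - country_origin: one-hot encode, with an extra column for NA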

## Category 1

In [9]:
# Final frame for the model. It starts with just the OBS_ID key as a one-column
# DataFrame (double brackets) rather than a Series, so every later pd.merge
# operates on DataFrames.
df_X = df[['OBS_ID']].copy()

In [10]:
# One-hot encode categories_0 (with an explicit NaN indicator column) and put the
# OBS_ID key next to the dummies so the result can be merged into the model frame.
# NOTE(review): the dummy prefix is 'categories_1' although the source column is
# categories_0 - confirm this naming is intended.
ohe_cat1 = pd.concat(
    [
        df['OBS_ID'],
        pd.get_dummies(df['categories_0'], prefix='categories_1', dummy_na=True),
    ],
    axis=1,
)

In [11]:
# Preview the one-hot encoded category columns together with OBS_ID.
ohe_cat1.head()

Unnamed: 0,OBS_ID,categories_1_Artificial Intelligence,categories_1_Business Platform,categories_1_Business Services,categories_1_Charity,categories_1_Communication,categories_1_Cryptocurrency,categories_1_Decentralized Exchange,categories_1_Entertainment,categories_1_Exchange-based Tokens,categories_1_Finance / Banking,categories_1_Gambling,categories_1_Infrastructure,categories_1_Internet of Things (IOT),categories_1_Media,categories_1_Other,categories_1_Protocol,categories_1_Software,categories_1_nan
2,235693,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
85,235776,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
585,236276,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
649,236340,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
671,236362,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# Join the category dummies onto the model frame. No `on=` is given, so pandas
# performs its default inner join on the column names both inputs share.
df_X = pd.merge(df_X, ohe_cat1)

## Transaction Count

In [13]:
# Fill missing transaction counts with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute access: that form mutates an intermediate
# Series and is not guaranteed to update `df` (a silent no-op under pandas
# Copy-on-Write).
df['transaction_count'] = df['transaction_count'].fillna(df['transaction_count'].mean())

In [14]:
# Add transaction_count to the model frame (default inner join on the shared columns).
df_X = df_X.merge(df[['OBS_ID', 'transaction_count']])

## Holder Count

In [15]:
# Fill missing holder counts with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute access: that form mutates an intermediate
# Series and is not guaranteed to update `df` (a silent no-op under pandas
# Copy-on-Write).
df['holder_count'] = df['holder_count'].fillna(df['holder_count'].mean())

In [16]:
# Add holder_count to the model frame (default inner join on the shared columns).
df_X = df_X.merge(df[['OBS_ID', 'holder_count']])

## Timestamp

In [17]:
# Keep only the rows that actually have a timestamp.
df = df[df.timestamp.notna()]

In [18]:
# Cast the timestamp column to str. `assign` returns a new frame, which avoids
# the SettingWithCopyWarning raised by item assignment on a frame that was just
# produced by boolean filtering, and matches the `df.assign` style used below.
df = df.assign(timestamp=df.timestamp.astype(str))

In [19]:
# Check how many rows and columns remain at this point.
df.shape

(34, 91)

In [20]:
# Parse the textual timestamps (e.g. "Sep-25-2019 11:58:48 PM") into datetimes.
# The hour is read with %I (12-hour clock): with %H the AM/PM marker matched by
# %p is ignored, so "11:58:48 PM" would be parsed as 11:58 instead of 23:58.
# pd.to_datetime parses the whole column at once instead of row by row.
df = df.assign(timestamp_datetime = pd.to_datetime(df.timestamp.astype(str),
                                                   format='%b-%d-%Y %I:%M:%S %p'))

In [21]:
# Split the parsed timestamp into its components with the vectorized `.dt`
# accessor (replaces six separate row-wise `apply` passes over the frame).
df = df.assign(
    hour=df.timestamp_datetime.dt.hour,
    minute=df.timestamp_datetime.dt.minute,
    second=df.timestamp_datetime.dt.second,
    month=df.timestamp_datetime.dt.month,
    day=df.timestamp_datetime.dt.day,
    year=df.timestamp_datetime.dt.year,
)

# Discard rows dated 1970 or 2017.
# NOTE(review): 1970 is presumably an epoch-zero placeholder and 2017 an
# out-of-scope year - confirm why these two years are excluded.
df = df[~df.year.isin([1970, 2017])]

In [22]:
# Add the timestamp components to the model frame (default inner join on the
# shared columns, so rows removed from `df` above drop out of df_X as well).
time_part_columns = ['OBS_ID', 'hour', 'minute', 'second', 'month', 'day', 'year']
df_X = df_X.merge(df[time_part_columns])

## Country origin

In [23]:
# One-hot encode country_origin (with an explicit NaN indicator column) and put
# the OBS_ID key next to the dummies so the result can be merged into df_X.
ohe_country = pd.concat(
    [
        df['OBS_ID'],
        pd.get_dummies(df['country_origin'], prefix='country', dummy_na=True),
    ],
    axis=1,
)

In [24]:
# Add the country dummies to the model frame (default inner join on the shared columns).
df_X = df_X.merge(ohe_country)

In [25]:
# Size of the assembled feature frame.
df_X.shape

(34, 47)

In [32]:
# Display the previously exported baseline file.
# NOTE(review): this reads the file that a later cell writes with df_X.to_csv(...),
# so on a fresh "Restart & Run All" it only works if the file already exists on disk.
# NOTE(review): 'baselince' looks like a typo for 'baseline'; the same spelling is
# used by the export cell - rename both together if desired.
pd.read_csv('../data/processed/baselince.csv')

Unnamed: 0,OBS_ID,categories_1_Artificial Intelligence,categories_1_Business Platform,categories_1_Business Services,categories_1_Charity,categories_1_Communication,categories_1_Cryptocurrency,categories_1_Decentralized Exchange,categories_1_Entertainment,categories_1_Exchange-based Tokens,...,country_MU,country_MX,country_NL,country_RO,country_SC,country_SG,country_SI,country_UA,country_US,country_nan
0,235693,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,235776,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,236276,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,236340,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,236362,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,236469,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6,236603,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,237066,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
8,237134,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,237267,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Append the `success` column to the model frame (default inner join on the shared columns).
df_X = df_X.merge(df[['OBS_ID', 'success']])

In [36]:
# Persist the assembled frame without writing the row index.
df_X.to_csv('../data/processed/baselince.csv', index=False)

In [35]:
# Preview the first rows instead of rendering the whole frame.
df_X.head(10)

Unnamed: 0,OBS_ID,categories_1_Artificial Intelligence,categories_1_Business Platform,categories_1_Business Services,categories_1_Charity,categories_1_Communication,categories_1_Cryptocurrency,categories_1_Decentralized Exchange,categories_1_Entertainment,categories_1_Exchange-based Tokens,...,country_MX,country_NL,country_RO,country_SC,country_SG,country_SI,country_UA,country_US,country_nan,success
0,235693,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,235776,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,236276,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,236340,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,236362,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
5,236469,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,236603,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,237066,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
8,237134,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9,237267,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


# Training

In [29]:
!pip install scipy

[31mERROR: Could not install packages due to an EnvironmentError: [Errno 2] No such file or directory: '/home/sandro/anaconda3/lib/python3.7/site-packages/scikit_learn-0.20.3.dist-info/METADATA'
[0m


In [30]:
from sklearn.ensemble import RandomForestClassifier

ImportError: cannot import name 'RandomForestClassifier' from 'sklearn.ensemble' (unknown location)