<a href="https://colab.research.google.com/github/AlexandreBourrieau/ML/blob/main/Carnets%20Jupyter/S%C3%A9ries%20temporelles/Bitcoin/Bitcoin_Identification_Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf

# Téléchargement des données

In [None]:
# Download the zipped dataset from GitHub, then extract it into the working directory.
# NOTE(review): --no-check-certificate turns off TLS certificate verification for this download.
!wget --no-check-certificate --content-disposition "https://github.com/AlexandreBourrieau/ML/blob/main/Carnets%20Jupyter/S%C3%A9ries%20temporelles/Bitcoin/Bitcoin_complet.zip?raw=true"
!unzip Bitcoin_complet.zip

In [None]:
# Load the extracted CSV into a DataFrame and display its (rows, columns) shape
df_data=pd.read_csv('Bitcoin_complet.csv')
df_data.shape

In [None]:
df_data.head()

# Préparation des jeux de données

In [None]:
# Move the 'Price' column to position 1 (second column of the frame)
col_prix = df_data.pop('Price')
df_data.insert(1,'Price',col_prix)
df_data.head()

In [None]:
# Columns at positions 2 to 19 of df_data, kept as a separate frame of raw features
X_raw = df_data.iloc[:,2:20]
X_raw.head()

In [None]:
# Column at position 1 of df_data, kept as a one-column DataFrame; later cells use it as the regression target
y = df_data.iloc[:,1:2]
y.head()

# Identification des variables par Random Forest : Choix des paramètres

**1. Construction du dataframe sur 1 jour**

In [None]:
# Feature matrix: every column except the target and the dates, with missing
# values replaced by the most frequent value of their column.
X = df_data.drop(columns=['Price','Dates'])
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
Xdrop = pd.DataFrame(imputer.fit_transform(X))
# The imputer returns a bare array, so the column names are put back by hand
Xdrop.columns = X.columns
X = Xdrop
X

**2. Choix du nombre d'arbres :**

In [None]:
# Dataset dimensions, read from the feature matrix instead of being hard-coded
n, p = X.shape        # n: number of observations, p: number of features

n_arbres_max = 25

# Ten forest sizes spread between 1 and n_arbres_max trees
n_arbres = np.linspace(1,n_arbres_max,10).astype(np.int32)
mtry = int(np.sqrt(p))
OOB_err = []

for i in n_arbres:
  print("#Arbres : %d" %i)
  # random_state makes the out-of-bag curve reproducible from one run to the next
  clf = RandomForestRegressor(n_estimators=int(i), bootstrap=True, oob_score=True, max_samples=n, max_features=mtry, n_jobs=-1, random_state=7)
  # np.ravel gives the 1-D target scikit-learn expects, without going through TensorFlow
  clf.fit(X,np.ravel(y))
  # The error is taken as 1 minus the out-of-bag score of the forest
  OOB_err.append(1 - clf.oob_score_)

In [None]:
plt.plot(n_arbres,OOB_err)

On choisit n_arbres = 25

**3. Choix de la valeur de mtry (nombre de variables testées à chaque division)**

In [None]:
(np.sqrt(p)/2).astype(np.int32)

In [None]:
# Dataset dimensions, read from the feature matrix instead of being hard-coded
n, p = X.shape        # n: number of observations, p: number of features

n_arbres = 25
mtry_0 = int(np.sqrt(p)/2)

# Ten candidate values of mtry, from sqrt(p)/2 up to all p features
m_try = np.linspace(mtry_0,p,10).astype(np.int32)

OOB_err = []

for i in m_try:
   print("mtry = %s" %i)
   # random_state makes the out-of-bag curve reproducible from one run to the next
   clf = RandomForestRegressor(n_estimators=n_arbres, bootstrap=True, oob_score=True, max_features=int(i), n_jobs=-1, random_state=7)
   # np.ravel gives the 1-D target scikit-learn expects, without going through TensorFlow
   clf.fit(X,np.ravel(y))
   # The error is taken as 1 minus the out-of-bag score of the forest
   OOB_err.append(1 - clf.oob_score_)

In [None]:
plt.plot(m_try,OOB_err)

On choisit mtry = 768

# Importance des variables - Première approche : Importance sans permutations

**1. Entrainement de la forêt**

In [None]:
from sklearn.inspection import permutation_importance

# Dataset dimensions, read from the feature matrix (the previous literals did not describe this dataset)
n, p = X.shape        # n: number of observations, p: number of features
n_arbres = 25
# Value retained from the mtry sweep, capped so it never exceeds the number of features
m_try = min(768, p)

# random_state keeps the fitted forest, and therefore the importances, reproducible
clf = RandomForestRegressor(n_estimators=n_arbres, bootstrap=True, oob_score=True, max_features=m_try, n_jobs=-1, random_state=7)
clf.fit(X,np.ravel(y))

**2. Affichage de l'importance des variables**

In [None]:
# Table of features and their importances, ordered from least to most important
col_sorted_by_importance = np.argsort(clf.feature_importances_)
feat_imp = pd.DataFrame(
    {
        'cols': X.columns[col_sorted_by_importance],
        'imps': clf.feature_importances_[col_sorted_by_importance],
    }
)
feat_imp

In [None]:
!pip install plotly_express --upgrade -q

In [None]:
import plotly_express as px
import plotly.offline as po

# Bar chart of the 30 largest values of feat_imp['imps'], labelled by feature name
px.bar(feat_imp.sort_values(['imps'], ascending=False)[:30], x='cols', y='imps', labels={'cols':'column', 'imps':'feature importance'})

# Importance des variables - Deuxième approche : Méthode par permutations des importances

Permutation importance is a technique where we shuffle the values of a single column and run the model to see how the scores get affected. If the scores are affected greatly, then the feature is highly important to the model and if not, it does not add significant value to the model.

Let us see the feature importances measured as the drop in the model's score (R², as returned by `clf.score`) on the dataset the forest was fitted on.

In [None]:
import random

def PermImportance(X, y, clf, metric, num_iterations=100):
    '''
    Calculates the permutation importance of features in a dataset.
    Inputs:
    X: dataframe with all the features
    y: array-like sequence of target values
    clf: fitted estimator exposing score(X, y)
    metric: kept for backward compatibility; it is not called, the score comes from clf.score
    num_iterations: no. of repetitive runs of the permutation
    Outputs:
    baseline: the baseline score without any of the columns permutated
    scores: differences in baseline score caused by permutation of each feature, dict in the format {feature:[diffs]}
    '''
    # The progress bar is optional: the function also works when the
    # progressbar package has not been imported yet or is not installed.
    try:
        import progressbar
        bar = progressbar.ProgressBar(max_value=len(X.columns))
    except ImportError:
        bar = None

    baseline_metric = clf.score(X, y)
    scores = {c: [] for c in X.columns}
    # One working copy for the whole run; each column is restored after use
    # instead of deep-copying the full frame once per feature.
    X1 = X.copy(deep=True)
    for position, c in enumerate(X.columns):
        original_values = X1[c].copy()
        for _ in range(num_iterations):
            temp = X1[c].tolist()
            random.shuffle(temp)
            X1[c] = temp
            score = clf.score(X1, y)
            scores[c].append(baseline_metric - score)
        # Put the untouched values back so the next feature is tested alone
        X1[c] = original_values
        if bar is not None:
            bar.update(position + 1)
    if bar is not None:
        bar.finish()
    return baseline_metric, scores

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import progressbar

# Shuffle every feature 10 times against the fitted forest `clf`.
# NOTE(review): recall_score is passed as `metric`, but PermImportance as written in this
# notebook computes its scores with clf.score and never calls `metric`.
baseline, scores = PermImportance(X, tf.squeeze(np.asarray(y),1), clf, recall_score, num_iterations=10)

In [None]:
# Express every score drop as a percentage of the baseline score
percent_changes = {
    c: [drop / baseline * 100 for drop in scores[c]]
    for c in X.columns
}

In [None]:
pd.DataFrame.from_dict(percent_changes).melt().groupby(['variable']).mean().reset_index().sort_values(['value'], ascending=False)[:25]

In [None]:
# Mean relative score drop per feature, 25 largest first
top_changes = (
    pd.DataFrame.from_dict(percent_changes)
    .melt()
    .groupby(['variable'])
    .mean()
    .reset_index()
    .sort_values(['value'], ascending=False)[:25]
)
# The score being compared is the regressor's R², so the axis is labelled accordingly
px.bar(
    top_changes,
    x='variable',y='value',labels={'variable':'column','value':'% change in R²'})

# Comparaison entre les deux méthodes

In [None]:
# Standard deviation and mean of the permutation score drops, per feature
scores_std = {element: np.std(drops) for element, drops in scores.items()}
scores_mean = {element: np.mean(drops) for element, drops in scores.items()}

# One row per feature, ordered by decreasing mean drop
df_perm = (
    pd.DataFrame.from_dict([scores_std, scores_mean])
    .transpose()
    .rename(columns={0:"std",1:"mean"})
    .sort_values(by=['mean'],ascending=False)
)
df_perm

In [None]:
# Top 30 features of each method
feature_perm = df_perm[0:30]
feature_imp = feat_imp.sort_values(['imps'],ascending=False)
feature_imp = feature_imp[0:30]

tree_indices = np.arange(0, len(feature_imp)) + 0.5

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))

# Left panel: importances reported by the forest itself
ax1.barh(tree_indices,feature_imp['imps'].values, height=0.7)
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(feature_imp['cols'].values)
ax1.set_ylim((0, len(feature_imp)))
ax1.set_title('Forest feature importances')

# Right panel: one box per feature, drawn from that feature's individual
# permutation score drops. Passing the (std, mean) summary table would give
# two boxes for thirty labels, which boxplot rejects.
perm_samples = [scores[name] for name in feature_perm.index]
ax2.boxplot(perm_samples, vert=False,labels=feature_perm.index.values)
ax2.set_title('Permutation importances')
fig.tight_layout()
plt.show()

# Sélection des variables à partir des méthodes précédentes

**1. Elimination préliminaire dans les résultats de la première méthode**

In [None]:
# Importance table, most important first; reset_index keeps the former row labels in an 'index' column
feature_imp = feat_imp.sort_values(by=['imps'], ascending=False).reset_index()
feature_imp


In [None]:
# Importance curve of the 100 most important features
feature_imp = (
    feat_imp.sort_values(by=['imps'], ascending=False)
    .reset_index()
    .iloc[0:100]
    .reset_index()
)

plt.plot(feature_imp.index.values, feature_imp['imps'])

In [None]:
# Importance curve of the 30 most important features; feature_imp is reused by later cells
feature_imp = (
    feat_imp.sort_values(by=['imps'], ascending=False)
    .reset_index()
    .iloc[0:30]
    .reset_index()
)

plt.plot(feature_imp.index.values, feature_imp['imps'])

On retient les 30 premières variables

**2. Elimination préliminaire dans les résultats de la deuxième méthode**

On commence par rechercher le minimum où la courbe se stabilise :

In [None]:
# Mean score drop of the 8 features with the largest mean drop
feature_perm = (
    df_perm.sort_values(by=['mean'], ascending=False)
    .reset_index()
    .iloc[0:8]
    .reset_index()
)

plt.plot(feature_perm.index.values, feature_perm['mean'])

On affiche maintenant l'écart type de chaque variable :

In [None]:
# Standard deviation of the score drop for the same 8 features.
# After the two reset_index calls, 'index' holds the feature names and 'level_0' the rank.
feature_perm = (
    df_perm.sort_values(by=['mean'], ascending=False)
    .reset_index()
    .iloc[0:8]
    .reset_index()
)

plt.plot(feature_perm.index.values, feature_perm['std'])

On fit cette courbe avec un modèle CART (Classification and Regression Trees) :

In [None]:
from sklearn.tree import DecisionTreeRegressor

# The default criterion is the squared error. Leaving it implicit works on every
# scikit-learn release, whereas the "mse" spelling was removed from recent ones.
regr = DecisionTreeRegressor()
# Single explanatory variable: the rank of each feature, as a column vector
rangs = np.reshape(np.array(feature_perm.index.values),(-1,1))
regr.fit(rangs,feature_perm['std'].values)
# NOTE(review): the tree has no depth or leaf-size limit and every rank is distinct,
# so the fitted curve presumably reproduces the standard deviations exactly; confirm this is intended.
y_reg = regr.predict(rangs)

In [None]:
import plotly.graph_objects as go

# Observed standard deviations (blue) against the tree's fitted values (red)
ranks = feature_perm.index.values
fig = go.Figure()
for series, colour in ((feature_perm['std'], 'blue'), (y_reg, 'red')):
    fig.add_trace(go.Scatter(x=ranks, y=series, line=dict(color=colour, width=1)))

fig.update_xaxes(rangeslider_visible=True)
yaxis = {'autorange': True, 'fixedrange': False}
fig.update_yaxes(yaxis)
fig.show()

In [None]:
# Ranks whose standard deviation is at least the value fitted by the tree
std_values = feature_perm['std'].values
index = [
    i
    for i in feature_perm.index.values.astype(np.int32)
    if std_values[i] >= y_reg[i]
]

In [None]:
index

In [None]:
# Rows retained above, without the helper rank column; 'index' holds the feature names
variables_selected = feature_perm.iloc[index].drop(columns='level_0')
variables_selected['index'].values

**Fusion des valeurs**

In [None]:
df_reduit1 = X[feature_imp['cols'].values]
df_reduit1

In [None]:
df_reduit2 = X[variables_selected['index'].values]
df_reduit2

In [None]:
# Column-wise union of the two selections. Both frames are column subsets of the
# same feature matrix, so they are placed side by side on their shared row index;
# a key-less merge would instead join rows on the values of the shared columns.
df_merged = pd.concat([df_reduit2, df_reduit1], axis=1)
# A feature picked by both methods appears twice after the concat: keep its first occurrence
df_merged = df_merged.loc[:, ~df_merged.columns.duplicated()]
df_merged

**Elimination des variables corrélées**

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

#function for removing features with high vif
def drop_high_vif(X, thresh=100, with_constant=False):
    """Iteratively removes the column with the largest variance inflation factor.

    At each pass the VIF of every remaining column is computed, and the column
    with the largest VIF is removed if that VIF exceeds `thresh`. The loop
    stops when no VIF exceeds the threshold or fewer than two columns remain.

    Parameters
    ----------
    X : DataFrame of candidate features.
    thresh : a column is dropped only while the largest VIF is above this value.
    with_constant : when True, an intercept column is added to the design matrix
        before computing the VIFs. The default (False) keeps the historical
        behaviour of computing them on the raw columns only.

    Returns
    -------
    DataFrame restricted to the columns that were kept.
    """
    cols = X.columns
    variables = np.arange(X.shape[1])
    # A VIF needs at least one other column to regress on
    while len(variables) > 1:
        design = X[cols[variables]]
        offset = 0
        if with_constant:
            # has_constant='add' always prepends the intercept, so feature k sits at position k + 1
            design = add_constant(design, has_constant='add')
            offset = 1
        c = design.values
        vif = [variance_inflation_factor(c, ix + offset) for ix in range(len(variables))]

        worst = max(vif)
        if not worst > thresh:
            break
        maxloc = vif.index(worst)
        # f-string formatting also works when the column labels are not strings
        print(f"dropping '{cols[variables[maxloc]]}' at index: {maxloc}")
        variables = np.delete(variables, maxloc)

    print('Remaining variables:')
    print(X.columns[variables])
    return X[cols[variables]]

#function for listing vif values
def vif_values(X, with_constant=False):
    """Returns the variance inflation factor of every column of X as a Series.

    Parameters
    ----------
    X : DataFrame of features.
    with_constant : when True, an intercept column is added to the design matrix
        before computing the VIFs (the intercept itself is not reported). The
        default (False) computes them on the raw columns only, which is what
        this function effectively did before, since the result of add_constant
        was never used.

    Returns
    -------
    Series indexed by the column names of X.
    """
    if with_constant:
        # has_constant='add' always prepends the intercept, so feature i sits at position i + 1
        design = add_constant(X, has_constant='add').values
        offset = 1
    else:
        design = X.values
        offset = 0
    df=pd.Series([variance_inflation_factor(design, i + offset) for i in range(X.shape[1])], index=X.columns)
    return df

In [None]:
variables_selected_reduit = drop_high_vif(df_merged,thresh=10)
variables_selected_reduit

# Sélection des variables par méthode RFE

In [None]:
from sklearn.feature_selection import RFE

# Dataset dimensions, read from the feature matrix (the previous literals did not describe this dataset)
n, p = X.shape        # n: number of observations, p: number of features
n_arbres = 25
m_try = 768

# max_features is given as a fraction of the available features: RFE refits the
# forest on ever smaller feature sets, and older scikit-learn releases reject an
# integer max_features larger than the number of features left.
clf = RandomForestRegressor(n_estimators=n_arbres, bootstrap=True, oob_score=True, max_features=min(1.0, m_try / p), n_jobs=-1, random_state=7)
rfe = RFE(estimator=clf, n_features_to_select=10, step=1, verbose=1)
rfe.fit(X, np.ravel(y))

# Sélection des variables par méthode RFE-CV

In [None]:
from sklearn.feature_selection import RFECV

# Dataset dimensions, read from the feature matrix (the previous literals did not describe this dataset)
n, p = X.shape        # n: number of observations, p: number of features
n_arbres = 100
m_try = 768

# max_features=1.0 means "all features", which is what "auto" meant for a regressor;
# the "auto" spelling was removed from recent scikit-learn releases.
clf = RandomForestRegressor(n_estimators=n_arbres, bootstrap=True, oob_score=True, max_features=1.0, n_jobs=-1, random_state=7)
rfecv = RFECV(estimator=clf, step=1, cv=5, scoring='neg_mean_absolute_error',min_features_to_select=1, verbose=1)
rfecv.fit(X, np.ravel(y))

# Autre

In [None]:
# For every (technical indicator, period) pair, keep one representative column
features_list=[]
indicateurs_techniques=['sma','ema','wma','trix', 'std','skew','rsi','roc']
periode=['3','7','14','30','90']
target = np.ravel(y)

for indicateur in indicateurs_techniques:
    for i in periode:
      print("%s%s" %(indicateur, i))
      # Regex matching the columns whose name ends with e.g. "sma14"
      filtre = str(indicateur) + str(i) + "$"

      group_columns = df_data.filter(regex=filtre,axis=1).columns
      if len(group_columns) == 0:
        print(filtre + ': no matching column, skipped')
        continue

      # A local frame is used so the notebook-wide feature matrix X is left untouched
      X_group = pd.DataFrame(
          SimpleImputer(missing_values=np.nan,strategy='most_frequent').fit_transform(df_data[group_columns]),
          columns=group_columns)

      rf1 = RandomForestRegressor(random_state=7,n_jobs=-1)
      rfecv = RFECV(rf1,step=0.9,min_features_to_select=1,verbose=1,cv=5,scoring='neg_mean_absolute_error', n_jobs=1)
      rfecv.fit(X_group,target)

      if rfecv.n_features_ > 1:
        # Several columns survive: keep the single most important one
        # (the last one in column order if several share the maximum)
        rf1.fit(X_group,target)
        importances = rf1.feature_importances_
        new_features = X_group.columns[np.flatnonzero(importances == importances.max())[-1]]
      else:
        # At most one column survives: take it from the RFECV support mask
        new_features = X_group.columns[rfecv.get_support()]
      features_list.append(str(new_features))
      print(filtre+ ': ' + new_features)

In [None]:
features_list

In [None]:
# features_list mixes two kinds of strings: plain column names, and the text
# form of a one-element pandas Index such as "Index(['name'], dtype='object')".
# Each entry is resolved exactly once: the quoted name is extracted when the
# entry is an Index repr, otherwise the entry is already the column name.
# (Telling the two apart by string length dropped long names and kept short Index reprs.)
l1=[]

for entry in features_list:
    result1 = re.search("'(.*)'],", entry)
    l1.append(result1.group(1) if result1 is not None else entry)

In [None]:
l1.sort()
l1

In [None]:
df_data_reduit=df_data[l1]
df_data_reduit.head()

Suppression des valeurs manquantes (remplacées par la valeur la plus fréquente de chaque colonne) :

In [None]:
# Replace missing values by each column's most frequent value. The imputer
# returns a bare array, so the frame is rebuilt from that array (not from the
# unrelated variable X) with the selected column names and the original row index.
selected_columns = df_data_reduit.columns
row_index = df_data_reduit.index
imputed = SimpleImputer(missing_values=np.nan,strategy='most_frequent').fit_transform(df_data_reduit)
df_data_reduit = pd.DataFrame(imputed, columns=selected_columns, index=row_index)
df_data_reduit

# Identification des variables restantes colinéaires

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

#function for removing features with high vif
def drop_high_vif(X, thresh=100, with_constant=False):
    """Iteratively removes the column with the largest variance inflation factor.

    At each pass the VIF of every remaining column is computed, and the column
    with the largest VIF is removed if that VIF exceeds `thresh`. The loop
    stops when no VIF exceeds the threshold or fewer than two columns remain.

    Parameters
    ----------
    X : DataFrame of candidate features.
    thresh : a column is dropped only while the largest VIF is above this value.
    with_constant : when True, an intercept column is added to the design matrix
        before computing the VIFs. The default (False) keeps the historical
        behaviour of computing them on the raw columns only.

    Returns
    -------
    DataFrame restricted to the columns that were kept.
    """
    cols = X.columns
    variables = np.arange(X.shape[1])
    # A VIF needs at least one other column to regress on
    while len(variables) > 1:
        design = X[cols[variables]]
        offset = 0
        if with_constant:
            # has_constant='add' always prepends the intercept, so feature k sits at position k + 1
            design = add_constant(design, has_constant='add')
            offset = 1
        c = design.values
        vif = [variance_inflation_factor(c, ix + offset) for ix in range(len(variables))]

        worst = max(vif)
        if not worst > thresh:
            break
        maxloc = vif.index(worst)
        # f-string formatting also works when the column labels are not strings
        print(f"dropping '{cols[variables[maxloc]]}' at index: {maxloc}")
        variables = np.delete(variables, maxloc)

    print('Remaining variables:')
    print(X.columns[variables])
    return X[cols[variables]]

#function for listing vif values
def vif_values(X, with_constant=False):
    """Returns the variance inflation factor of every column of X as a Series.

    Parameters
    ----------
    X : DataFrame of features.
    with_constant : when True, an intercept column is added to the design matrix
        before computing the VIFs (the intercept itself is not reported). The
        default (False) computes them on the raw columns only, which is what
        this function effectively did before, since the result of add_constant
        was never used.

    Returns
    -------
    Series indexed by the column names of X.
    """
    if with_constant:
        # has_constant='add' always prepends the intercept, so feature i sits at position i + 1
        design = add_constant(X, has_constant='add').values
        offset = 1
    else:
        design = X.values
        offset = 0
    df=pd.Series([variance_inflation_factor(design, i + offset) for i in range(X.shape[1])], index=X.columns)
    return df


In [None]:
df_data_reduit = drop_high_vif(df_data_reduit,thresh=5)
df_data_reduit

In [None]:
vif_values(df_data_reduit)

In [None]:
Raw=drop_high_vif(X_raw,thresh=10)

In [None]:
vif_values(Raw)

In [None]:
# Put the 'Dates' column of df_data in first position of both frames (values are matched on the row index).
# NOTE: DataFrame.insert refuses to add a column that already exists, so this cell cannot be run twice in a row.
Raw.insert(0,'Dates',df_data['Dates'])
df_data_reduit.insert(0,'Dates',df_data['Dates'])

In [None]:
# Outer merge of the two frames on the columns they share, then removal of
# fully duplicated rows. The de-duplicated frame is stored: calling
# drop_duplicates() without keeping its result leaves df_merged unchanged.
df_merged = Raw.merge(df_data_reduit,how='outer').drop_duplicates()
df_merged

In [None]:
df_merged.drop(columns='Dates',inplace=True)

In [None]:
df_data_reduit=drop_high_vif(df_merged,thresh=10)
df_data_reduit

In [None]:
vif_values(df_data_reduit)

In [None]:
# Fit a seeded random forest on the reduced feature set; y is flattened to one dimension for fit
rf_final=RandomForestRegressor(random_state=7,n_jobs=-1)
rf_final.fit(df_data_reduit,tf.squeeze(np.asarray(y),1))

In [None]:
# function for creating a feature importance dataframe
def feature_importance(column_names, importances):
    """Builds a table of features ranked by decreasing importance.

    Parameters
    ----------
    column_names : feature names.
    importances : importance values, in the same order as column_names.

    Returns
    -------
    DataFrame with columns 'feature' and 'feature_importance', sorted from the
    most to the least important and re-indexed from 0.
    """
    table = pd.DataFrame({'feature': column_names, 'feature_importance': importances})
    ranked = table.sort_values('feature_importance', ascending = False)
    return ranked.reset_index(drop = True)

In [None]:
# plotting a feature importance dataframe (horizontal barchart)
def plot_feature_importance(imp_df, title):
    """Draws a horizontal bar chart of feature importances.

    Parameters
    ----------
    imp_df : two-column DataFrame, feature names first and importance values
        second. The columns are relabelled by position on a copy, so the
        caller's frame is not modified.
    title : title of the chart.

    Side effect: sets the global matplotlib default figure size to 11.7 x 8.27 inches.
    """
    # figure size in inches
    plt.rcParams['figure.figsize'] = 11.7,8.27
    # Relabel a copy so the caller's DataFrame keeps its own column names
    plot_df = imp_df.copy()
    plot_df.columns = ['feature', 'feature_importance']
    sns.barplot(x = 'feature_importance', y = 'feature', data = plot_df, color = 'royalblue').set_title(title, fontsize = 20)

In [None]:
imp_feat=feature_importance(df_data_reduit.columns,rf_final.feature_importances_)
plot_feature_importance(imp_feat,'feature importance')

In [None]:
imp_feat[imp_feat['feature_importance']>0]

In [None]:
selected = imp_feat[imp_feat['feature_importance']>0]
selected

In [None]:
selected.feature_importance.sum()

In [None]:
# Keep the features with non-zero importance, columns in alphabetical order, and append the target
df_data_reduit_high = df_data_reduit[np.asarray(selected.feature)].sort_index(axis=1)
df_data_reduit_high['Price'] = np.ravel(y)
# Drop the rows whose price is exactly 1.
# NOTE(review): 1 is presumably a placeholder for a missing price; confirm against the data source.
# .copy() makes the filtered result an independent frame, so the later cells that
# add and drop columns on it do not trigger SettingWithCopyWarning.
df_data_reduit_high = df_data_reduit_high[df_data_reduit_high['Price']!=1].copy()
df_data_reduit_high

In [None]:
corr = df_data_reduit_high.corr()

# Generate a mask for the upper triangle.
# The builtin bool is used as dtype: the np.bool alias no longer exists in recent NumPy.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(15, 8))

# Only the lower triangle is drawn, each cell annotated with its coefficient
sns.heatmap(corr,mask=mask, cmap='coolwarm',annot=True, fmt='.2f')

In [None]:
from sklearn.ensemble import IsolationForest

# Seeded so that the set of flagged prices is the same from one run to the next
anomalies=IsolationForest(contamination=0.1, random_state=7)

# The forest expects a 2-D input: one column holding the price
price=df_data_reduit_high['Price'].values.reshape(-1,1)
anomalies.fit(price)

pred=anomalies.predict(price)

# predict returns -1 for outliers and 1 for inliers; store 1 for an anomaly, 0 otherwise
df_data_reduit_high['Anomalies'] = np.where(pred == -1, 1, 0)

# Show how many rows fall in each class
print(df_data_reduit_high['Anomalies'].value_counts())

In [None]:
import plotly.express as px

# Price series
fig = px.line(x=df_data_reduit_high.index,y=df_data_reduit_high['Price'],title="Evolution du prix du BTC")

# Overlay the flagged rows only, at their actual price. Colouring a scatter by the
# flag and keeping its first trace retains just one of the two groups, and the
# np.bool alias used for that colouring no longer exists in recent NumPy.
flagged = df_data_reduit_high[df_data_reduit_high['Anomalies'].astype(bool)]
fig.add_scatter(x=flagged.index, y=flagged['Price'], mode='markers', name='Anomalies')

fig.update_xaxes(rangeslider_visible=True)
yaxis=dict(autorange = True,fixedrange= False)
fig.update_yaxes(yaxis)
fig.show()

In [None]:
#df_data_reduit_high.reset_index(drop=True,inplace=True)
# Remove the temporary 'Anomalies' flag column, modifying the frame in place
df_data_reduit_high.drop(columns=['Anomalies'],inplace=True)

In [None]:
# Put the dates back as first column (values are matched on the row index).
# 'Dates' stays a regular column and the frame keeps its existing row index:
# a set_index(...) call whose result is not stored changes nothing, so none is made here.
df_data_reduit_high.insert(0,'Dates',df_data['Dates'])
df_data_reduit_high

In [None]:
df_data_reduit_high.columns

In [None]:
df_data_reduit_high.to_csv('reg_1d.csv')