# Imports

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor

In [29]:
# Seed passed to every train_test_split call so the splits are reproducible.
random_state = 1000
# Number of leading rows used for the main experiments; change this to adjust the dataset size partition.
data_size = 100_000
# Raw dataset, read from the current working directory.
df = pd.read_csv("crypto.csv")

# Data Preprocessing

Note for the write-up: we selected both datasets because on small datasets we got very good results immediately an…

In [30]:
# Remove incomplete rows before any modelling; run df.isna().sum() beforehand to see the per-column counts.
df = df.dropna()

In [None]:
# df.head() # We can see data is already sorted based on date so we can drop the date column.

In [None]:
# df.info()

In [31]:
# 'time' and 'id' are not used as model inputs, so remove both columns.
df = df.drop(columns=['time', 'id'])

**Plotting the correlation map**

In [None]:
# Correlation heatmap over every remaining column of df.
plt.figure(figsize=(16, 8))
sns.set(style="whitegrid")
corr = df.corr()  # pairwise correlations between the remaining columns
sns.heatmap(corr,annot=True, cmap="YlGnBu")  # annot=True writes each coefficient inside its cell
# Author's reading of the plot: almost no correlation between amount and closing price.

**Scaling the data and creating datasets**

In [32]:
# Standardise every column of df, including the 'close' target that is split off later.
# NOTE(review): the scaler is fitted on the full frame before the train/test split, so
# test-set statistics leak into training; consider fitting it on the training rows only.
scale = StandardScaler()
df_sc = pd.DataFrame(scale.fit_transform(df), columns=df.columns)

In [18]:
# Target: the standardised 'close' column; features: every remaining column.
y = df_sc['close']
X = df_sc.drop(columns='close')
# Hold out 40% of the first `data_size` rows for testing; the other 60% is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X.iloc[:data_size], y.iloc[:data_size],
    test_size=0.4, random_state=random_state,
)

# Regression with SVR

In [None]:
# Baseline SVR with an RBF kernel and C=1; every other hyper-parameter keeps its scikit-learn default.
SVM_regression = SVR(C=1, kernel='rbf')

In [None]:
SVM_regression.fit(X_train, y_train) # Fit the baseline SVR on the training split.

In [None]:
y_pred = SVM_regression.predict(X_test) # Baseline predictions for the held-out test split.

In [None]:
# Side-by-side table of actual vs. predicted targets, indexed like y_test.
predictions = y_test.to_frame(name='y_test').assign(y_pred=y_pred)

**Evaluating the model**

In [None]:
# Derive RMSE from the unrounded MSE: taking the square root of an MSE that was already
# rounded to 2 decimals (e.g. 0.004 -> 0.0) distorts the RMSE or collapses it to 0.
MSE_test = np.mean(np.square(y_test - y_pred))
RMSE_test = round(np.sqrt(MSE_test), 2)
MSE_test = round(MSE_test, 2)  # keep the rounded MSE available under its original name
# Prints the RMSE followed by the estimator's score on the test split.
print(RMSE_test, SVM_regression.score(X_test, y_test))

In [None]:
# Preview the first rows of the actual-vs-predicted table.
predictions.head()

# Grid Search and Optimal Model

In [None]:
# Hyper-parameter grid for the SVR search: 3 x 4 x 4 x 3 = 144 candidate combinations.
param_grid = {
    'C': [1, 10, 100],
    'gamma': [10, 1, 0.1, 0.01],
    'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
    'degree': [3, 5, 9],
}

In [None]:
# Exhaustive search over param_grid with 2-fold cross-validation. refit=True retrains the
# best configuration once the search finishes; verbose=3 makes the search print its progress.
grid = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid,
    cv=2,
    refit=True,
    verbose=3,
)

In [None]:
# Run the grid search on the training split.
grid.fit(X_train,y_train)

In [None]:
# Best hyper-parameter combination found by the search.
grid.best_params_

{'C': 1, 'degree': 3, 'gamma': 10, 'kernel': 'rbf'}

In [None]:
# Test-split predictions from the grid search (uses the refitted best estimator, since refit=True).
y_pred_best = grid.predict(X_test)

In [None]:
# Derive RMSE from the unrounded MSE: taking the square root of an MSE that was already
# rounded to 2 decimals (e.g. 0.004 -> 0.0) distorts the RMSE or collapses it to 0.
MSE_test = np.mean(np.square(y_test - y_pred_best))
RMSE_test = round(np.sqrt(MSE_test), 2)
MSE_test = round(MSE_test, 2)  # keep the rounded MSE available under its original name
# Prints the RMSE followed by the best estimator's score on the test split.
print(RMSE_test, grid.best_estimator_.score(X_test, y_test))

# Optimal Model

In [20]:
# Final SVR configuration. gamma is set to 0.1 here, whereas the grid-search output
# shown earlier reports gamma=10; the original author's note hints that 0.1 gives a
# markedly different result. NOTE(review): confirm which value is intended.
optimal = SVR(C=1, gamma = 0.1, kernel = 'rbf')

In [None]:
# Fit the final SVR on the training split.
optimal.fit(X_train, y_train)

In [22]:
# Predictions of the final SVR on the held-out test split.
y_optimal = optimal.predict(X_test)

In [None]:
# Derive RMSE from the unrounded MSE so rounding the MSE first cannot distort the root.
MSE_opt = np.mean(np.square(y_test - y_optimal))
RMSE_opt = round(np.sqrt(MSE_opt), 4)
MSE_opt = round(MSE_opt, 4)  # keep the rounded MSE available under its original name
# Prints the RMSE followed by the final model's score on the test split.
print(RMSE_opt, optimal.score(X_test, y_test))

**Saving the trained model**

In [None]:
import joblib
# Persist the fitted model to the working directory. Reload it with
# joblib.load('CryptoSVR.joblib'); only load joblib/pickle files from sources you trust.
joblib.dump(optimal, 'CryptoSVR.joblib')

**Training the model on increasing amounts of data**

In [None]:
# Row counts of the progressively larger subsets used to retrain the model.
partition = (1_000, 10_000, 100_000, 250_000, 500_000)

In [None]:
 # Retrain the model on progressively larger prefixes of the data and report the test error per size.
 # Note: this reassigns X_train/X_test/y_train/y_test and refits `optimal` on every iteration.
 for i in partition:
     X_train, X_test, y_train, y_test = train_test_split(X[:i], y[:i], test_size=0.4, random_state=random_state)
     optimal.fit(X_train, y_train)
     y_optimal = optimal.predict(X_test)
     MSE_opt = np.mean(np.square(y_test - y_optimal))
     # The printed label says RMSE, so report the square root (taken from the unrounded MSE)
     # instead of the MSE itself.
     RMSE_opt = round(np.sqrt(MSE_opt), 4)
     MSE_opt = round(MSE_opt, 4)
     print("Number of samples: ", i, "RMSE: ", RMSE_opt, "R2 score" , optimal.score(X_test, y_test))

# Plotting the learning curve

In [33]:
# Rebuild the data_size-row split (60% train / 40% test): the partition loop above
# reassigned X_train, X_test, y_train and y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X.iloc[:data_size], y.iloc[:data_size],
    test_size=0.4, random_state=random_state,
)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation scores as a function of training-set size.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Model handed to ``learning_curve``.
    X, y : array-like
        Features and targets handed to ``learning_curve``.
    ylim : tuple of (low, high), optional
        Y-axis limits; when None the axis is left to autoscale.
    cv, n_jobs : optional
        Forwarded unchanged to ``learning_curve``; None selects its defaults.
    train_sizes : array-like
        Training-set sizes forwarded to ``learning_curve``.

    Returns
    -------
    module
        ``matplotlib.pyplot``, with the new figure as the current figure, so the
        caller can call ``.show()`` on the result.
    """
    # Explicit axes instead of the implicit pyplot state; the figure still becomes
    # pyplot's current figure, so a later plt.show() displays it.
    _, ax = plt.subplots()
    ax.set_title("Learning Curve")
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    ax.grid()

    # One (mean, std, colour, label) entry per curve; statistics are taken across axis 1
    # of the score arrays returned by learning_curve.
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (train_scores, "r", "Training score"),
            (test_scores, "g", "Cross-validation score"),
        )
    ]

    # Shaded +/- one standard deviation bands first, then the mean lines on top.
    for mean, std, colour, _ in curves:
        ax.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        ax.plot(train_sizes, mean, 'o-', color=colour, label=label)

    ax.legend(loc="best")
    return plt

# Draw the learning curve for a k-nearest-neighbours regressor on the current training split.
# A fresh KNeighborsRegressor() is passed instead of the `KNN` variable: `KNN` is only created
# in a later cell, so referencing it here raises NameError on a top-to-bottom run.
# learning_curve clones the estimator it receives, so an unfitted instance is all it needs.
# ylim=(0.8, 1) zooms the score axis (R^2 can range from -inf to 1); cv=2 folds; n_jobs=-1 runs in parallel.
negative_infinity =  float('-inf')  # kept from the original cell; not referenced elsewhere in the notebook as shown
plot_learning_curve(KNeighborsRegressor(), X_train, y_train, ylim=(0.8, 1), cv=2, n_jobs=-1)
plt.show()


# Compare with KNN

In [34]:
# k-nearest-neighbours regressor with scikit-learn's default settings, used as a comparison model.
KNN = KNeighborsRegressor()

In [None]:
# Fit the KNN regressor on the training split.
KNN.fit(X_train, y_train)

In [36]:
# KNN predictions for the held-out test split.
y_knn = KNN.predict(X_test)

In [None]:
# Report RMSE (not MSE) so the KNN figure is directly comparable with the RMSE printed
# for the SVR models; the root is taken from the unrounded MSE.
MSE_opt_knn = np.mean(np.square(y_test - y_knn))
RMSE_opt_knn = round(np.sqrt(MSE_opt_knn), 4)
MSE_opt_knn = round(MSE_opt_knn, 4)  # keep the rounded MSE available under its original name
# Prints the RMSE followed by the KNN model's score on the test split.
print(RMSE_opt_knn, KNN.score(X_test, y_test))