# Bitcoin Price Range History 20250901

This data is in format csv and has around 2 million rows. This dataset has columns as such

1. Start : Date (Indicates the start date of the data record) (Format YYYY-MM-DD) (NUMERIC)

2. End : Date (Indicates the end date of the data record) (Format YYYY-MM-DD) (NUMERIC)

3. Open : Number (The price at which Bitcoin started trading at the beginning of the day.) (NUMERIC)

4. High : Number (The highest price point reached by Bitcoin during the day.) (NUMERIC)

5. Low : Number (The lowest price point reached by Bitcoin during the day.) (NUMERIC)

6. Close : Number (The price at which Bitcoin ended trading at the close of the day.) (NUMERIC)

7. Volume : Number (Total volume of Bitcoin traded during the day.) 

8. Market Cap : Number (The total market value of Bitcoin at the end of the day.) 

In [None]:
# Imports
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from reinforcement.sarsa import SARSAWumpus
from reinforcement.qLearning import QLearningWumpus
from supervisedLearning.ann import ArtificialNeuralNetwork
from supervisedLearning.regressionTree import RegressionTree
from supervisedLearning.knn import KNearestNeigbor
from supervisedLearning.regression import PolynomialRegression
from supervisedLearning.svm import SupportVectorMachine
from unsupervisedLearning.dbscan import DBScan
from unsupervisedLearning.kMeans import KMeans
from unsupervisedLearning.pca import PCA

In [None]:
# Load the two CSV datasets used by the rest of the notebook
dataset1 = "../data/bitcoin_2010-07-17_2024-06-28.csv"
dataset2 = "../data/iris.csv"

df1, df2 = (pd.read_csv(csv_path) for csv_path in (dataset1, dataset2))

# Exploratory Data Analysis

In [None]:
# Number of non-null values in each column
print(df1.count())

## Mean

In [None]:
# Mean of the day, month and year components of each date column
for date_col in ("Start", "End"):
    parsed_dates = pd.to_datetime(df1[date_col], format="%Y-%m-%d")
    for component in (parsed_dates.dt.day, parsed_dates.dt.month, parsed_dates.dt.year):
        print(component.mean())

# Mean of every price / volume column
for value_col in ("Open", "High", "Low", "Close", "Volume", "Market Cap"):
    print(df1[value_col].mean())



## Standard Deviation

In [None]:
# Standard deviation of the day, month and year components of each date column
for date_col in ("Start", "End"):
    parsed_dates = pd.to_datetime(df1[date_col], format="%Y-%m-%d")
    for component in (parsed_dates.dt.day, parsed_dates.dt.month, parsed_dates.dt.year):
        print(component.std())

# Standard deviation of every price / volume column
for value_col in ("Open", "High", "Low", "Close", "Volume", "Market Cap"):
    print(df1[value_col].std())

## Minimum

In [None]:
# Minimum of the day, month and year components of each date column.
# The same three components are used as in the Mean / Standard Deviation cells:
# the dates carry no time of day, so hour/minute/second are always 0, and
# .dt.date yields whole date objects rather than the day number.
for date_col in ("Start", "End"):
    parsed_dates = pd.to_datetime(df1[date_col], format="%Y-%m-%d")
    for component in (parsed_dates.dt.day, parsed_dates.dt.month, parsed_dates.dt.year):
        print(component.min())

# Minimum of every price / volume column
for value_col in ("Open", "High", "Low", "Close", "Volume", "Market Cap"):
    print(df1[value_col].min())

## Maximum

In [None]:
# Maximum of the day, month and year components of each date column.
# The same three components are used as in the Mean / Standard Deviation cells,
# and every statistic here is a max (the End year was previously a min).
for date_col in ("Start", "End"):
    parsed_dates = pd.to_datetime(df1[date_col], format="%Y-%m-%d")
    for component in (parsed_dates.dt.day, parsed_dates.dt.month, parsed_dates.dt.year):
        print(component.max())

# Maximum of every price / volume column
for value_col in ("Open", "High", "Low", "Close", "Volume", "Market Cap"):
    print(df1[value_col].max())

## Quantile

In [None]:
# Quartiles (25%, 50%, 75%) of the day, month and year components of each
# date column. Numeric components are used throughout: quantiles cannot be
# interpolated on the object column of date values that .dt.date returns.
quartiles = [0.25, 0.5, 0.75]

for date_col in ("Start", "End"):
    parsed_dates = pd.to_datetime(df1[date_col], format="%Y-%m-%d")
    for component in (parsed_dates.dt.day, parsed_dates.dt.month, parsed_dates.dt.year):
        print(component.quantile(quartiles))

# Quartiles of every price / volume column
for value_col in ("Open", "High", "Low", "Close", "Volume", "Market Cap"):
    print(df1[value_col].quantile(quartiles))

# Data distribution check

## Numeric

In [None]:
# Distribution check: every series gets a histogram (with KDE overlay)
# followed by a box plot.

# Date columns: year, month and day-of-month components.
# The day number (.dt.day) is plotted rather than .dt.date, which yields whole
# date objects instead of a numeric component.
for date_col, n_bins in (("Start", 24), ("End", 60)):
    parsed_dates = pd.to_datetime(df1[date_col], format="%Y-%m-%d", errors="coerce")
    for component in (parsed_dates.dt.year, parsed_dates.dt.month, parsed_dates.dt.day):
        sns.histplot(component, bins=n_bins, kde=True)
        plt.show()

        sns.boxplot(x=component)
        plt.show()

# Price / volume columns
for value_col in ("Open", "High", "Low", "Close", "Volume", "Market Cap"):
    sns.histplot(df1[value_col], bins=60, kde=True)
    plt.show()

    sns.boxplot(x=df1[value_col])
    plt.show()


## Outlier Check (Z-value check)

## Numerical

### Start

In [None]:
# Parse the Start column a single time and cache it as a datetime column
df1["Start_dt"] = pd.to_datetime(df1["Start"], format='%Y-%m-%d', errors='coerce')
start_parts = df1["Start_dt"].dt

# Year: rows whose |z-score| exceeds 3
year = start_parts.year
mean_year, std_year = year.mean(), year.std()
z_score_year = year.sub(mean_year).div(std_year)
outliers_year = df1.loc[z_score_year.abs() > 3]
print(f"Year outliers count: {len(outliers_year)}")

# Month: rows whose |z-score| exceeds 3
month = start_parts.month
mean_month, std_month = month.mean(), month.std()
z_score_month = month.sub(mean_month).div(std_month)
outliers_month = df1.loc[z_score_month.abs() > 3]
print(f"Month outliers count: {len(outliers_month)}")

# Day of month: rows whose |z-score| exceeds 3
day = start_parts.day
mean_day, std_day = day.mean(), day.std()
z_score_day = day.sub(mean_day).div(std_day)
outliers_day = df1.loc[z_score_day.abs() > 3]
print(f"Day outliers count: {len(outliers_day)}")

### End

In [None]:
# Z-score outlier check on the year, month and day components of End.
# The column is parsed once and each component is checked in turn.
end_dates = pd.to_datetime(df1["End"], format='%Y-%m-%d', errors='coerce')

for component in (end_dates.dt.year, end_dates.dt.month, end_dates.dt.day):
    mean = component.mean()
    std = component.std()

    z_score = (component - mean) / std
    outliers = df1[z_score.abs() > 3]
    print(f"Outliers count {len(outliers)}")

### Open

In [None]:
# Count "Open" values lying more than 3 standard deviations from the mean
open_col = df1["Open"]
mean, std = open_col.mean(), open_col.std()

z_score = open_col.sub(mean).div(std)
outliers = df1.loc[z_score.abs() > 3]
print(f"Outliers count {len(outliers)}")


### High

In [None]:
# Count "High" values lying more than 3 standard deviations from the mean
high_col = df1["High"]
mean, std = high_col.mean(), high_col.std()

z_score = high_col.sub(mean).div(std)
outliers = df1.loc[z_score.abs() > 3]
print(f"Outliers count {len(outliers)}")


### Low

In [None]:
# Count "Low" values lying more than 3 standard deviations from the mean
low_col = df1["Low"]
mean, std = low_col.mean(), low_col.std()

z_score = low_col.sub(mean).div(std)
outliers = df1.loc[z_score.abs() > 3]
print(f"Outliers count {len(outliers)}")


### Close

In [None]:
# Count "Close" values lying more than 3 standard deviations from the mean
close_col = df1["Close"]
mean, std = close_col.mean(), close_col.std()

z_score = close_col.sub(mean).div(std)
outliers = df1.loc[z_score.abs() > 3]
print(f"Outliers count {len(outliers)}")


### Volume

In [None]:
# Count "Volume" values lying more than 3 standard deviations from the mean
volume_col = df1["Volume"]
mean, std = volume_col.mean(), volume_col.std()

z_score = volume_col.sub(mean).div(std)
outliers = df1.loc[z_score.abs() > 3]
print(f"Outliers count {len(outliers)}")


### Market Cap

In [None]:
# Count "Market Cap" values lying more than 3 standard deviations from the mean
market_cap_col = df1["Market Cap"]
mean, std = market_cap_col.mean(), market_cap_col.std()

z_score = market_cap_col.sub(mean).div(std)
outliers = df1.loc[z_score.abs() > 3]
print(f"Outliers count {len(outliers)}")


# Data preprocessing
This stage includes data cleaning (handling missing values, removing duplicates, correcting errors and inconsistencies), data transformation (normalization, standardization, categorical data encoding),
feature selection, and dimensionality reduction. Missing values are handled for categorical values first: if a null is found, it is replaced with the mode of that feature.
If an outlier is found, it is removed from the dataset. For numerical values, every null value is replaced with the mean value of that feature. If an outlier is found,
it is removed from the dataset.

## Data Cleaning

### Start

In [None]:
# Replace "(null)" placeholders in Start with a date built from the mean
# year / month / day of the valid rows.
# The replacement is written as YYYY-MM-DD so that it parses with the
# "%Y-%m-%d" format this column is read with everywhere else; a MM-DD-YYYY
# string would be coerced to NaT by those later parses.
start_dates = pd.to_datetime(df1["Start"], format="%Y-%m-%d", errors="coerce")
year_mean = int(start_dates.dt.year.mean())
month_mean = int(start_dates.dt.month.mean())
date_mean = int(start_dates.dt.day.mean())
datetime_mean = f"{year_mean:04d}-{month_mean:02d}-{date_mean:02d}"
df1.loc[df1["Start"] == "(null)", "Start"] = datetime_mean

### End

In [None]:
# Replace "(null)" placeholders in End with a date built from the mean
# year / month / day of the valid rows.
# The replacement is written as YYYY-MM-DD so that it parses with the
# "%Y-%m-%d" format this column is read with everywhere else; a MM-DD-YYYY
# string would be coerced to NaT by those later parses.
end_dates = pd.to_datetime(df1["End"], format="%Y-%m-%d", errors="coerce")
year_mean = int(end_dates.dt.year.mean())
month_mean = int(end_dates.dt.month.mean())
date_mean = int(end_dates.dt.day.mean())
datetime_mean = f"{year_mean:04d}-{month_mean:02d}-{date_mean:02d}"
df1.loc[df1["End"] == "(null)", "End"] = datetime_mean

### Open

In [None]:
# Impute missing "Open" values with the column mean.
# The column is coerced to numeric first: "(null)" placeholders (or any other
# non-numeric text) become NaN, so the mean is taken over real numbers only
# and the column ends up with a numeric dtype.
open_numeric = pd.to_numeric(df1["Open"], errors="coerce")
mean = open_numeric.mean()
df1["Open"] = open_numeric.fillna(mean)

### High

In [None]:
# Impute missing "High" values with the column mean.
# The column is coerced to numeric first: "(null)" placeholders (or any other
# non-numeric text) become NaN, so the mean is taken over real numbers only
# and the column ends up with a numeric dtype.
high_numeric = pd.to_numeric(df1["High"], errors="coerce")
mean = high_numeric.mean()
df1["High"] = high_numeric.fillna(mean)

### Low

In [None]:
# Impute missing "Low" values with the column mean.
# The column is coerced to numeric first: "(null)" placeholders (or any other
# non-numeric text) become NaN, so the mean is taken over real numbers only
# and the column ends up with a numeric dtype.
low_numeric = pd.to_numeric(df1["Low"], errors="coerce")
mean = low_numeric.mean()
df1["Low"] = low_numeric.fillna(mean)

### Close

In [None]:
# Impute missing "Close" values with the column mean.
# The column is coerced to numeric first: "(null)" placeholders (or any other
# non-numeric text) become NaN, so the mean is taken over real numbers only
# and the column ends up with a numeric dtype.
close_numeric = pd.to_numeric(df1["Close"], errors="coerce")
mean = close_numeric.mean()
df1["Close"] = close_numeric.fillna(mean)

### Volume

In [None]:
# Impute missing "Volume" values with the column mean.
# The column is coerced to numeric first: "(null)" placeholders (or any other
# non-numeric text) become NaN, so the mean is taken over real numbers only
# and the column ends up with a numeric dtype.
volume_numeric = pd.to_numeric(df1["Volume"], errors="coerce")
mean = volume_numeric.mean()
df1["Volume"] = volume_numeric.fillna(mean)

### Market cap

In [None]:
# Impute missing "Market Cap" values with the column mean.
# The column is coerced to numeric first: "(null)" placeholders (or any other
# non-numeric text) become NaN, so the mean is taken over real numbers only
# and the column ends up with a numeric dtype.
market_cap_numeric = pd.to_numeric(df1["Market Cap"], errors="coerce")
mean = market_cap_numeric.mean()
df1["Market Cap"] = market_cap_numeric.fillna(mean)

# Data Encoding
Encoding is needed to be able to get the value computed by algorithms (KNN, and many algorithms that rely on doing it using numbers). In this case, it is only needed to convert the date in the Start column into continuous value (in second relative from year 0)

In [None]:
# Encode Start and End as whole seconds since the Unix epoch (1970-01-01).
#
# Why not year*365 + month*30 + day: that approximation is not a valid
# timestamp (other cells of this notebook decode Start with
# pd.to_datetime(..., unit='s'), i.e. as Unix seconds) and it is not even
# unique -- January 31 and February 1 of the same year map to the same value.
# Rows whose date cannot be parsed become NaN.
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta(seconds=1)

for date_col in ("Start", "End"):
    parsed_dates = pd.to_datetime(df1[date_col], format="%Y-%m-%d", errors="coerce")
    df1[date_col] = (parsed_dates - unix_epoch) // one_second

## Feature Selection
Feature selection keeps only the relevant columns, i.e. all of the columns except the end date. For the SVM, a new column is added that stores the year and month in year-month format.

In [None]:
# Read the encoded Start value as seconds since the Unix epoch
# (use unit='ms' instead if the column holds milliseconds).
# NOTE(review): this only yields valid dates if Start really holds Unix epoch
# seconds; out-of-range values are coerced to NaT -- confirm against the
# Data Encoding cell.
start_as_datetime = pd.to_datetime(df1["Start"], unit='s', errors='coerce')
df1["Start_dt"] = start_as_datetime

# Year-month label of each row, e.g. "2020-05"
df1["Period"] = start_as_datetime.dt.strftime("%Y-%m")

In [None]:
# Feature columns plus the regression target and the SVM class target
features = ["Open", "High", "Low", "Close", "Market Cap", "Volume"]
target = "Start"
svmTarget = "Period_encoded"

# Give every distinct period an integer id, numbered in sorted order
classes = df1["Period"].unique()
period_to_num = dict(zip(sorted(classes), range(len(classes))))
df1["Period_encoded"] = df1["Period"].map(period_to_num)

# Feature matrix and float target vectors
X = df1[features]
Y = df1["Start"].to_numpy(dtype=float, copy=True)

svmY = df1[svmTarget].to_numpy(dtype=float, copy=True)


## Dimensionality Reduction
Dimensionality reduction is done to reduce noise in the attributes and to focus on the underlying causes and
patterns from which events emerge. In this example it is not needed, because every value is in a simple form and there is no intention to derive a new column that is significant to the particular target label.

## Balancing
Based on the plots in the EDA, the data shows a pattern of skewness in the time context, but
other than that the variance is reasonable (not too significant), so a model generated from this dataset is not expected to be biased. Thus, the data does not need to be balanced with
any balancing method.

# Bagian 2 (Supervised Learning)

## K Nearest Neighbor

#### Hold out validation

##### Implementasi Manual

In [None]:
# Chronological hold-out split: the first 80% of the rows train, the rest test.
# The "Start" target is kept next to the features because the regressor below
# is told to predict "Start" and receives only this frame.
# NOTE(review): assumes KNearestNeigbor.predict looks the target up by name in
# the frame it is given -- confirm against supervisedLearning/knn.py.
nTrain = int(0.8 * len(df1))
knnColumns = features + ["Start"]
trainData = df1.iloc[:nTrain][knnColumns]
testData = df1.iloc[nTrain:][knnColumns]

# Query point, entered in the same order as `features`
openVal = float(input("Open: "))
highVal = float(input("High: "))
lowVal = float(input("Low: "))
closeVal = float(input("Close: "))
marketCapVal = float(input("Market Cap: "))
volumeVal = float(input("Volume: "))

# Model hyper-parameters
distanceFunction = input("Distance function: ")
neighborCount = int(input("Number of neighbors: "))
minkowskiExp = float(input("Minkowski exponent: "))
knn = KNearestNeigbor(distanceFunction, neighborCount, "Start", minkowskiExp)

# Predict the target for the query point
x = np.array([openVal, highVal, lowVal, closeVal, marketCapVal, volumeVal])
prediction = knn.predict(trainData, x)
print(f"Prediction : {prediction}")

##### Implementasi Library

In [None]:
# Hold-out KNN regression with scikit-learn (first 80% of the rows train)
from sklearn.neighbors import KNeighborsRegressor

# Features and target are taken straight from df1: a frame restricted to
# `features` has no "Start" column, so dropping or selecting "Start" from it
# raises KeyError.
train_rows = df1.iloc[:int(0.8 * len(df1))]
X_train = train_rows[features]
y_train = train_rows["Start"]

# neighborCount / minkowskiExp and the query values come from the manual
# hold-out cell
knn = KNeighborsRegressor(n_neighbors=neighborCount, metric='minkowski', p=minkowskiExp)
knn.fit(X_train, y_train)

# Predict for a new sample; a one-row frame with the training column names
# keeps the input consistent with what the model was fitted on
x_new = pd.DataFrame(
    [[openVal, highVal, lowVal, closeVal, marketCapVal, volumeVal]],
    columns=features,
)
predicted = knn.predict(x_new)
print("Predicted value:", predicted)


#### K fold validation

##### Implementasi Manual

In [None]:
# K-fold evaluation of the manual KNN: the same query point is predicted once
# per fold, each time from the rows outside that fold.
k = int(input("Number of folds: "))
n = len(df1)
fold_size = n // k

# The "Start" target is kept next to the features because the regressor is
# told to predict "Start" and receives only this frame.
# NOTE(review): assumes KNearestNeigbor.predict looks the target up by name in
# the frame it is given -- confirm against supervisedLearning/knn.py.
knnColumns = features + ["Start"]

# Query point and hyper-parameters are read once, before the loop, so every
# fold answers the same question (they were previously re-prompted per fold).
openVal = float(input("Open: "))
highVal = float(input("High: "))
lowVal = float(input("Low: "))
closeVal = float(input("Close: "))
marketCapVal = float(input("Market Cap: "))
volumeVal = float(input("Volume: "))

distanceFunction = input("Distance function: ")
neighborCount = int(input("Number of neighbors: "))
minkowskiExp = float(input("Minkowski exponent: "))

x = np.array([openVal, highVal, lowVal, closeVal, marketCapVal, volumeVal])

for i in range(k):
    start_idx = i * fold_size
    end_idx = (i + 1) * fold_size if i != k - 1 else n  # last fold takes the remainder
    testData = df1.iloc[start_idx:end_idx][knnColumns]

    # Training rows are everything outside the test slice, selected by position
    # so that duplicate index labels cannot drop extra rows
    trainData = pd.concat([df1.iloc[:start_idx], df1.iloc[end_idx:]])[knnColumns]

    knn = KNearestNeigbor(distanceFunction, neighborCount, "Start", minkowskiExp)
    prediction = knn.predict(trainData, x)

    print(f"Fold {i+1} Prediction: {prediction}")

##### Implementasi Library

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# K-fold KNN regression with scikit-learn. The query values and the
# hyper-parameters (neighborCount, minkowskiExp) come from the manual KNN cells.
k = int(input("Number of folds: "))
n = len(df1)
fold_size = n // k

# One-row frame carrying the training column names
x_new = pd.DataFrame(
    [[openVal, highVal, lowVal, closeVal, marketCapVal, volumeVal]],
    columns=features,
)

for i in range(k):
    start_idx = i * fold_size
    end_idx = (i + 1) * fold_size if i != k - 1 else n  # last fold takes the remainder
    testData = df1.iloc[start_idx:end_idx][features]

    # Training rows are everything outside the test slice. Features and target
    # are selected from the full rows: a frame restricted to `features` has no
    # "Start" column, so dropping or selecting "Start" from it raises KeyError.
    train_rows = pd.concat([df1.iloc[:start_idx], df1.iloc[end_idx:]])
    trainData = train_rows[features]

    X_train = trainData
    y_train = train_rows["Start"]

    knn = KNeighborsRegressor(n_neighbors=neighborCount, metric='minkowski', p=minkowskiExp)
    knn.fit(X_train, y_train)

    # Predict for the new sample
    predicted = knn.predict(x_new)
    print(f"Fold {i + 1} predicted value:", predicted)


## Polynomial Regression

#### Hold out validation

##### Implementasi Manual

In [None]:
# Chronological hold-out split: first 80% of the rows train, the rest test
nTrain = int(0.8 * len(df1))
train_part, test_part = df1.iloc[:nTrain], df1.iloc[nTrain:]

X_train, X_test = train_part[features], test_part[features]

# Continuous target (Start)
Y_train = np.array(train_part["Start"], dtype=float)
Y_test = np.array(test_part["Start"], dtype=float)

# Class target (Period_encoded), split the same way
svmY_train = np.array(train_part["Period_encoded"], dtype=float)
svmY_test = np.array(test_part["Period_encoded"], dtype=float)

# Model configuration
degree = 2
learning_rate = 0.000001
regularization = "l2"
model = PolynomialRegression(
    degree=degree,
    learningRate=learning_rate,
    regularizationTrem=regularization,
    features=features,
    iteration=5000,
)

# Fit on the continuous target
model.train(X_train, Y_train)

# One prediction per test row
predictions = np.array([model.predict(test_row) for test_row in X_test.to_numpy()])

# Mean squared error on the test set
squared_errors = (predictions - Y_test) ** 2
mse = np.mean(squared_errors)
print("Test MSE:", mse)
print(f"prediction: {predictions}")

##### Implementasi Library

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# NOTE(review): this cell sits under "Polynomial Regression" but fits a
# RandomForestRegressor -- confirm that is the intended library counterpart.

# Inputs for the new prediction
openVal = float(input("Open: "))
highVal = float(input("High: "))
lowVal = float(input("Low: "))
closeVal = float(input("Close: "))
marketCapVal = float(input("Market Cap: "))
volumeVal = float(input("Volume: "))

# Hold-out split (80% train, 20% test) on the raw features; X = df1[features]
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, so that no statistic of the test
# rows leaks into preprocessing, then apply it to both parts
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Train regressor
reg = RandomForestRegressor(n_estimators=100, random_state=42)
reg.fit(X_train, y_train)

# Predict on test set
y_pred = reg.predict(X_test)

# Evaluate performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Test MSE:", mse)
print("R2 score:", r2)

# Predict the new input; a one-row frame with the training column names keeps
# the input consistent with what the scaler was fitted on
new_input = pd.DataFrame(
    [[openVal, highVal, lowVal, closeVal, marketCapVal, volumeVal]],
    columns=features,
)
new_input_scaled = scaler.transform(new_input)
predicted_seconds = reg.predict(new_input_scaled)[0]

# Convert the prediction to a datetime, reading it as seconds since the Unix
# epoch (1970-01-01)
predicted_date = pd.to_datetime(predicted_seconds, unit='s', origin='unix')
print("Predicted Start (date):", predicted_date)

#### K fold validation

##### Implementasi Manual

In [None]:

# ------------------------------
# 1️⃣ Input features
# ------------------------------
features = ["Open", "High", "Low", "Close", "Market Cap", "Volume"]
X = df1[features]
# Regression target as a float array
Y = df1["Start"].to_numpy(dtype=float, copy=True)

# ------------------------------
# 2️⃣ Define k-fold function
# ------------------------------
def k_fold_regression(model_class, X, Y, k=5, **model_kwargs):
    """Run shuffled k-fold cross-validation for a train/predict style regressor.

    A fresh ``model_class(**model_kwargs)`` is built for every fold, trained
    with ``model.train(X_train, y_train)`` and queried row by row with
    ``model.predict(row)``. The MSE of each fold and the overall MSE are
    printed.

    Args:
        model_class: Callable returning an object with ``train`` and
            ``predict`` methods.
        X: Feature DataFrame, one row per sample.
        Y: NumPy array of targets, aligned with the rows of ``X``.
        k: Number of folds.
        **model_kwargs: Keyword arguments forwarded to ``model_class``.

    Returns:
        NumPy array with the out-of-fold prediction for every row of ``X``.

    Raises:
        ValueError: If ``k`` is smaller than 2 or larger than the number of
            samples; such values leave a fold with no training or no
            validation rows.
    """
    n = len(X)
    if not 2 <= k <= n:
        raise ValueError(
            f"k must be between 2 and the number of samples ({n}), got {k}"
        )

    # Row positions are shuffled with NumPy's global random state
    indices = np.arange(n)
    np.random.shuffle(indices)
    fold_size = n // k
    all_predictions = np.zeros(n)

    for fold in range(k):
        start = fold * fold_size
        end = start + fold_size if fold < k - 1 else n  # last fold takes the remainder

        val_idx = indices[start:end]
        train_idx = np.setdiff1d(indices, val_idx)

        X_train, y_train = X.iloc[train_idx], Y[train_idx]
        X_val, y_val = X.iloc[val_idx], Y[val_idx]

        # Train a fresh model on the training folds
        model = model_class(**model_kwargs)
        model.train(X_train, y_train)

        # Predict the validation fold one row at a time and store the results
        # at the rows' original positions
        y_pred = np.array([model.predict(row) for row in X_val.to_numpy()])
        all_predictions[val_idx] = y_pred

        print(f"Fold {fold+1} MSE: {np.mean((y_val - y_pred) ** 2):.4f}")

    # Overall MSE across all out-of-fold predictions
    mse_total = np.mean((Y - all_predictions) ** 2)
    print(f"\nOverall MSE: {mse_total:.4f}")
    return all_predictions

# ------------------------------
# 5️⃣ Train and evaluate with k-fold
# ------------------------------
# Hyper-parameters shared by the cross-validation run and the final model
degree = 2
learning_rate = 0.000001
regularization = "l2"
iteration = 5000

predictions = k_fold_regression(
    model_class=PolynomialRegression,
    X=X,
    Y=Y,
    k=5,
    degree=degree,
    learningRate=learning_rate,
    regularizationTrem=regularization,
    features=features,
    iteration=iteration
)

# Query point for the final prediction
openVal = float(input("Open: "))
highVal = float(input("High: "))
lowVal = float(input("Low: "))
closeVal = float(input("Close: "))
marketCapVal = float(input("Market Cap: "))
volumeVal = float(input("Volume: "))

new_input = np.array([openVal, highVal, lowVal, closeVal, marketCapVal, volumeVal])
model_final = PolynomialRegression(
    degree=degree,
    learningRate=learning_rate,
    regularizationTrem=regularization,
    features=features,
    iteration=iteration
)

# Train on the full dataset before predicting
model_final.train(X, Y)

predicted_seconds = model_final.predict(new_input)

# Decode the prediction as seconds since the Unix epoch, the same reading the
# Feature Selection cell applies to Start (unit='s'). An origin of year 0 lies
# outside the range pandas timestamps and timedeltas can represent.
predicted_date = pd.to_datetime(predicted_seconds, unit='s')
print("Predicted Start (date):", predicted_date)


##### Implementasi Library

In [None]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

# NOTE(review): this cell sits under "Polynomial Regression" but the final
# estimator is a RandomForestRegressor on polynomial features -- confirm that
# is the intended library counterpart.

degree = 2
k = 5


def _make_regressor():
    """Build scaling -> polynomial expansion -> random forest as one estimator.

    Bundling the preprocessing with the model means it is re-fitted on the
    training rows of every fold, so no statistic of the validation rows leaks
    into scaling.
    """
    return make_pipeline(
        StandardScaler(),
        PolynomialFeatures(degree=degree, include_bias=True),
        RandomForestRegressor(n_estimators=100, random_state=42),
    )


# Plain float array so that rows can be selected by position
X_values = np.asarray(X, dtype=float)

# K-fold cross-validation
kf = KFold(n_splits=k, shuffle=True, random_state=42)
mse_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_values), 1):
    X_train, X_val = X_values[train_idx], X_values[val_idx]
    y_train, y_val = Y[train_idx], Y[val_idx]

    # Train a fresh pipeline on the training folds
    regressor = _make_regressor()
    regressor.fit(X_train, y_train)

    # Evaluate on the held-out fold
    y_pred = regressor.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    mse_scores.append(mse)
    print(f"Fold {fold} MSE: {mse:.4f}")

print(f"\nAverage MSE across {k} folds: {np.mean(mse_scores):.4f}")

# Train on the full data for the final prediction
regressor = _make_regressor()
regressor.fit(X_values, Y)

# Query point for the final prediction
openVal = float(input("Open: "))
highVal = float(input("High: "))
lowVal = float(input("Low: "))
closeVal = float(input("Close: "))
marketCapVal = float(input("Market Cap: "))
volumeVal = float(input("Volume: "))

# The pipeline scales and expands the raw input itself
new_input = np.array([[openVal, highVal, lowVal, closeVal, marketCapVal, volumeVal]])
predicted_seconds = regressor.predict(new_input)[0]

# Decode the prediction as seconds since the Unix epoch, the same reading the
# Feature Selection cell applies to Start (unit='s'). An origin of year 0 lies
# outside the range pandas timestamps and timedeltas can represent.
predicted_date = pd.to_datetime(predicted_seconds, unit='s')
print("Predicted Start (date):", predicted_date)

## Regression Tree (CART)

#### Hold out validation

##### Implementasi Manual

In [None]:
# Chronological hold-out split: first 80% of the rows train, the rest test
n_train = int(0.8 * len(df1))
train_df = df1.iloc[:n_train]
test_df = df1.iloc[n_train:]

# Hyper-parameters, passed to the tree below instead of repeating the literals
# NOTE(review): the meaning of each value is defined by RegressionTree in
# supervisedLearning/regressionTree.py -- confirm there.
maxDepth = 5
minSample = 5
maxVariance = 0.05

# Initialize tree
tree = RegressionTree(
    features=features,
    maxDepth=maxDepth,
    minSample=minSample,
    maxVariance=maxVariance,
)

# Train on the feature columns plus the target
tree.root = tree.train(train_df[features + [target]])

# One prediction per test row, each row passed as a {feature: value} dict
predictions = [tree.predict(row[features].to_dict()) for _, row in test_df.iterrows()]

y_true = test_df[target].to_numpy()
y_pred = np.array(predictions)

# Evaluate
mse = np.mean((y_true - y_pred) ** 2)
print("MSE on test set:", mse)

new_input = {
    "Open": 1000,
    "High": 1020,
    "Low": 990,
    "Close": 1010,
    "Market Cap": 5000000,
    "Volume": 100000
}

predicted_seconds = tree.predict(new_input)

# Decode the prediction as seconds since the Unix epoch, the same reading the
# Feature Selection cell applies to Start (unit='s'). An origin of year 0 lies
# outside the range pandas timestamps and timedeltas can represent.
predicted_date = pd.to_datetime(predicted_seconds, unit='s')
print("Predicted Start (date):", predicted_date)


##### Implementasi Library

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


features = ["Open", "High", "Low", "Close", "Market Cap", "Volume"]
target = "Start"  # continuous target
X = df1[features]
y = df1[target]

# Random hold-out split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize DecisionTreeRegressor
tree = DecisionTreeRegressor(
    max_depth=5,
    min_samples_split=5,
    random_state=42
)

# Train on the training set
tree.fit(X_train, y_train)

y_pred = tree.predict(X_test)

# Compute MSE
mse = mean_squared_error(y_test, y_pred)
print("MSE on test set:", mse)

# One-row frame carrying the training column names
new_input = pd.DataFrame([{
    "Open": 1000,
    "High": 1020,
    "Low": 990,
    "Close": 1010,
    "Market Cap": 5000000,
    "Volume": 100000
}])

predicted_seconds = tree.predict(new_input)[0]

# Decode the prediction as seconds since the Unix epoch, the same reading the
# Feature Selection cell applies to Start (unit='s'). An origin of year 0 lies
# outside the range pandas timestamps and timedeltas can represent.
predicted_date = pd.to_datetime(predicted_seconds, unit='s')
print("Predicted Start (date):", predicted_date)


#### K fold validation

##### Implementasi Manual

In [None]:

def k_fold_cv(tree_class, df: pd.DataFrame, features: list[str], target: str, k: int = 5, **tree_kwargs):
    """Run shuffled k-fold cross-validation for a regression tree class.

    For every fold a fresh tree is built, trained with
    ``tree.train(train_df[features + [target]])`` (the return value is stored
    on ``tree.root``) and queried with one ``{feature: value}`` dict per test
    row. The MSE of each fold and the average MSE are printed.

    Args:
        tree_class: Callable accepting ``features`` plus the hyper-parameter
            keywords, returning an object with ``train`` and ``predict``.
        df: Dataset holding the feature columns and the target column.
        features: Names of the feature columns.
        target: Name of the target column.
        k: Number of folds.
        **tree_kwargs: Hyper-parameters forwarded to ``tree_class``. Defaults
            are ``maxDepth=5``, ``minSample=5`` and ``maxVariance=0.05``.

    Returns:
        Average MSE over the ``k`` folds.

    Raises:
        ValueError: If ``k`` is smaller than 2 or larger than the number of
            rows; such values leave a fold with no training or no test rows.
    """
    n_rows = len(df)
    if not 2 <= k <= n_rows:
        raise ValueError(
            f"k must be between 2 and the number of rows ({n_rows}), got {k}"
        )

    # Explicit keywords override the defaults
    tree_params = {"maxDepth": 5, "minSample": 5, "maxVariance": 0.05, **tree_kwargs}

    # Shuffle once with a fixed seed so that the folds are reproducible
    df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
    fold_size = n_rows // k
    mses = []

    for i in range(k):
        start = i * fold_size
        end = start + fold_size if i < k - 1 else n_rows  # last fold takes the remainder
        test_df = df_shuffled.iloc[start:end]
        train_df = pd.concat([df_shuffled.iloc[:start], df_shuffled.iloc[end:]], axis=0)

        # Initialize and train a fresh tree
        tree = tree_class(features=features, **tree_params)
        tree.root = tree.train(train_df[features + [target]])

        # One prediction per test row
        predictions = [tree.predict(x) for x in test_df[features].to_dict("records")]

        y_true = test_df[target].to_numpy()
        y_pred = np.array(predictions)
        mse = np.mean((y_true - y_pred) ** 2)
        mses.append(mse)

        print(f"Fold {i+1}: MSE = {mse}")

    avg_mse = np.mean(mses)
    print(f"Average MSE over {k} folds: {avg_mse}")
    return avg_mse

# 5-fold cross-validation of the manual regression tree on the Start target
features = ["Open", "High", "Low", "Close", "Market Cap", "Volume"]
target = "Start"

avg_mse = k_fold_cv(tree_class=RegressionTree, df=df1, features=features, target=target, k=5)

##### Implementasi Library

In [None]:

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

features = ["Open", "High", "Low", "Close", "Market Cap", "Volume"]
target = "Start"

X = df1[features].to_numpy()
y = df1[target].to_numpy()

kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold CV
mses = []

for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Initialize and train a fresh DecisionTreeRegressor for this fold
    tree = DecisionTreeRegressor(
        max_depth=5,
        min_samples_split=5,
        random_state=42
    )
    tree.fit(X_train, y_train)

    # Predict and compute MSE
    y_pred = tree.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mses.append(mse)

    print(f"Fold {fold + 1}: MSE = {mse}")

avg_mse = np.mean(mses)
print(f"Average MSE over {kf.get_n_splits()} folds: {avg_mse}")


new_input = pd.DataFrame([{
    "Open": 1000,
    "High": 1020,
    "Low": 990,
    "Close": 1010,
    "Market Cap": 5000000,
    "Volume": 100000
}])

# Train on the full dataset to make the final prediction
final_tree = DecisionTreeRegressor(max_depth=5, min_samples_split=5, random_state=42)
final_tree.fit(X, y)

# The tree was fitted on a plain array, so the query is passed as an array
# too, with its columns put in the order of `features`
predicted_seconds = final_tree.predict(new_input[features].to_numpy())[0]

# Decode the prediction as seconds since the Unix epoch, the same reading the
# Feature Selection cell applies to Start (unit='s'). An origin of year 0 lies
# outside the range pandas timestamps and timedeltas can represent.
predicted_date = pd.to_datetime(predicted_seconds, unit='s')
print("Predicted Start (date):", predicted_date)


## Support Vector Machine (SVM)

#### Hold out validation

##### Implementasi Manual

In [None]:
# partition into 80% and 20% (testing)
features = ["Open", "High", "Low", "Close", "Market Cap", "Volume"]
targetSVM = "Period"  # your target column

X = df1[features]
y = df1[targetSVM]

nTrain = int(0.8 * len(df1))

trainData = df1.iloc[:nTrain]
testData = df1.iloc[nTrain:]

svm = SupportVectorMachine(
    dataset=trainData,
    target=targetSVM,
    alpha=1.0,
    learningRate=0.001,
    regularizationTerm="l2",
    n=10000  # number of iterations
)

# Train the SVM on the training set
svm.train()

predictions = []
for i in range(len(testData)):
    x_row = testData.iloc[i]
    pred = svm.predict(x_row)
    predictions.append(pred)

# Add predictions to the test dataframe
testData["Predicted"] = predictions

accuracy = np.mean(testData["Predicted"] == testData[targetSVM])
print("Hold-out accuracy:", accuracy)


##### Implementasi Library

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Hold-out validation with scikit-learn: shuffled 80/20 split, features
# standardised with statistics learned from the training part only.
features = ["Open", "High", "Low", "Close", "Market Cap", "Volume"]
target = "Period"

X = df1[features]
y = df1[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Use a C-SVM with RBF kernel (you can also use 'linear' or 'poly')
svm_clf = SVC(kernel='rbf', C=1.0, gamma='scale', decision_function_shape='ovr', random_state=42)
svm_clf.fit(X_train_scaled, y_train)

y_pred = svm_clf.predict(X_test_scaled)

print("Hold-out Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Interactive prediction: read one value per feature, in the same order as
# `features`. Each prompt names the value being asked for.
openVal = float(input("Open: "))
highVal = float(input("High: "))
lowVal = float(input("Low: "))
closeVal = float(input("Close: "))
marketCapVal = float(input("Market Cap: "))
volumeVal = float(input("Volume: "))

# Build the sample as a DataFrame carrying the training column names: the
# scaler was fitted on a DataFrame, and transforming a bare ndarray makes
# scikit-learn warn that X does not have valid feature names.
new_input = pd.DataFrame(
    [[openVal, highVal, lowVal, closeVal, marketCapVal, volumeVal]],
    columns=features,
)
new_input_scaled = scaler.transform(new_input)

predicted_period = svm_clf.predict(new_input_scaled)
print("Predicted Period:", predicted_period[0])

#### K fold validation

##### Implementasi Manual

In [None]:

def k_fold_manual_svm(dataset: pd.DataFrame, target: str, k: int = 5, alpha: float = 1.0, learningRate: float = 0.001, regularizationTerm: str = "l2", n: int = 10000) -> None:
    """Run k-fold cross-validation of the hand-written SVM and print accuracies.

    The rows are shuffled once (fixed seed 42) and cut into ``k`` contiguous
    folds of ``len(dataset) // k`` rows; the last fold also takes the
    remainder. For every fold a fresh ``SupportVectorMachine`` is trained on
    the other folds and scored on the held-out one. The per-fold accuracy and
    the average over all folds are printed; nothing is returned.

    Args:
        dataset: Full data frame, including the ``target`` column.
        target: Name of the label column.
        k: Number of folds; must satisfy ``2 <= k <= len(dataset)``.
        alpha: Forwarded unchanged to ``SupportVectorMachine``.
        learningRate: Forwarded unchanged to ``SupportVectorMachine``.
        regularizationTerm: Forwarded unchanged to ``SupportVectorMachine``.
        n: Forwarded unchanged to ``SupportVectorMachine`` (the call site
            describes it as the number of iterations).

    Raises:
        ValueError: If ``k`` is smaller than 2 or larger than the number of
            rows. Previously this led to empty folds and a
            ``ZeroDivisionError`` (or to training on an empty frame).
    """
    n_rows = len(dataset)
    if k < 2 or k > n_rows:
        raise ValueError(
            f"k must be between 2 and the number of rows ({n_rows}), got {k}"
        )

    dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle
    fold_size = n_rows // k
    accuracies = []

    for fold in range(k):
        start = fold * fold_size
        # The final fold absorbs the rows left over by the integer division.
        end = start + fold_size if fold != k - 1 else n_rows
        test_df = dataset.iloc[start:end]
        train_df = pd.concat([dataset.iloc[:start], dataset.iloc[end:]]).reset_index(drop=True)

        # Initialize and train a fresh SVM for this fold
        svm = SupportVectorMachine(
            dataset=train_df,
            target=target,
            alpha=alpha,
            learningRate=learningRate,
            regularizationTerm=regularizationTerm,
            n=n
        )
        svm.train()

        # Predict on the held-out fold; each row is passed to predict() whole
        # (label column included), exactly as the model receives it elsewhere.
        correct = 0
        for i in range(len(test_df)):
            x_row = test_df.iloc[i]
            if svm.predict(x_row) == x_row[target]:
                correct += 1

        accuracy = correct / len(test_df)
        accuracies.append(accuracy)
        print(f"Fold {fold+1}: Accuracy = {accuracy:.4f}")

    print(f"\nAverage Accuracy over {k}-folds: {np.mean(accuracies):.4f}")

# Feature columns of the Bitcoin data set and the label column for the SVM.
features = ["Open", "High", "Low", "Close", "Market Cap", "Volume"]
targetSVM = "Period"

# 5-fold cross-validation of the manual SVM on the full frame.
k_fold_manual_svm(
    df1,
    targetSVM,
    k=5,
    alpha=1.0,
    learningRate=0.001,
    regularizationTerm="l2",
    n=10000,
)


##### Implementasi Library

In [None]:

from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

features = ["Open", "High", "Low", "Close", "Market Cap", "Volume"]
target = "Period"

X = df1[features]
y = df1[target]

k = 5  # number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Scaling lives inside the pipeline so that, for every fold, the scaler is
# fitted on that fold's training rows only. Fitting it once on the full X
# before cross-validation would leak test-fold statistics into training.
svm_clf = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', C=1.0, gamma='scale', decision_function_shape='ovr', random_state=42),
)

scores = cross_val_score(svm_clf, X, y, cv=kf, scoring='accuracy')

for i, score in enumerate(scores):
    print(f"Fold {i+1}: Accuracy = {score:.4f}")

print(f"\nAverage Accuracy over {k} folds: {np.mean(scores):.4f}")


## Artificial Neural Network (ANN)

#### Hold out validation

##### Implementasi Manual

In [None]:
# partition into 80% and 20% (testing)

##### Implementasi Library

In [None]:
# partition into 80% and 20% (testing)

#### K fold validation

##### Implementasi Manual

In [None]:
# partition into k folds (train on k-1 folds, test on the remaining fold)

##### Implementasi Library

In [None]:
# partition into k folds (train on k-1 folds, test on the remaining fold)

# Bagian 3 (Unsupervised Learning)

Bagian ini akan menggunakan data iris.csv yang terletak pada folder data dengan kolom sebagai berikut: 
1. sepal_width : numerical
2. sepal_length : numerical
3. petal_width : numerical
4. petal_length : numerical
5. class : categorical

## K Means

### Implementasi manual

### Implementasi Library

## DBSCAN

### Implementasi manual

### Implementasi Library

## PCA

### Implementasi manual

### Implementasi Library