<a href="https://colab.research.google.com/github/Deleon57/EDAR-Data/blob/main/KNN222(quartile).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Get Data

In [2]:
ticker = "GFI.JO"
data = yf.download(ticker, start="2013-01-01", end="2024-12-31")

data.reset_index(inplace=True)
data.head(10)

  data = yf.download(ticker, start="2013-01-01", end="2024-12-31")
[*********************100%***********************]  1 of 1 completed


Price,Date,Close,High,Low,Open,Volume
Ticker,Unnamed: 1_level_1,GFI.JO,GFI.JO,GFI.JO,GFI.JO,GFI.JO
0,2013-01-01,9091.96582,9091.96582,9091.96582,9091.96582,0
1,2013-01-02,9323.31543,9325.943574,9043.76485,9151.553832,2277190
2,2013-01-03,9335.586914,9376.775152,9183.981508,9289.141412,1638359
3,2013-01-04,9049.025391,9131.400891,8955.25769,9113.874727,2154183
4,2013-01-07,9104.236328,9156.8158,9000.828832,9083.204345,1833898
5,2013-01-08,8928.09082,9148.050381,8900.92488,9013.971154,2221997
6,2013-01-09,9006.961914,9037.633676,8860.614741,9020.983235,2344840
7,2013-01-10,9091.96582,9091.96582,8906.182845,8982.42438,1564839
8,2013-01-11,9201.503906,9240.063009,9112.994478,9113.871176,3899959
9,2013-01-14,9289.140625,9332.081281,9245.324246,9245.324246,2329399


# Feature engineer

In [3]:
df = data.copy()

# 1. Spreads
df['High_Low'] = df['High'] - df['Low']
df['Open_Close'] = df['Open'] - df['Close']

# 2. Moving Averages
df['SMA_5'] = df['Close'].rolling(5).mean()
df['SMA_10'] = df['Close'].rolling(10).mean()
df['SMA_20'] = df['Close'].rolling(20).mean()
df['SMA_50'] = df['Close'].rolling(50).mean()
df['SMA_100'] = df['Close'].rolling(100).mean()
df['SMA_200'] = df['Close'].rolling(200).mean()

# 3. Exponential Moving Averages
df['EMA_5'] = df['Close'].ewm(span=5, adjust=False).mean()
df['EMA_20'] = df['Close'].ewm(span=20, adjust=False).mean()
df['EMA_50'] = df['Close'].ewm(span=50, adjust=False).mean()

# 4. Rolling Std (Volatility)
df['RollingStd_20'] = df['Close'].rolling(20).std()

# 5. Lag Features (Close & Volume)
for lag in range(1, 6):
    df[f'Close_t-{lag}'] = df['Close'].shift(lag)
    df[f'Volume_t-{lag}'] = df['Volume'].shift(lag)
    df[f'Return_t-{lag}'] = df['Close'].pct_change(lag)

# 6. Relative Strength Index (RSI)
window_length = 14
delta = df['Close'].diff()
gain = (delta.where(delta > 0, 0)).rolling(window=window_length).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=window_length).mean()
rs = gain / loss
df['RSI_14'] = 100 - (100 / (1 + rs))

# 7. On-Balance Volume (OBV)
df['OBV'] = (np.sign(df['Close'].diff()) * df['Volume']).fillna(0).cumsum()

# 8. Moving Average Convergence Divergence (MACD)
df['EMA_12'] = df['Close'].ewm(span=12, adjust=False).mean()
df['EMA_26'] = df['Close'].ewm(span=26, adjust=False).mean()
df['MACD'] = df['EMA_12'] - df['EMA_26']
df['Signal_Line'] = df['MACD'].ewm(span=9, adjust=False).mean()

# 9. Bollinger Bands (20-day)
df['BB_Middle'] = df['Close'].rolling(window=20).mean()

# 10. Average True Range (ATR)
df['H-L'] = df['High'] - df['Low']
df['H-C'] = abs(df['High'] - df['Close'].shift())
df['L-C'] = abs(df['Low'] - df['Close'].shift())
df['TR'] = df[['H-L', 'H-C', 'L-C']].max(axis=1)
df['ATR_14'] = df['TR'].rolling(14).mean()

# Drop NA rows
df.dropna(inplace=True)

# Target: 1 if next-day close > today’s close, else 0
df['Target'] = (df['Close'].shift(-1) > df['Close']).astype(int)

df.dropna(inplace=True)
df.head(10)

Price,Date,Close,High,Low,Open,Volume,High_Low,Open_Close,SMA_5,SMA_10,...,EMA_26,MACD,Signal_Line,BB_Middle,H-L,H-C,L-C,TR,ATR_14,Target
Ticker,Unnamed: 1_level_1,GFI.JO,GFI.JO,GFI.JO,GFI.JO,GFI.JO,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,...,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
199,2013-10-07,4485.15332,4575.933781,4452.232933,4479.167795,1173121,123.700848,-5.985525,4500.716309,4591.396973,...,4838.440401,-222.953717,-231.006139,4695.54519,123.700848,80.803898,42.896949,123.700848,162.749331,0
200,2013-10-08,4451.235352,4556.979625,4449.240177,4556.979625,1274177,107.739448,105.744273,4497.124805,4562.067725,...,4809.758545,-219.541298,-228.71317,4660.230542,107.739448,71.826305,35.913144,107.739448,163.533149,0
201,2013-10-09,4380.407715,4470.19061,4355.468022,4464.205084,1392042,114.722589,83.797369,4467.99541,4531.242285,...,4777.95478,-220.015922,-226.973721,4627.858862,114.722589,18.955259,95.76733,114.722589,146.85921,1
202,2013-10-10,4442.257812,4468.195091,4301.597954,4375.41944,1802995,166.597138,-66.838373,4450.836816,4506.40249,...,4753.088338,-212.946564,-224.168289,4609.902295,166.597138,87.787377,78.809761,166.597138,139.662361,0
203,2013-10-11,4336.513184,4429.288828,4302.595206,4406.344314,1050501,126.693622,69.83113,4419.113477,4471.187598,...,4722.230919,-213.416621,-222.017956,4590.299707,126.693622,12.968985,139.662606,139.662606,134.959469,1
204,2013-10-14,4399.361816,4436.272562,4279.651291,4321.549975,2032833,156.621271,-77.811842,4401.955176,4451.335742,...,4698.314689,-206.339234,-218.882211,4578.079297,156.621271,99.759378,56.861893,156.621271,146.146703,0
205,2013-10-15,4336.513184,4406.344314,4271.669991,4372.426336,2081226,134.674322,35.913153,4379.010742,4438.067773,...,4671.514578,-203.456395,-215.797048,4561.519336,134.674322,6.982497,127.691825,134.674322,143.225137,1
206,2013-10-16,4369.433105,4488.146014,4344.493419,4366.440343,2451447,143.652595,-2.992762,4376.81582,4422.405615,...,4649.138172,-196.253079,-211.888254,4545.557886,143.652595,151.63283,7.980235,151.63283,143.723898,1
207,2013-10-17,4424.299805,4424.299805,4289.625515,4329.529008,7904643,134.674289,-94.770796,4373.224219,4412.030518,...,4632.483478,-183.996121,-206.309828,4521.665601,134.674289,54.866699,79.80759,134.674289,142.940076,1
208,2013-10-18,4426.295898,4483.158388,4384.397222,4461.211462,2045668,98.761166,34.915564,4391.180762,4405.147119,...,4617.210324,-172.137038,-199.47527,4511.240771,98.761166,58.858583,39.902583,98.761166,137.310834,1


In [None]:
# --- Export full dataset with all features ---
df.to_csv("GFI_features.csv", index=False)

# If using Google Colab, download the file
from google.colab import files
files.download("GFI_features.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# SCALED OF ALL THE DATA

In [7]:
from sklearn.preprocessing import QuantileTransformer
import numpy as np

# --- Feature groups ---
price_cols = [
    'Open','High','Low','Close',
    'SMA_5','SMA_10','SMA_20','SMA_50','SMA_100','SMA_200',
    'EMA_5','EMA_12','EMA_20','EMA_26','EMA_50','BB_Middle'
]

volume_cols = ['Volume']   # log + scale
obv_cols = ['OBV']         # can be negative, scale only

relative_cols = [
    'High_Low','Open_Close',
    'Return_t-1','Return_t-2','Return_t-3','Return_t-4','Return_t-5',
    'MACD','Signal_Line','ATR_14','H-L','H-C','L-C','TR',
    'RollingStd_20'
]

bounded_cols = ['RSI_14']  # already in 0–100

# --- NEW groups for lag features ---
close_lag_cols = ['Close_t-1','Close_t-2','Close_t-3','Close_t-4','Close_t-5']
volume_lag_cols = ['Volume_t-1','Volume_t-2','Volume_t-3','Volume_t-4','Volume_t-5']

# --- Copy full set ---
df_scaled = df.copy()

# 1. Prices & MAs -> Quantile uniform 0–100
price_scaler = QuantileTransformer(output_distribution='uniform', random_state=42)
df_scaled[price_cols] = price_scaler.fit_transform(df[price_cols]) * 100

# 2. Volume -> log + Quantile uniform 0–100
df_scaled['Volume'] = np.log1p(df['Volume'])
vol_scaler = QuantileTransformer(output_distribution='uniform', random_state=42)
df_scaled[['Volume']] = vol_scaler.fit_transform(df_scaled[['Volume']]) * 100

# 3. OBV -> Quantile uniform 0–100
obv_scaler = QuantileTransformer(output_distribution='uniform', random_state=42)
df_scaled[['OBV']] = obv_scaler.fit_transform(df[['OBV']]) * 100

# 4. Relative cols -> Quantile uniform 0–100
rel_scaler = QuantileTransformer(output_distribution='uniform', random_state=42)
df_scaled[relative_cols] = rel_scaler.fit_transform(df[relative_cols]) * 100

# 5. RSI already 0–100 → just copy
df_scaled[bounded_cols] = df[bounded_cols]

# --- EXTRA FIX for lags ---

# Close lags
close_lag_scaler = QuantileTransformer(output_distribution='uniform', random_state=42)
df_scaled[close_lag_cols] = close_lag_scaler.fit_transform(df[close_lag_cols]) * 100

# Volume lags
volume_lag_scaler = QuantileTransformer(output_distribution='uniform', random_state=42)
df_scaled[volume_lag_cols] = volume_lag_scaler.fit_transform(df[volume_lag_cols]) * 100

# --- Add Target as the last column ---
df_scaled['Target'] = df['Target'].values

# --- Preview ---
print("Scaled dataset sample (Quantile 0–100):")
display(df_scaled.head())


Scaled dataset sample (Quantile 0–100):


Price,Date,Close,High,Low,Open,Volume,High_Low,Open_Close,SMA_5,SMA_10,...,EMA_26,MACD,Signal_Line,BB_Middle,H-L,H-C,L-C,TR,ATR_14,Target
Ticker,Unnamed: 1_level_1,GFI.JO,GFI.JO,GFI.JO,GFI.JO,GFI.JO,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,...,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
199,2013-10-07,20.014123,20.414395,21.218065,19.963967,8.813939,13.212812,49.349511,20.584374,22.126218,...,30.854558,19.445182,18.033189,25.77475,13.212812,32.231537,17.015941,10.20943,15.12623,0
200,2013-10-08,19.544815,19.942921,21.176162,21.845111,10.989794,8.508308,75.930765,20.465646,21.437156,...,30.475374,19.881655,18.129048,24.512001,8.508308,29.226298,14.414094,6.885225,15.309441,0
201,2013-10-09,17.826815,18.220606,19.090907,19.49355,14.32667,10.410084,71.923504,19.822998,20.718951,...,29.232708,19.837755,18.262452,23.124829,10.410084,10.104567,36.935463,8.202265,9.100789,1
202,2013-10-10,19.313692,18.179853,17.813232,17.844206,28.581733,26.123802,34.835226,19.519895,20.231997,...,27.531268,20.260572,18.528047,22.322388,26.123802,34.333253,31.730902,22.921673,6.844496,0
203,2013-10-11,16.716717,17.44139,17.852124,18.472016,6.329877,14.014014,68.966479,18.689532,19.625656,...,26.559102,20.235454,18.714007,21.772799,14.014014,7.724841,47.746603,15.114691,5.46757,1


In [8]:
# Save scaled dataframe to CSV
output_file = "scaled_data.csv"
df_scaled.to_csv(output_file, index=False)

# Download the file in Colab
from google.colab import files
files.download(output_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Preprocessing the data with scaled

In [4]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# --- Feature groups ---
price_cols = [
    'Open','High','Low','Close',
    'SMA_5','SMA_10','SMA_20','SMA_50','SMA_100','SMA_200',
    'EMA_5','EMA_12','EMA_20','EMA_26','EMA_50','BB_Middle'
]

volume_cols = ['Volume']   # log + scale
obv_cols = ['OBV']         # can be negative, scale only

relative_cols = [
    'High_Low','Open_Close',
    'Return_t-1','Return_t-2','Return_t-3','Return_t-4','Return_t-5',
    'MACD','Signal_Line','ATR_14','H-L','H-C','L-C','TR'
]

bounded_cols = ['RSI_14']  # already in 0–100

# --- NEW groups for problem features ---
close_lag_cols = ['Close_t-1','Close_t-2','Close_t-3','Close_t-4','Close_t-5']
volume_lag_cols = ['Volume_t-1','Volume_t-2','Volume_t-3','Volume_t-4','Volume_t-5']
rolling_cols = ['SMA_20']

# --- Copy full sets ---
X_train_scaled = X_train_full.copy()
X_test_scaled = X_test_full.copy()

# 1. Prices & MAs -> MinMax 0–100
price_scaler = MinMaxScaler(feature_range=(0,100))
X_train_scaled[price_cols] = price_scaler.fit_transform(X_train_full[price_cols])
X_test_scaled[price_cols]  = price_scaler.transform(X_test_full[price_cols])

# 2. Volume -> log + MinMax 0–100
X_train_scaled['Volume'] = np.log1p(X_train_full['Volume'])
X_test_scaled['Volume']  = np.log1p(X_test_full['Volume'])

vol_scaler = MinMaxScaler(feature_range=(0,100))
X_train_scaled[['Volume']] = vol_scaler.fit_transform(X_train_scaled[['Volume']])
X_test_scaled[['Volume']]  = vol_scaler.transform(X_test_scaled[['Volume']])

# 3. OBV -> MinMax directly
obv_scaler = MinMaxScaler(feature_range=(0,100))
X_train_scaled[['OBV']] = obv_scaler.fit_transform(X_train_full[['OBV']])
X_test_scaled[['OBV']]  = obv_scaler.transform(X_test_full[['OBV']])

# 4. Relative cols -> MinMax 0–100
rel_scaler = MinMaxScaler(feature_range=(0,100))
X_train_scaled[relative_cols] = rel_scaler.fit_transform(X_train_full[relative_cols])
X_test_scaled[relative_cols]  = rel_scaler.transform(X_test_full[relative_cols])

# 5. RSI already 0–100 → just copy
X_train_scaled[bounded_cols] = X_train_full[bounded_cols]
X_test_scaled[bounded_cols]  = X_test_full[bounded_cols]

# --- EXTRA FIX for lags & rolling averages ---

# Close lags
close_lag_scaler = MinMaxScaler(feature_range=(0,100))
X_train_scaled[close_lag_cols] = close_lag_scaler.fit_transform(X_train_full[close_lag_cols])
X_test_scaled[close_lag_cols]  = close_lag_scaler.transform(X_test_full[close_lag_cols])

# Volume lags
volume_lag_scaler = MinMaxScaler(feature_range=(0,100))
X_train_scaled[volume_lag_cols] = volume_lag_scaler.fit_transform(X_train_full[volume_lag_cols])
X_test_scaled[volume_lag_cols]  = volume_lag_scaler.transform(X_test_full[volume_lag_cols])

# Rolling averages (like SMA20)
roll_scaler = MinMaxScaler(feature_range=(0,100))
X_train_scaled[rolling_cols] = roll_scaler.fit_transform(X_train_full[rolling_cols])
X_test_scaled[rolling_cols]  = roll_scaler.transform(X_test_full[rolling_cols])

# --- Preview ---
print("Scaled training data sample:")
display(X_train_scaled.head())


NameError: name 'X_train_full' is not defined

In [5]:
# --- Export scaled training data ---
X_train_scaled.to_csv("GFI_train_scaled.csv", index=False)

# --- Export scaled testing data ---
X_test_scaled.to_csv("GFI_test_scaled.csv", index=False)

# Download both (Colab)
from google.colab import files
files.download("GFI_train_scaled.csv")
files.download("GFI_test_scaled.csv")


NameError: name 'X_train_scaled' is not defined

#Train and split data

In [6]:
train = df[df['Date'] < "2024-01-01"]
test = df[df['Date'] >= "2024-01-01"]

feature_pool = [
    'Open', 'High', 'Low', 'Close', 'Volume',
    'SMA_5','SMA_10','SMA_20','SMA_50','SMA_100','SMA_200',
    'EMA_5','EMA_20','EMA_50',
    'RollingStd_20','High_Low','Open_Close',
    'Close_t-1','Close_t-2','Close_t-3','Close_t-4','Close_t-5',
    'Volume_t-1','Volume_t-2','Volume_t-3','Volume_t-4','Volume_t-5',
    'Return_t-1','Return_t-2','Return_t-3','Return_t-4','Return_t-5',
    'RSI_14',
    'OBV','EMA_12','EMA_26','MACD','Signal_Line','BB_Middle',
    'H-L','H-C','L-C','TR','ATR_14'
]
X_train_full = train[feature_pool]
y_train = train['Target']
X_test_full = test[feature_pool]
y_test = test['Target']


# Standardize feature

# Search for best combination of features that will have highest accuracy

In [None]:
import random
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

best_score = 0
best_features = None
best_model = None

random.seed(42)  # reproducibility

for _ in range(300):
    r = random.randint(4, 5)  # choose random number of features
    combo = random.sample(feature_pool, r)  # choose random subset

    # use scaled data
    X_train = X_train_scaled[list(combo)]
    X_test = X_test_scaled[list(combo)]

    # train KNN
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train, y_train)

    # evaluate
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)

    # track best
    if acc > best_score:
        best_score = acc
        best_features = combo
        best_model = model

print("Best Accuracy:", best_score)
print("Best Feature Set:", best_features)


# Confusion matrix

In [None]:
X_test_best = X_test_full[list(best_features)]
y_pred = best_model.predict(X_test_best)

cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=["Down","Up"], yticklabels=["Down","Up"])
plt.title(f"Confusion Matrix (Accuracy: {best_score:.2%})")
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.show()


In [None]:
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(
    best_model,
    X_train_full[list(best_features)],
    y_train,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 10)
)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.figure(figsize=(8,6))
plt.plot(train_sizes, train_mean, label="Training score", color="blue")
plt.plot(train_sizes, test_mean, label="Cross-validation score", color="orange")

plt.fill_between(train_sizes, train_mean-train_std, train_mean+train_std, alpha=0.2, color="blue")
plt.fill_between(train_sizes, test_mean-test_std, test_mean+test_std, alpha=0.2, color="orange")

plt.title("Learning Curve (KNN Classifier)")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy")
plt.legend(loc="best")
plt.grid()
plt.show()


# Search for best value of k

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

k_values = range(1, 21)
scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    y_pred = knn.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    scores.append(acc)

plt.figure(figsize=(8,5))
plt.plot(k_values, scores, marker='o')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Accuracy')
plt.title('KNN Accuracy for different k values')
plt.xticks(k_values)
plt.grid(True)
plt.show()

best_k = k_values[scores.index(max(scores))]
