<h1 align="center">Part 1: Preprocessing, Feature Engineering, and Predictive Pipeline for Gold Futures Dataset</h1>


In [None]:
import skfuzzy as fuzz
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras.layers import Dense, LeakyReLU, BatchNormalization
from keras.optimizers import Adam
from sklearn.metrics import mean_squared_error
from keras import initializers
# Compile the GAN
from keras.models import Model
from keras.layers import Input

In [None]:
# Read the gold futures workbook (first sheet) into the working DataFrame.
df = pd.read_excel(io="gold_futures.xlsx")

In [None]:
# Show the loaded frame as the cell output to eyeball the columns and date range.
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Price Movement
0,2020-01-02,1518.099976,1528.699951,1518.000000,1524.500000,1524.500000,214,Up
1,2020-01-03,1530.099976,1552.699951,1530.099976,1549.199951,1549.199951,107,Up
2,2020-01-06,1580.000000,1580.000000,1560.400024,1566.199951,1566.199951,416,Up
3,2020-01-07,1558.300049,1576.300049,1558.300049,1571.800049,1571.800049,47,Up
4,2020-01-08,1579.699951,1604.199951,1552.300049,1557.400024,1557.400024,236,Down
...,...,...,...,...,...,...,...,...
1240,2024-12-10,2662.300049,2698.199951,2661.000000,2697.600098,2697.600098,437,Up
1241,2024-12-11,2701.800049,2733.800049,2693.100098,2733.800049,2733.800049,3387,Up
1242,2024-12-12,2725.100098,2725.100098,2677.399902,2687.500000,2687.500000,2365,Down
1243,2024-12-13,2688.199951,2689.300049,2647.899902,2656.000000,2656.000000,1125,Down


# Preprocessing the Data

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how="any")

In [None]:
# Per-column count of missing values remaining after the drop.
df.isna().sum()

Date              0
Open              0
High              0
Low               0
Close             0
Adj Close         0
Volume            0
Price Movement    0
dtype: int64

In [None]:
# List the column labels available for feature engineering.
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Price Movement'],
      dtype='object')

In [None]:
# Print dtypes, non-null counts and memory usage for each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1245 entries, 0 to 1244
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            1245 non-null   datetime64[ns]
 1   Open            1245 non-null   float64       
 2   High            1245 non-null   float64       
 3   Low             1245 non-null   float64       
 4   Close           1245 non-null   float64       
 5   Adj Close       1245 non-null   float64       
 6   Volume          1245 non-null   int64         
 7   Price Movement  1245 non-null   object        
dtypes: datetime64[ns](1), float64(5), int64(1), object(1)
memory usage: 77.9+ KB


In [None]:
# Make sure Date is a datetime column (harmless if it already has a datetime dtype).
df['Date'] = pd.to_datetime(df['Date'])

# Forward-fill any remaining gaps. DataFrame.ffill() is the supported spelling;
# fillna(method='ffill') is deprecated and raises a FutureWarning on recent pandas.
df = df.ffill()

# --- Engineered features -------------------------------------------------
df['Price_Change'] = df['Close'] - df['Open']  # Price change in a day
df['Price_Movement'] = np.where(df['Price_Change'] > 0, 1, 0)  # Label: Up=1, Down=0
df['Volatility'] = df['High'] - df['Low']  # Volatility of the day
df['Moving_Avg'] = df['Close'].rolling(window=5).mean()  # 5-Day Moving Average

# The 5-day rolling mean is NaN for the first four rows; drop them.
# Rebinding instead of inplace=True keeps the step free of hidden mutation.
df = df.dropna()

# NOTE(review): the label is defined as Close > Open, and both 'Open' and 'Close'
# are part of the feature matrix, so the target can be recovered directly from the
# inputs. The reported accuracies therefore measure how well a model reconstructs
# that rule, not how well it forecasts. Confirm whether the intended target is a
# future (e.g. next-day) movement before relying on these scores.
X = df[['Open', 'High', 'Low', 'Close', 'Volume', 'Volatility', 'Moving_Avg']]
y = df['Price_Movement']

# Split BEFORE scaling so the scaler's mean/variance come from training rows only;
# fitting on the full matrix would leak test-set statistics into training.
# The same test_size/random_state select the same rows as splitting the scaled array.
# NOTE(review): train_test_split shuffles by default, which mixes past and future
# rows of a time series -- confirm a chronological split is not required.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole feature matrix scaled with the training statistics, kept under its
# original name for any later cell that reads X_scaled.
X_scaled = scaler.transform(X)

  df.fillna(method='ffill', inplace=True)


# Machine Learning Pipeline for Classification


In [None]:
# Estimate the mutual information between each (scaled) feature and the label on
# the training split. random_state pins the estimator's internal randomness so
# the ranking is reproducible across runs.
mi = mutual_info_classif(X_train, y_train, random_state=42)
mi_df = pd.DataFrame({'Feature': X.columns, 'Importance': mi})
mi_df = mi_df.sort_values(by='Importance', ascending=False)


# Two candidate classifiers trained on the same split.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
lr_classifier = LogisticRegression(max_iter=1000)


rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)

lr_classifier.fit(X_train, y_train)
y_pred_lr = lr_classifier.predict(X_test)


# Per-class precision/recall/F1 on the held-out split.
print("Random Forest Classifier Report:")
print(classification_report(y_test, y_pred_rf))
print("Logistic Regression Classifier Report:")
print(classification_report(y_test, y_pred_lr))


rf_accuracy = accuracy_score(y_test, y_pred_rf)
lr_accuracy = accuracy_score(y_test, y_pred_lr)

print(f"Random Forest Accuracy: {rf_accuracy}")
print(f"Logistic Regression Accuracy: {lr_accuracy}")


# Strict '>' means a tie selects Logistic Regression.
best_model = rf_classifier if rf_accuracy > lr_accuracy else lr_classifier
# Use identity ('is'): the question is which object was chosen, not whether two
# estimators compare equal.
print(f"Best Model: {'Random Forest' if best_model is rf_classifier else 'Logistic Regression'}")

Random Forest Classifier Report:
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       133
           1       0.79      0.74      0.76       116

    accuracy                           0.79       249
   macro avg       0.79      0.78      0.79       249
weighted avg       0.79      0.79      0.79       249

Logistic Regression Classifier Report:
              precision    recall  f1-score   support

           0       0.90      0.91      0.90       133
           1       0.89      0.88      0.89       116

    accuracy                           0.90       249
   macro avg       0.90      0.89      0.89       249
weighted avg       0.90      0.90      0.90       249

Random Forest Accuracy: 0.7871485943775101
Logistic Regression Accuracy: 0.8955823293172691
Best Model: Logistic Regression


# Summary of Steps:
- **Preprocessing:** Clean and feature-engineer the dataset (handling missing values and creating new features such as price change and volatility).
- **GAN:** Build and train a GAN for data augmentation, experimenting with different parameters.
- **Modeling:** Train machine learning models (Random Forest and Logistic Regression) with advanced feature engineering.
- **Evaluation:** Compare the performance of the models with and without feature engineering and select the best-performing model.

# Best Model: Logistic Regression
```text
Logistic Regression Classifier Report:
              precision    recall  f1-score   support

           0       0.90      0.91      0.90       133
           1       0.89      0.88      0.89       116

    accuracy                           0.90       249
   macro avg       0.90      0.89      0.89       249
weighted avg       0.90      0.90      0.90       249
```

- **Random Forest Accuracy:** 0.7871485943775101
- **Logistic Regression Accuracy:** 0.8955823293172691
- **Best Model:** Logistic Regression