In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import ttest_ind, f_oneway
import os

2025-02-17 13:48:05.148970: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-17 13:48:05.149091: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-17 13:48:05.294135: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Define file path
file_path = "/kaggle/input/time-series-forecasting-with-yahoo-stock-price/yahoo_stock.csv"

In [3]:
# Load dataset
df = pd.read_csv(file_path)

In [4]:
df.head()

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2015-11-23,2095.610107,2081.389893,2089.409912,2086.590088,3587980000.0,2086.590088
1,2015-11-24,2094.120117,2070.290039,2084.419922,2089.139893,3884930000.0,2089.139893
2,2015-11-25,2093.0,2086.300049,2089.300049,2088.870117,2852940000.0,2088.870117
3,2015-11-26,2093.0,2086.300049,2089.300049,2088.870117,2852940000.0,2088.870117
4,2015-11-27,2093.290039,2084.129883,2088.820068,2090.110107,1466840000.0,2090.110107


In [5]:
# Convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])


In [6]:
# Set Date as index
df.set_index('Date', inplace=True)

In [7]:
# EDA: Check for missing values
print("Missing values:\n", df.isnull().sum())

Missing values:
 High         0
Low          0
Open         0
Close        0
Volume       0
Adj Close    0
dtype: int64


In [8]:
# EDA: Descriptive statistics
df.describe()

Unnamed: 0,High,Low,Open,Close,Volume,Adj Close
count,1825.0,1825.0,1825.0,1825.0,1825.0,1825.0
mean,2660.718673,2632.81758,2647.704751,2647.856284,3869627000.0,2647.856284
std,409.680853,404.310068,407.169994,407.301177,1087593000.0,407.301177
min,1847.0,1810.099976,1833.400024,1829.079956,1296540000.0,1829.079956
25%,2348.350098,2322.25,2341.97998,2328.949951,3257950000.0,2328.949951
50%,2696.25,2667.840088,2685.48999,2683.340088,3609740000.0,2683.340088
75%,2930.790039,2900.709961,2913.860107,2917.52002,4142850000.0,2917.52002
max,3645.98999,3600.159912,3612.090088,3626.909912,9044690000.0,3626.909912


In [9]:
import plotly.io as pio
pio.renderers.default = "iframe"

In [10]:
# EDA: Composition
# Plotting closing price over time
fig = px.line(df, x=df.index, y='Close', title='Closing Price Over Time')
fig.update_layout(title_text='Closing Price Over Time', title_x=0.5, template='plotly_dark')
fig.show()

In [11]:
# EDA: Distribution
fig = make_subplots(rows=3, cols=2, subplot_titles=('Close', 'Volume', 'Open', 'High', 'Low', 'Adj Close'))
fig.add_trace(go.Histogram(x=df['Close'], name='Close'), row=1, col=1)
fig.add_trace(go.Histogram(x=df['Volume'], name='Volume'), row=1, col=2)
fig.add_trace(go.Histogram(x=df['Open'], name='Open'), row=2, col=1)
fig.add_trace(go.Histogram(x=df['High'], name='High'), row=2, col=2)
fig.add_trace(go.Histogram(x=df['Low'], name='Low'), row=3, col=1)
fig.add_trace(go.Histogram(x=df['Adj Close'], name='Adj Close'), row=3, col=2)
fig.update_layout(title_text='Distribution of Features', title_x=0.5, template='plotly_dark')
fig.show()

In [12]:
# EDA: Relationship
fig = px.scatter_matrix(df, dimensions=['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], title='Scatter Matrix')
fig.update_layout(title_text='Scatter Matrix', title_x=0.5, template='plotly_dark')
fig.show()

In [13]:
# EDA: Comparison
# Correlation matrix
correlation = df.corr()
fig = px.imshow(correlation, text_auto=True, title='Correlation Matrix')
fig.update_layout(title_text='Correlation Matrix', title_x=0.5, template='plotly_dark')
fig.show()

In [14]:
# Statistical Tests
# T-test comparing 'High' and 'Low' prices
t_stat, p_value = ttest_ind(df['High'], df['Low'])
t_test_result = {
    'Statistic': [t_stat],
    'p-value': [p_value]
}
t_test_df = pd.DataFrame(t_test_result)
t_test_df

Unnamed: 0,Statistic,p-value
0,2.070802,0.038447


In [15]:
# Display T-test result
fig = px.bar(t_test_df, x=t_test_df.index, y=['Statistic', 'p-value'], barmode='group', title='T-test Result: High vs Low Prices')
fig.update_layout(title_text='T-test Result: High vs Low Prices', title_x=0.5, template='plotly_dark')
fig.show()





In [70]:
# ANOVA test for 'Open', 'High', 'Low', 'Close' prices
anova_stat, anova_p_value = f_oneway(df['Open'], df['High'], df['Low'], df['Close'])
anova_result = {
    'Statistic': [anova_stat],
    'p-value': [anova_p_value]
}
anova_df = pd.DataFrame(anova_result)
anova_df

Unnamed: 0,Statistic,p-value
0,1.432399,0.231203


In [71]:
# Display ANOVA result
fig = px.bar(anova_df, x=anova_df.index, y=['Statistic', 'p-value'], barmode='group', title='ANOVA Test Result: Open, High, Low, Close Prices')
fig.update_layout(title_text='ANOVA Test Result: Open, High, Low, Close Prices', title_x=0.5, template='plotly_dark')
fig.show()





In [16]:
# Data preprocessing: Feature scaling
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df['Close'].values.reshape(-1, 1))

In [17]:
# Prepare the data for LSTM
def create_dataset(data, time_step=1):
    X, y = [], []
    for i in range(len(data) - time_step - 1):
        a = data[i:(i + time_step), 0]
        X.append(a)
        y.append(data[i + time_step, 0])
    return np.array(X), np.array(y)

# Define time step
time_step = 60

In [18]:
train_size = int(len(scaled_data) * 0.7)  # 70% pour l'entraînement
val_size = int(len(scaled_data) * 0.1)    # 10% pour la validation
test_size = len(scaled_data) - train_size - val_size  # 20% pour le test

# Création des ensembles de données
train_data = scaled_data[0:train_size, :]
val_data = scaled_data[train_size:train_size + val_size, :]
test_data = scaled_data[train_size + val_size:, :]

# Création des datasets pour LSTM
X_train, y_train = create_dataset(train_data, time_step)
X_val, y_val = create_dataset(val_data, time_step)
X_test, y_test = create_dataset(test_data, time_step)

# Reshape des entrées pour LSTM [samples, time steps, features]
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)


In [32]:
import optuna
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Fonction d'objectif pour Optuna
def objective(trial):
    # Définition des hyperparamètres à optimiser
    num_layers = trial.suggest_int("num_layers", 1, 3)  # Nombre de couches LSTM
    units = trial.suggest_int("units", 10, 100)  # Nombre de neurones par couche
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)  # Dropout
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)  # Taux d'apprentissage
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])  # Batch size
    
    # Construction du modèle LSTM
    model = Sequential()
    for i in range(num_layers):
        return_sequences = i < (num_layers - 1)  # True sauf pour la dernière couche
        model.add(LSTM(units, return_sequences=return_sequences, input_shape=(time_step, 1)))
        model.add(Dropout(dropout_rate))

    model.add(Dense(25, activation="relu"))
    model.add(Dense(1))

    # Compilation du modèle
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_error')

    # Entraînement du modèle
    history = model.fit(X_train, y_train, batch_size=batch_size, epochs=5, verbose=0, validation_data=(X_val, y_val))

    # Retourne la perte sur les données de validation
    return history.history["val_loss"][-1]

# Lancer l'optimisation
study = optuna.create_study(direction="minimize")  # On veut minimiser la perte
study.optimize(objective, n_trials=20)

# Meilleurs hyperparamètres trouvés
print("Meilleurs hyperparamètres :", study.best_params)


[I 2025-02-17 14:03:05,941] A new study created in memory with name: no-name-81b07292-5239-4939-b8d8-20ea31e1635f

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.

[W 2025-02-17 14:03:06,031] Trial 0 failed with parameters: {'num_layers': 3, 'units': 33, 'dropout_rate': 0.35326005858196696, 'learning_rate': 4.244855075963829e-05, 'batch_size': 16} because of the following error: ValueError("Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 1216\n'y' sizes: 1\n").
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_34/3563206081.py", line 31, in objective
    history = model.fit(X_train, y_train, batch_size=batch_size, epochs=5, verbose=0, validation_data=(X_val,

ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 1216
'y' sizes: 1


In [20]:
# Build the LSTM model
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(time_step, 1)))
model.add(Dropout(0.2))
model.add(LSTM(50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(50))
model.add(Dropout(0.2))
model.add(Dense(25))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

In [21]:
# Train the model
history = model.fit(X_train, y_train, batch_size=1, epochs=10)

Epoch 1/10
[1m1216/1216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - loss: 0.0070
Epoch 2/10
[1m1216/1216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 9ms/step - loss: 0.0015
Epoch 3/10
[1m1216/1216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - loss: 0.0011
Epoch 4/10
[1m1216/1216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - loss: 0.0013
Epoch 5/10
[1m1216/1216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 9ms/step - loss: 0.0012
Epoch 6/10
[1m1216/1216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - loss: 0.0011
Epoch 7/10
[1m1216/1216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - loss: 0.0010
Epoch 8/10
[1m1216/1216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 9ms/step - loss: 9.5704e-04
Epoch 9/10
[1m1216/1216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - loss: 0.0010
Epoch 10/10
[1m1216/1216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [22]:
# Predictions
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step


In [23]:
# Inverse transform to get actual values
train_predict = scaler.inverse_transform(train_predict)
test_predict = scaler.inverse_transform(test_predict)
y_train = scaler.inverse_transform([y_train])
y_test = scaler.inverse_transform([y_test])

In [None]:
# Calculate RMSE and MAE
train_rmse = np.sqrt(mean_squared_error(y_train[0], train_predict[:, 0]))
train_mae = mean_absolute_error(y_train[0], train_predict[:, 0])
test_rmse = np.sqrt(mean_squared_error(y_test[0], test_predict[:, 0]))
test_mae = mean_absolute_error(y_test[0], test_predict[:, 0])

print(f'Train RMSE: {train_rmse}, Train MAE: {train_mae}')
print(f'Test RMSE: {test_rmse}, Test MAE: {test_mae}')

In [None]:
# Plotting the results
train_data_len = len(train_data)

train_plot = np.empty_like(scaled_data)
train_plot[:, :] = np.nan
train_plot[time_step:len(train_predict)+time_step, :] = train_predict

test_plot = np.empty_like(scaled_data)
test_plot[:, :] = np.nan
test_plot[len(train_predict)+(time_step*2)+1:len(scaled_data)-1, :] = test_predict

In [None]:
# Create plot
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df['Close'], mode='lines', name='Actual Price', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=df.index, y=train_plot[:, 0], mode='lines', name='Train Predict', line=dict(color='green')))
fig.add_trace(go.Scatter(x=df.index, y=test_plot[:, 0], mode='lines', name='Test Predict', line=dict(color='red')))
fig.update_layout(title='Stock Price Prediction',
                   xaxis_title='Date',
                   yaxis_title='Stock Price',
                   template='plotly_dark')
fig.show()


In [None]:
def predict_future(model, data, scaler, time_step=60, future_steps=30):
    # Prepare the input data for future prediction
    last_data = data[-time_step:]
    last_data = last_data.reshape(1, time_step, 1)

    # Predict future prices
    future_predictions = []
    for _ in range(future_steps):
        next_pred = model.predict(last_data)
        future_predictions.append(next_pred[0, 0])
        last_data = np.append(last_data[:, 1:, :], [[[next_pred[0, 0]]]], axis=1)

    # Inverse transform the predictions to get actual values
    future_predictions = np.array(future_predictions).reshape(-1, 1)
    future_predictions = scaler.inverse_transform(future_predictions)

    return future_predictions

# Example usage
future_steps = 30
future_predictions = predict_future(model, scaled_data, scaler, time_step, future_steps)

# Create future dates for plotting
last_date = df.index[-1]
future_dates = pd.date_range(start=last_date, periods=future_steps + 1, inclusive='right')

# Plotting future predictions
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df['Close'], mode='lines', name='Actual Price', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=future_dates, y=future_predictions[:, 0], mode='lines', name='Future Predict', line=dict(color='orange')))
fig.update_layout(title='Future Stock Price Prediction',
                   xaxis_title='Date',
                   yaxis_title='Stock Price',
                   template='plotly_dark')
fig.show()


In [None]:
# Save output in /kaggle/working
output_dir = '/kaggle/working'
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir, 'yahoo_stock_processed.csv'))

In [None]:

# Save the model
model.save(os.path.join(output_dir, 'stock_price_lstm_model.h5'))