import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Preprocessing data


def _clean_dates(df):
    """Return a copy of ``df`` whose 'Date' column holds plain calendar dates.

    Values are parsed as UTC timestamps. Entries that cannot be parsed become
    NaT (``errors='coerce'``) and their rows are dropped; the time-of-day part
    is then discarded with ``.dt.date``.
    """
    parsed = pd.to_datetime(df['Date'], errors='coerce', utc=True)
    valid = parsed.notna()
    # Copy before assigning so the caller's frame is left untouched and pandas
    # does not warn about writing into a filtered slice.
    cleaned = df.loc[valid].copy()
    cleaned['Date'] = parsed[valid].dt.date
    return cleaned


# Apply the same parse / drop-invalid / date-only treatment to all three
# inputs, so a malformed date in any of them is dropped instead of raising.
stock_df = _clean_dates(stock_df)
currency_df = _clean_dates(currency_df)
indices_df = _clean_dates(indices_df)

# Merge csv files using outer join
merged_df = pd.merge(stock_df, currency_df, on='Date', how='outer')
merged_df = pd.merge(merged_df, indices_df, on='Date', how='outer')

# Use only the relevant columns and give them descriptive names.
# 'Close_x' / 'Close_y' carry the default suffixes pandas adds when both
# sides of a merge share a column name (presumably 'Close' in stock_df and
# currency_df -- confirm against the input files).
column_names = {
    'Date': 'Date',
    'Close_x': 'Stock_Price',
    'Close_y': 'Currency_Rate',
    'Total_Sales_EUR_Million': 'Total_Sales',
    'EPS': 'EPS',
}
merged_df = merged_df[list(column_names)].rename(columns=column_names)

# Ensure the 'Date' column is in datetime format and set as index
merged_df['Date'] = pd.to_datetime(merged_df['Date'], errors='coerce')
# Later shift()-based lag features depend on chronological row order, so make
# that order explicit (a stable sort keeps rows sharing a date in place).
merged_df = merged_df.set_index('Date').sort_index(kind='stable')

# Create lag features for the columns 'Stock_Price', 'Currency_Rate', 'Total_Sales', 'EPS'
N_LAGS = 30  # number of previous rows exposed to the model per column
LAG_SOURCES = ['Stock_Price', 'Currency_Rate', 'Total_Sales', 'EPS']

# Build every shifted column first and attach them with one concat. Inserting
# 120 columns one at a time fragments the frame and makes pandas emit a
# PerformanceWarning.
lagged_columns = {
    f'{column}_Lag{i}': merged_df[column].shift(i)
    for i in range(1, N_LAGS + 1)  # For each lag from 1 to N_LAGS
    for column in LAG_SOURCES
}
merged_df = pd.concat([merged_df, pd.DataFrame(lagged_columns)], axis=1)

# Drop rows with NaN values generated by the shifting (so the model doesn't use incomplete data)
merged_df = merged_df.dropna()

# dropna() above already guarantees X and y contain no NaN; what can still go
# wrong is that no row survives (too little history, or gaps left by the
# outer join), so fail here with a clear message.
if merged_df.empty:
    raise ValueError(
        "No complete rows remain after creating lag features; "
        f"more than {N_LAGS} consecutive rows without missing values are required."
    )

# Define the lag features, grouped per source column (all Stock_Price lags,
# then all Currency_Rate lags, ...); this is the column order of X.
lag_features = [
    f'{column}_Lag{lag}'
    for column in LAG_SOURCES
    for lag in range(1, N_LAGS + 1)
]

# Prepare the data for training
X = merged_df[lag_features]
y = merged_df['Stock_Price']  # Or whichever target you want to predict

# Hold out the tail of the data for testing: shuffle=False keeps the row
# order, so the last 20% of rows form the test set.
# (train_test_split, LinearRegression, the metrics and plt are already
# imported at the top of the file.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Fit a linear regression on the training rows; fit() returns the estimator.
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score the predictions
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

# Draw actual and predicted values of the test set on shared axes
fig, ax = plt.subplots(figsize=(10, 6))
for series, label in ((y_test, 'Actual'), (y_pred, 'Predicted')):
    ax.plot(y_test.index, series, label=label)
ax.legend()
plt.show()

# In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt

# Read the merged data set and parse its Date column into datetimes
merged_df = pd.read_csv('merged_data.csv')
merged_df['Date'] = pd.to_datetime(merged_df['Date'])

# Lag-column prefix -> source column it is derived from
lag_sources = {
    'Stock': 'Close Stock Price',
    'Currency': 'Close Currency Rate',
    'Sales': 'Total_Sales_EUR_Million',
    'EPS': 'EPS',
}

# Feature engineering: for each lag 1..30, shift every source column down by
# that many rows (lag is the outer loop, so columns are interleaved per lag).
lag_features = {
    f'{prefix}_Lag{lag}': merged_df[column].shift(lag)
    for lag in range(1, 31)
    for prefix, column in lag_sources.items()
}

# Attach all lag columns at once, then discard rows that contain NaN
# (including the leading rows the shifts leave incomplete).
lag_df = pd.DataFrame(lag_features)
merged_df = pd.concat([merged_df, lag_df], axis=1)
merged_df = merged_df.dropna()

# Model inputs are grouped per series (all Stock lags, then Currency, Sales,
# EPS); the target is the unshifted stock close.
target = 'Close Stock Price'
features = [
    f'{prefix}_Lag{lag}'
    for prefix in lag_sources
    for lag in range(1, 31)
]

X = merged_df[features]
y = merged_df[target]

# Train-test split
# Every feature is a lag of earlier rows, so a shuffled split would put test
# rows between training rows whose lag columns contain the test targets
# (look-ahead leakage) and inflate the scores. Keep row order and hold out the
# final 20% instead. This assumes the CSV rows are in chronological order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Model training
model = LinearRegression()
model.fit(X_train, y_train)

# Model prediction
y_pred = model.predict(X_test)

# Model evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean-Squared Error: ", mse)
print("R-squared Score: ", r2)

# Predict the stock price for the next 30 days
last_row = merged_df.iloc[-1]
n_lags = 30   # lags per series the model was trained on
horizon = 30  # number of business days to forecast

# Forecast dates start on the first business day AFTER the last observation,
# so the last known date is not forecast again.
future_dates = pd.date_range(
    start=last_row['Date'] + pd.offsets.BDay(1), periods=horizon, freq='B'
)

# Lag-column prefix -> column holding the observed value of that series
value_columns = {
    'Stock': 'Close Stock Price',
    'Currency': 'Close Currency Rate',
    'Sales': 'Total_Sales_EUR_Million',
    'EPS': 'EPS',
}

# One rolling window per series, most recent value first (position k-1 is
# "Lag k"). For the first forecast day, Lag1 is the last observed value and
# Lag k is the last row's Lag(k-1).
lag_windows = {
    prefix: [last_row[column]]
    + [last_row[f'{prefix}_Lag{k}'] for k in range(1, n_lags)]
    for prefix, column in value_columns.items()
}

# Predict future stock prices iteratively
feature_rows = []
future_predictions = []
for _ in range(horizon):
    row = {
        f'{prefix}_Lag{k}': window[k - 1]
        for prefix, window in lag_windows.items()
        for k in range(1, n_lags + 1)
    }
    feature_rows.append(row)

    # Pass named columns in the training order (`features`) so each value
    # reaches the coefficient that was fitted for it.
    prediction = model.predict(pd.DataFrame([row], columns=features))[0]
    future_predictions.append(prediction)

    # Slide every window forward by one day. The stock series takes the new
    # prediction; the other series have no future observations, so their most
    # recent value is carried forward.
    for prefix, window in lag_windows.items():
        newest = prediction if prefix == 'Stock' else window[0]
        window.insert(0, newest)
        window.pop()

# Collect the inputs used at each step and add predictions to future_data
future_data = pd.DataFrame(feature_rows, index=future_dates, columns=features)
future_data['Predicted Stock Price'] = future_predictions

# Show the historical closes and the dashed 30-day forecast on one set of axes
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(
    merged_df['Date'],
    merged_df['Close Stock Price'],
    label='Actual Stock Price',
)
ax.plot(
    future_data.index,
    future_data['Predicted Stock Price'],
    label='Predicted Stock Price',
    linestyle='--',
)
ax.set_xlabel('Date')
ax.set_ylabel('Stock Price')
ax.set_title('Stock Price Prediction for the Next 30 Days')
ax.legend()
ax.grid(True)
plt.show()


# Output of the cell above when 'merged_data.csv' is missing from the working directory:
# FileNotFoundError: [Errno 2] No such file or directory: 'merged_data.csv'