In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
import yfinance as yf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [5]:
# Load the stock price table (path is relative to the notebook's working directory).
data = pd.read_csv('top_50_stocks_data_formatted.csv')

# Columns used as model inputs; the preprocessing ColumnTransformer selects by these lists.
# 'Close' is deliberately NOT a feature: it is the regression target
# (y = data['Close']), and feeding the target in as an input leaks the answer
# to the model and makes every evaluation metric meaningless.
# NOTE(review): 'Adjusted Close' is presumably derived from the same day's close;
# confirm that keeping it as a feature is acceptable for the intended use.
categorical_columns = ['Ticker']
numeric_columns = ['Open', 'High', 'Low', 'Adjusted Close', 'Volume']

In [6]:
# Preprocessing: standardise the numeric columns and one-hot encode the categorical ones.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# handle_unknown='ignore' keeps transform from raising on categories unseen during fit.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group to its own transformer.
column_routes = [
    ('num', numeric_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [7]:
# Full model: preprocessing followed by a multi-layer perceptron regressor.
# The step names ('preprocessor', 'mlp') are what the 'mlp__*' grid keys refer to.
mlp_regressor = MLPRegressor(random_state=42, max_iter=500)
model2 = Pipeline([
    ('preprocessor', preprocessor),
    ('mlp', mlp_regressor),
])

In [8]:
# Specifying the grid for hyperparameter tuning.
# Keys use the '<step>__<param>' form so they address the 'mlp' pipeline step.
hidden_layer_options = [(30, 30, 30), (50, 50, 50), (50, 100, 50)]
param_grid = dict(
    mlp__hidden_layer_sizes=hidden_layer_options,
    mlp__activation=['tanh', 'relu'],
    mlp__solver=['sgd', 'adam'],
)

In [9]:
# Exhaustive search over param_grid, scoring each combination with 5-fold cross-validation.
grid_search = GridSearchCV(estimator=model2, param_grid=param_grid, cv=5)

In [10]:
# Build the feature matrix (categorical columns first, then numeric) and the target vector.
feature_columns = categorical_columns + numeric_columns
X = data[feature_columns]
y = data['Close']

In [11]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Run the grid search on the training split (fits every parameter combination per fold).
grid_search.fit(X=X_train, y=y_train)

In [None]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num', Pipeline(steps=[('scaler', StandardScaler())]),
                                                                         numerical_columns),
                                                                        ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]),
                                                                         categorical_columns)])),
                                       ('mlp', MLPRegressor(max_iter=500, random_state=42))]),
             param_grid={'mlp__activation': ['tanh', 'relu'],
                         'mlp__hidden_layer_sizes': [(30, 30, 30), (50, 50, 50), (50, 100, 50)],

In [None]:
# Report the hyperparameter combination selected by the grid search.
print(f"Best parameters: {grid_search.best_params_}")

# Predict on the held-out test features; later cells (metrics, plot) read y_pred.
y_pred = grid_search.predict(X=X_test)

In [12]:
# Mean squared error of the predictions against the held-out targets.
# Requires y_pred from the prediction cell to have been run first.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

NameError: name 'y_pred' is not defined

In [None]:
# Plot actual vs predicted closing prices for the held-out test set.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)

# Reference line y = x (perfect prediction), drawn once between the extremes
# of the actual values instead of through every unsorted test point.
price_min, price_max = y_test.min(), y_test.max()
ax.plot([price_min, price_max], [price_min, price_max], color='r')

ax.set_xlabel('Actual Prices')
ax.set_ylabel('Predicted Prices')
# The data set holds stock prices (target column 'Close'), not house prices.
ax.set_title('Actual vs Predicted Stock Prices')
plt.show()