In [1]:
LASSO REGRESSION : SQL DATABASE (Ms

NameError: name 'RIDGE' is not defined

#### 1. Importing Libraries

In [None]:
import os
import sqlite3
import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
from tabulate import tabulate

#### 2. Connecting to the Database

In [None]:
def connect_database(database_name):
    """Open a SQLite database and return the connection with a new cursor.

    Parameters
    ----------
    database_name : str
        Path of the SQLite file (or ":memory:") passed to ``sqlite3.connect``.

    Returns
    -------
    tuple
        ``(connection, cursor)`` where the cursor was created from the connection.
    """
    connection = sqlite3.connect(database_name)
    return connection, connection.cursor()

In [None]:
# SQLite file that holds the stock price tables
database = "stocks_master.db"
# open the connection and a cursor through the connect_database helper
conn, cursor = connect_database(database)

In [None]:
def find_tables(conn):
    """Return the names of all tables stored in the SQLite database.

    Parameters
    ----------
    conn : sqlite3.Connection
        Open connection to query.

    Returns
    -------
    list
        Table names read from ``sqlite_master``.
    """
    table_cursor = conn.cursor()
    try:
        table_cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        return [name for (name,) in table_cursor.fetchall()]
    finally:
        # release the helper cursor even if the query fails
        table_cursor.close()


if conn and cursor:
    try:
        tables_in_database = find_tables(conn)
        print(tables_in_database)
    finally:
        # close the connection even when the lookup raises, so the file handle is not leaked
        conn.close()
else:
    print("Exiting due to connection error.")

In [None]:
# peek at the first table name returned by the sqlite_master lookup
tables_in_database[0]

#### 3. Feature Selection  

In [None]:
# open a new connection: the earlier `conn` was closed after listing the tables
master_database_name = "stocks_master.db"
master_conn = sqlite3.connect(master_database_name)

In [None]:
if master_conn:
    # number of most-recent rows previewed per table (kept in sync with the printed heading)
    preview_row_count = 10
    preview_headers = ["Date", "Open", "High", "Low", "Close", "Adjusted Close", "Volume"]
    #tables to query
    tables_to_query = ['stock_prices_aapl', 'stock_prices_msft']
    #iterate over the tables in the master database
    for table_name in tables_to_query:
        #new cursor for the query
        query_cursor = master_conn.cursor()
        # "Adj Close" is double-quoted so SQLite reads it as a column identifier;
        # in single quotes it is a string literal and every row would show the
        # constant text 'Adj Close' instead of the adjusted closing price.
        query = (
            'SELECT Date, Open, High, Low, Close, "Adj Close" AS "Adjusted Close", Volume '
            f"FROM {table_name} ORDER BY Date DESC LIMIT {preview_row_count}"
        )
        query_cursor.execute(query)
        data = query_cursor.fetchall()
        #extracting columns from the data
        result_list = [list(row) for row in data]
        #tabular format
        print(f"\nLatest {preview_row_count} rows of {table_name}:\n")
        print(tabulate(result_list, headers=preview_headers, tablefmt="grid"))
else:
    print("Master connection is closed.")

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

In [None]:
def fetch_closing_prices_and_dates(table_name, symbol, limit=900, conn=None):
    """Fetch the most recent dates and closing prices from a price table.

    Parameters
    ----------
    table_name : str
        Name of the table to read; it must expose ``Date`` and ``Close`` columns.
    symbol : str
        Ticker label. Not used by the query; kept so existing calls keep working.
    limit : int, optional
        Maximum number of most-recent rows to return (default 900).
    conn : sqlite3.Connection, optional
        Connection to query. Defaults to the notebook-level ``master_conn``.

    Returns
    -------
    tuple of (list, list)
        ``(dates, closing_prices)`` ordered from newest to oldest, where
        ``closing_prices[i]`` is the ``Close`` value for ``dates[i]``.
    """
    connection = master_conn if conn is None else conn
    # Identifiers cannot be bound as parameters, so quote the table name explicitly.
    quoted_table = '"' + table_name.replace('"', '""') + '"'
    # Select only the two needed columns so the Close price is read directly;
    # the previous positional lookup picked up the Open column instead.
    query = f"SELECT Date, Close FROM {quoted_table} ORDER BY Date DESC LIMIT ?"
    query_cursor = connection.cursor()
    try:
        query_cursor.execute(query, (int(limit),))
        data = query_cursor.fetchall()
    finally:
        query_cursor.close()
    dates = [row[0] for row in data]
    closing_prices = [float(row[1]) for row in data]
    return dates, closing_prices

In [None]:
# table supplying the predictor (x) series
x_table_name = 'stock_prices_msft'
# table supplying the target (y) series
y_table_name = 'stock_prices_aapl'

In [None]:
# fetch the (dates, prices) pair of lists for each ticker's table
x_dates, x_closing_prices = fetch_closing_prices_and_dates(x_table_name, 'MSFT')
y_dates, y_closing_prices = fetch_closing_prices_and_dates(y_table_name, 'AAPL')

In [None]:
# dates that appear in both series, so the two price lists can be aligned
common_dates = set(x_dates).intersection(y_dates)

In [None]:
# Sort the shared dates so the rows have the same, chronological order on every
# run. Iterating the raw set follows string-hash order, which differs between
# kernel sessions and would change the train/test split despite a fixed seed.
ordered_common_dates = sorted(common_dates)
# One-pass date -> price lookups replace a list.index() scan per date.
# Building from the reversed lists keeps the first occurrence of a date,
# which is what list.index() returned.
x_price_by_date = dict(zip(reversed(x_dates), reversed(x_closing_prices)))
y_price_by_date = dict(zip(reversed(y_dates), reversed(y_closing_prices)))
X_closing_prices = [x_price_by_date[date] for date in ordered_common_dates]
# NOTE: this overwrites the fetched y_closing_prices; re-run the fetch cell
# before re-running this one so the list is parallel to y_dates again.
y_closing_prices = [y_price_by_date[date] for date in ordered_common_dates]

In [None]:
# both aligned series should report the same number of observations
n_x_points, n_y_points = len(X_closing_prices), len(y_closing_prices)
print(f"Length of X: {n_x_points}")
print(f"Length of Y: {n_y_points}")

In [None]:
# raw relationship between the two aligned price series
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X_closing_prices, y_closing_prices, color='black', alpha=0.5)
ax.set_title("Scatter Plot between Microsoft and Apple")
ax.set_xlabel("MSFT")
ax.set_ylabel("AAPL")
ax.grid(True, ls='--', color='black', alpha=0.2)
plt.show()

In [None]:
# single-feature design matrix as a column vector, plus a flat target vector
X = np.array(X_closing_prices)[:, np.newaxis]
y = np.asarray(y_closing_prices)

#### 4. Feature Scaling 

In [None]:
#Z-scores
# standardise the feature by subtracting its mean and dividing by its standard deviation
# NOTE(review): the mean and std are computed over the full dataset, before the
# train/test split, so held-out rows influence the scaling — confirm this is intended
X = (X - X.mean()) / X.std()

In [None]:
#splitting data into train and test size
# hold out 20% of the rows for testing; random_state=0 fixes the shuffle seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
#polynomial regression
# expand the scaled feature into polynomial terms up to `degree`
degree = 4
poly_features = PolynomialFeatures(degree=degree)
# fit the expansion on the training rows, then apply the same transform to the test rows
X_poly_train = poly_features.fit_transform(X_train)
X_poly_test = poly_features.transform(X_test)

#### 5. Model Selection

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Lasso model trained on the polynomial features; alpha is the regularisation
# strength and max_iter caps the number of solver iterations
lasso_model = Lasso(alpha=0.001, max_iter=50000)  
lasso_model.fit(X_poly_train, y_train)

In [None]:
# 100 evenly spaced points across the observed test range, shaped (100, 1) for
# the polynomial transformer. ndarray.min()/max() reduce over the whole array;
# the builtin min()/max() would iterate row by row and compare 1-element arrays.
line_test = np.linspace(X_test.min(), X_test.max(), 100).reshape(-1, 1)

In [None]:
# apply the already-fitted polynomial expansion to the plotting grid
line_poly = poly_features.transform(line_test)

In [None]:
# model predictions along the plotting grid (drawn as the fitted curve; the
# evaluation metrics use predictions on X_poly_test instead)
y_pred = lasso_model.predict(line_poly)

In [None]:
# preview the first five predicted values
y_pred[:5]

#### 6. Best Fit Line (Predictive Analysis)

In [None]:
# train/test observations with the fitted Lasso curve drawn on top
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X_train, y_train, color='blue', alpha=0.7, label='Train')
ax.scatter(X_test, y_test, color='black', alpha=0.7, label='Test')
ax.plot(line_test, y_pred, color='red', label='Lasso Regression')
ax.set_xlabel("MSFT Closing Prices")
ax.set_ylabel("AAPL Closing Prices")
ax.grid(True, ls='--', color='black', alpha=0.2)
ax.legend()
plt.show()

#### 7. Model Evaluation 

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# predict once on the held-out features and reuse the result for every metric
y_test_pred = lasso_model.predict(X_poly_test)
#R-squared for the model
r2 = r2_score(y_test, y_test_pred)
#mean squared error (MSE) for the model
mse = mean_squared_error(y_test, y_test_pred)
#RMSE for the model (square root of the MSE computed above)
rmse = np.sqrt(mse)
#MAE for the model
mae = mean_absolute_error(y_test, y_test_pred)
print(f'R-squared: {r2}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')

In [None]:
#residuals
# observed minus predicted target values on the held-out test rows
residuals = y_test - lasso_model.predict(X_poly_test)

In [None]:
# residuals plotted against the predicted values, with a zero reference line
fig, ax = plt.subplots(figsize=(6, 5))
test_predictions = lasso_model.predict(X_poly_test)
ax.scatter(test_predictions, residuals, color='blue', marker='o', s=40, alpha=0.6)
ax.set_title('Residual Plot for LASSO')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.axhline(y=0, color='r', linestyle='--', linewidth=2)
ax.grid(True, ls='--', color='black', alpha=0.2)
plt.show()

In [None]:
from scipy.stats import probplot

In [None]:
#QQ plot
# probability plot of the test residuals, drawn through pyplot via plot=plt
plt.figure(figsize=(6,5))
probplot(residuals, plot=plt)
# set the title and axis labels after the plot is drawn
plt.title('QQ Plot for Residuals of LASSO')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Sample Quantiles')
plt.grid(True, ls='--', color='black', alpha=0.2)
plt.show()

#### References: 

1. Linear Regression in SQL: Is it Possible? [Medium](https://medium.com/swlh/linear-regression-in-sql-is-it-possible-b9cc787d622f)
2. Linear Regression in SQL [Towards Data Science](https://towardsdatascience.com/linear-regression-in-sql-62eaf4861290)
