In [1]:
import sqlite3
import pandas as pd
import yfinance as yf
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, train_test_split

#### Data Model Implementation (25 points)

In [24]:
# The data is cleaned, normalized, and standardized prior to modeling (5 points)
tesla = yf.Ticker('TSLA')
hist = tesla.history('1y')
hist[:5]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-09-14 00:00:00-04:00,271.320007,276.709991,270.420013,276.040009,107709800,0.0,0.0
2023-09-15 00:00:00-04:00,277.549988,278.980011,271.0,274.390015,133422800,0.0,0.0
2023-09-18 00:00:00-04:00,271.160004,271.440002,263.76001,265.279999,101543300,0.0,0.0
2023-09-19 00:00:00-04:00,264.350006,267.850006,261.200012,266.5,103704000,0.0,0.0
2023-09-20 00:00:00-04:00,267.040009,273.929993,262.459991,262.589996,122514600,0.0,0.0


In [121]:
# dropped Volume, Dividends, and Stock Splits
df = hist.drop(['Volume','Dividends','Stock Splits'],axis=1)

# changing date format
df.index = df.index.strftime('%Y-%m-%d %a')

# Dropping the first two rows to start on a Monday
df = df[2:]

In [3]:
# create and connect to sqlite database stock_db
db = sqlite3.connect('stock_db.sqlite')

In [None]:
# Create or replace tesla table and load df data to it.
df.to_sql('tesla',db, if_exists='replace')

In [4]:
# The model utilizes data retrieved from SQL or Spark (5 points)
# read table tesla from stock_db
df = pd.read_sql('SELECT * FROM tesla', db, index_col='Date')
df[:5]

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-09-18 Mon,271.160004,271.440002,263.76001,265.279999
2023-09-19 Tue,264.350006,267.850006,261.200012,266.5
2023-09-20 Wed,267.040009,273.929993,262.459991,262.589996
2023-09-21 Thu,257.850006,260.859985,254.210007,255.699997
2023-09-22 Fri,257.399994,257.790009,244.479996,244.880005


In [5]:
# Generating labels. Since I'm generating a label based on the next day's information, I need to have one less label.
print("Generating labels. Since I'm generating a label based on the next day's information, I need to have one less label.")
print(f'rows on dataframe: {len(df)}')

labels = []
for i in range(len(df)-1):
    today = df.iloc[i]
    tomorrow = df.iloc[i+1]
    
    if today.Open<tomorrow.Open:
        labels.append('Buy')
    else:
        labels.append('Sell')

print(f'Count of labels: {len(labels)}')

Generating labels. Since I'm generating a label based on the next day's information, I need to have one less label.
rows on dataframe: 250
Count of labels: 249


In [6]:
# A Python script initializes, trains, and evaluates a model (10 points)
# I need to drop last row of data so lables can match
df = df[:-1].copy()

# Adding labels to dataframe
df['Recomendation'] = labels

# displaying dataframe
df

Unnamed: 0_level_0,Open,High,Low,Close,Recomendation
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-09-18 Mon,271.160004,271.440002,263.760010,265.279999,Sell
2023-09-19 Tue,264.350006,267.850006,261.200012,266.500000,Buy
2023-09-20 Wed,267.040009,273.929993,262.459991,262.589996,Sell
2023-09-21 Thu,257.850006,260.859985,254.210007,255.699997,Sell
2023-09-22 Fri,257.399994,257.790009,244.479996,244.880005,Sell
...,...,...,...,...,...
2024-09-06 Fri,232.600006,233.600006,210.509995,210.729996,Sell
2024-09-09 Mon,216.199997,219.869995,213.669998,216.270004,Buy
2024-09-10 Tue,220.070007,226.399994,218.639999,226.169998,Buy
2024-09-11 Wed,224.550003,228.470001,216.800003,228.130005,Buy


In [7]:
# The data is cleaned, normalized, and standardized prior to modeling (5 points)
# splitting data into X and y
X = df.drop('Recomendation',axis=1).values
y = df.Recomendation.values

In [8]:
X[:5]

array([[271.16000366, 271.44000244, 263.76000977, 265.27999878],
       [264.3500061 , 267.8500061 , 261.20001221, 266.5       ],
       [267.04000854, 273.92999268, 262.45999146, 262.58999634],
       [257.8500061 , 260.85998535, 254.21000671, 255.69999695],
       [257.3999939 , 257.79000854, 244.47999573, 244.88000488]])

In [9]:
y[:5]

array(['Sell', 'Buy', 'Sell', 'Sell', 'Sell'], dtype=object)

In [14]:
# Tokenizing labels
new_y = [ 1 if v=='Sell' else 0 for v in y]
new_y[:5]

[1, 0, 1, 1, 1]

In [16]:
# Split data into training and validation
X_train, X_test, y_train, y_test = train_test_split(X,new_y)

In [20]:
# The model demonstrates meaningful predictive power at least 75% 
# classification accuracy or 0.80 R-squared. (5 points)

model = LinearRegression()
model.fit(X_train,y_train)

In [30]:
# Prediction score
f'The validation accuracy score of {model.score(X_test,y_test)*100:.2f}% does not meet standards.'

'The validation accuracy score of 46.57% does not meet standards.'

#### Data Model Optimization (25 points)

In [34]:
new_y = [ 'Buy' if x>=0 else 'Sell' for x in df.Close - df.Open]
new_y[:5]

['Sell', 'Buy', 'Sell', 'Sell', 'Sell']

In [35]:
X[:5]

array([[271.16000366, 271.44000244, 263.76000977, 265.27999878],
       [264.3500061 , 267.8500061 , 261.20001221, 266.5       ],
       [267.04000854, 273.92999268, 262.45999146, 262.58999634],
       [257.8500061 , 260.85998535, 254.21000671, 255.69999695],
       [257.3999939 , 257.79000854, 244.47999573, 244.88000488]])

In [37]:
new_y = [1 if x=='Sell' else 0 for x in new_y]

In [39]:
# Split data into training and validation
X_train, X_test, y_train, y_test = train_test_split(X,new_y)

In [41]:
model2 = LinearRegression()
model2.fit(X_train,y_train)

In [43]:
model2.score(X_train,y_train)

0.602776094717671