## Model Development

#### import data

In [45]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [46]:
df = pd.read_csv('data/df_fs.csv', sep=",", index_col=0)

In [47]:
df.head()

Unnamed: 0,date,Year,Month,Day,DayOfWeek,Timestamp,Open,Low,Close,Volume,...,EMA_90,RSI,EMA_12,EMA_26,MACD,Signal_Line,MACD_Histogram,Price_Change_Pct,Article Length,article_sentiment
0,2016-07-20,2016,7,20,2,1468972800,56.84,55.529999,55.91,89893300.0,...,50.523063,71.721804,52.735685,52.177104,0.558581,0.040408,0.518174,5.311734,1394.0,4.0
1,2016-07-25,2016,7,25,0,1469404800,56.740002,56.259998,56.73,25610600.0,...,50.65948,74.606512,53.350195,52.514356,0.83584,0.199494,0.636346,0.282835,623.0,4.0
2,2016-08-01,2016,8,1,0,1470009600,56.75,56.139999,56.580002,26003400.0,...,50.789601,91.388238,53.847088,52.815515,1.031574,0.36591,0.665664,-0.176426,269.0,4.0
3,2016-08-03,2016,8,3,2,1470182400,57.110001,56.490002,56.970001,22075600.0,...,50.925434,91.799325,54.327537,53.123254,1.204282,0.533585,0.670698,0.689288,904.0,4.0
4,2016-08-05,2016,8,5,4,1470355200,58.209999,57.450001,57.959999,29335200.0,...,51.08004,92.480413,54.886377,53.481532,1.404845,0.707837,0.697009,0.993204,679.0,1.0


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4983 entries, 0 to 4982
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               4983 non-null   object 
 1   Year               4983 non-null   int64  
 2   Month              4983 non-null   int64  
 3   Day                4983 non-null   int64  
 4   DayOfWeek          4983 non-null   int64  
 5   Timestamp          4983 non-null   int64  
 6   Open               4983 non-null   float64
 7   Low                4983 non-null   float64
 8   Close              4983 non-null   float64
 9   Volume             4983 non-null   float64
 10  30_day_MA          4983 non-null   float64
 11  60_day_MA          4983 non-null   float64
 12  90_day_MA          4983 non-null   float64
 13  SMA_30             4983 non-null   float64
 14  SMA_60             4983 non-null   float64
 15  SMA_90             4983 non-null   float64
 16  EMA_30             4983 non-n

In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


# Define relevant features and the target variable
additional_features = ['RSI', 'Volume', 'MACD_Histogram', 'Timestamp', 'Day','Signal_Line', 'Month', 'DayOfWeek', 'Close', 'Article Length', 'Open', 'Low']

#additional_features = ['Open', 'Close', 'Volume', '30_day_MA', 'SMA_30', 'EMA_30', 'Article Length','RSI', 'MACD_Histogram']  

X = df[['article_sentiment'] + additional_features]  # Include 'article_sentiment' and other features
y = df['Price_Change_Pct']  # Target variable

# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a linear regression model
lin_reg = LinearRegression()

# Fit the model to the training data
lin_reg.fit(X_train_scaled, y_train)

# Predict on the test data
y_pred = lin_reg.predict(X_test_scaled)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Output model coefficients and performance metrics
coefficients = dict(zip(X.columns, lin_reg.coef_))  # Get model coefficients
mae, mse, r2, coefficients  # Display MAE, MSE, R-squared, and coefficients


(0.8732792884842246,
 1.9740970020462425,
 0.5613875753491191,
 {'article_sentiment': 0.012809936148022122,
  'RSI': 0.13324843481576948,
  'Volume': 0.0176885709363922,
  'MACD_Histogram': 0.546367688386812,
  'Timestamp': 0.13469314707500837,
  'Day': 0.03401066267974417,
  'Signal_Line': 0.06852442678607701,
  'Month': -0.037463063333052715,
  'DayOfWeek': 0.010359130695396346,
  'Close': 40.17292025919494,
  'Article Length': 0.004923471318323185,
  'Open': -20.806406761066462,
  'Low': -19.434645193586682})

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({"date": dummy_combined['index'],
                   "compound sentiment score": dummy_combined['compound'], 
                   "Stock Close Price": dummy_combined['4. close']})

ax = df.plot(x="date", y="compound sentiment score", legend=False)
ax2 = ax.twinx()
df.plot(x="date", y="Stock Close Price", ax=ax2, legend=False, color="r")
ax.figure.legend()
plt.show()