In [None]:
import numpy as np
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# NLTK (Natural Language Toolkit) provides tokenization, stemming and sentiment analysis.
# SentimentIntensityAnalyzer scores the sentiment of a piece of text using
# VADER (Valence Aware Dictionary and sEntiment Reasoner), which is particularly effective on social-media text such as tweets.

import nltk

# Fetch the VADER lexicon that SentimentIntensityAnalyzer is built on.
nltk.download('vader_lexicon')

# NOTE(review): nltk is imported a second time here; the repeat is redundant.
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Single shared analyzer instance used by the scoring helpers in this notebook.
analyzer=SentimentIntensityAnalyzer()
def sentiment_analysis(tweet):
    """Return the VADER compound polarity score for ``tweet``.

    The text is scored with the module-level ``analyzer`` and the value
    stored under the "compound" key of its result is returned.
    """
    polarity = analyzer.polarity_scores(tweet)
    return polarity["compound"]

Importing our data


In [None]:
# Load the tweet dataset. The path must be a raw string: in an ordinary literal the
# backslash-U of "C:\Users" starts a unicode escape and the cell fails with a SyntaxError.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
tweet_data = pd.read_csv(r'C:\Users\Dhanush kumar\Downloads\archive (3)\stock_tweets.csv')
# Replace the spaces in the column names with underscores (both renames in one call).
tweet_data = tweet_data.rename(
    columns={"Stock Name": "Stock_Name", "Company Name": "Company_Name"}
)
tweet_data.head()

In [None]:
# Load the stock price dataset. The path must be a raw string: in an ordinary literal the
# backslash-U of "C:\Users" starts a unicode escape and the cell fails with a SyntaxError.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
stock_data = pd.read_csv(r"C:\Users\Dhanush kumar\Downloads\archive (3)\stock_yfinance_data.csv")
# Replace the space in the column name with an underscore.
stock_data = stock_data.rename(columns={"Stock Name": "Stock_Name"})
stock_data.head()

In [None]:
# Normalise the Date column to plain "YYYY-MM-DD" strings.
parsed_dates = pd.to_datetime(stock_data["Date"])
stock_data["Date"] = parsed_dates.dt.strftime("%Y-%m-%d")
stock_data.head()

In [None]:
# Function to fetch historical data
def fetch_historical_data(symbol, start="2020-01-01", end="2024-10-17"):
    """Download historical price data for ``symbol`` through yfinance.

    Parameters
    ----------
    symbol : ticker symbol passed straight to ``yf.download``.
    start, end : date bounds forwarded to ``yf.download``; the defaults
        reproduce the range that was previously hard-coded.

    Returns
    -------
    Whatever ``yf.download`` returns for the request.
    """
    return yf.download(symbol, start=start, end=end)

In [None]:
# Fluctuation: the day's trading range (High minus Low).
stock_data['Fluctuation'] = stock_data['High'] - stock_data['Low']
# Price_Gain: net move over the session (Close minus Open).
stock_data['Price_Gain'] = stock_data['Close'] - stock_data['Open']
# Total_Valuation_EOD: traded Volume multiplied by the end-of-day Close.
stock_data['Total_Valuation_EOD'] = stock_data['Volume'] * stock_data['Close']

In [None]:
# Preview the first rows of the stock frame.
stock_data.head()

Data preprocessing

In [None]:
import re
# `re` supplies the compiled regular expressions used to clean the tweets.
# Cast to str first so NaN or numeric entries do not break the string methods.
tweet_data['Tweet'] = tweet_data['Tweet'].astype(str).str.lower()  # removed capitalisation

# Ordered (pattern, replacement) pairs. Each one is applied to the whole column through
# the vectorised .str accessor rather than a row-wise DataFrame.apply.
_cleaning_steps = [
    (re.compile(r"@[A-Za-z0-9_]+"), ""),   # removed mentions
    (re.compile(r"#[A-Za-z0-9_]+"), ""),   # removed hashtags
    (re.compile(r"http\S+"), ""),          # removed websites
    (re.compile(r"www\.\S+"), ""),         # dot escaped so only a literal "www." prefix matches
    (re.compile(r"[()!?]"), " "),          # removed puncs
    (re.compile(r"\[.*?\]"), " "),         # bracketed spans (raw string: "\[" is not a valid str escape)
    (re.compile(r"[^a-z]"), " "),          # anything that is not a lowercase letter becomes a space
]
for _pattern, _replacement in _cleaning_steps:
    tweet_data['Tweet'] = tweet_data['Tweet'].str.replace(_pattern, _replacement, regex=True)

tweet_data[['Tweet']].head()

In [None]:
# Score every cleaned tweet; the helper is passed directly instead of via a lambda.
tweet_data['Sentiment'] = tweet_data['Tweet'].apply(sentiment_analysis)
tweet_data.head()

In [None]:
# Build a join key ("anchor") for each frame: the date part of Date followed by the ticker.

# tweet_data: Date_string is the text before the first space of the Date value.
tweet_day = tweet_data["Date"].astype("str").str.split(" ").str[0]
tweet_data.insert(1, "Date_string", tweet_day)
tweet_data.insert(0, "anchor", tweet_data["Date_string"] + tweet_data["Stock_Name"])

# stock_data: same construction.
stock_day = stock_data["Date"].astype("str").str.split(" ").str[0]
stock_data.insert(1, "Date_string", stock_day)
stock_data.insert(0, "anchor", stock_data["Date_string"] + stock_data["Stock_Name"])


In [None]:
# Preview the first rows of the tweet frame.
tweet_data.head()

Show the distribution of positive, negative and neutral tweet counts

In [None]:
# Tally tweets by the sign of their compound sentiment score.
sentiment_scores = tweet_data['Sentiment']
positive_count = sentiment_scores.gt(0).sum()  # scores above zero
negative_count = sentiment_scores.lt(0).sum()  # scores below zero
zero_count = sentiment_scores.eq(0).sum()  # scores of exactly zero

# display counts
for caption, tally in (
    ("Positive Count:", positive_count),
    ("Negative Count:", negative_count),
    ("Zero Count:", zero_count),
):
    print(caption, tally)

labels = ['Positive', 'Negative' , 'Zero']
sizes = [positive_count, negative_count, zero_count]
colors = ['g', 'r', 'y' ]

# pie chart
plt.figure(figsize=(6, 6))
plt.pie(
    sizes,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
)
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.title('Distribution of Positive and Negative')
plt.show()

The pie chart above shows the distribution of positive, negative and neutral sentiment scores.


In [None]:
# Preview the first rows of the stock frame.
stock_data.head()


In [None]:
# Join tweets to stock rows on the shared "anchor" key (default inner join).
# Columns present in both frames come back with _x (tweets) / _y (stocks) suffixes.
data = tweet_data.merge(stock_data, on="anchor")
data.head(10)

In [None]:
# Parse the tweet-side date string column into datetimes.
data["Date_string_x"] = pd.to_datetime(data["Date_string_x"])
# The analysis looks at the impact of tweets on stock rise and fall,
# so tweets with a sentiment of exactly zero (neutral) are dropped.
is_non_neutral = data["Sentiment"] != 0
data = data[is_non_neutral]
data.head()

In [None]:
# Columns carried forward into the analysis.
kept_columns = [
    "Date_x", "Date_string_x", "Tweet",
    "Stock_Name_x", "Company_Name", "Sentiment",
    "Open", "High", "Low", "Close", "Volume",
    "Fluctuation", "Price_Gain", "Total_Valuation_EOD",
]
# Take an independent copy so that adding columns does not touch `data`.
preprocessed_data = data.loc[:, kept_columns].copy()
# Every row is one tweet; summing "counter" within a group gives a tweet count.
preprocessed_data["counter"] = 1
preprocessed_data.head(10)


Number of positive tweets per day

In [None]:
# Count the tweets with a positive score on each date and plot the daily totals.
positive_tweets = preprocessed_data.loc[preprocessed_data['Sentiment'] > 0]
positive_tweets_per_day = positive_tweets.groupby('Date_string_x').size()
plt.figure(figsize=(15, 6))
ax = positive_tweets_per_day.plot(kind='line', marker='o', color='green')
ax.set_title('Number of Positive Tweets per Day')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Positive Tweets')
plt.show()

Chart of the total number of negative tweets per day

In [None]:
# Count the tweets with a negative score on each date and plot the daily totals.
negative_tweets = preprocessed_data.loc[preprocessed_data['Sentiment'] < 0]
negative_tweets_per_day = negative_tweets.groupby('Date_string_x').size()
plt.figure(figsize=(15, 6))
ax = negative_tweets_per_day.plot(kind='line', marker='o', color='red')
ax.set_title('Number of Negative Tweets per Day')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Negative Tweets')
plt.show()

Showing the total number of companies and the number of tweets per company

In [None]:
# Tweets per company in the preprocessed (merged, non-neutral) data.
company_tally = preprocessed_data.Company_Name.value_counts()
print(
    f"In our dataset, we have total {len(company_tally)} companies, namely\n{company_tally}"
)

# Pie chart of tweets per company; note this one is computed from the full tweet_data frame.
company_counts = tweet_data['Company_Name'].value_counts()

plt.figure(figsize=(8, 6))
plt.pie(
    company_counts,
    labels=company_counts.index,
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('Number of Tweets by Company')
plt.axis('equal')
plt.show()

Making an individual dataset for each of three companies: APPLE (AAPL), TESLA (TSLA) and TAIWAN SEMICONDUCTOR (TSM)

In [None]:
# One frame per company, selected by ticker symbol.
ticker = preprocessed_data["Stock_Name_x"]
tesla_df = preprocessed_data[ticker == "TSLA"]
taiwanSMC_df = preprocessed_data[ticker == "TSM"]
apple_df = preprocessed_data[ticker == "AAPL"]


Creating dataframe of company based on positive and negative tweets

In [None]:
# Split each company's frame by the sign of the sentiment score.
# Positive-sentiment rows:
pos_tesla_df = tesla_df.loc[tesla_df["Sentiment"] > 0]
pos_taiwanSMC_df = taiwanSMC_df.loc[taiwanSMC_df["Sentiment"] > 0]
pos_apple_df = apple_df.loc[apple_df["Sentiment"] > 0]

# Negative-sentiment rows:
neg_tesla_df = tesla_df.loc[tesla_df["Sentiment"] < 0]
neg_taiwanSMC_df = taiwanSMC_df.loc[taiwanSMC_df["Sentiment"] < 0]
neg_apple_df = apple_df.loc[apple_df["Sentiment"] < 0]

In [None]:
# Condensed per-company frames: rows sharing the same date, price metrics and
# sentiment are collapsed into one row whose "counter" is the number of such tweets.
overview_keys = ["Date_string_x", "Fluctuation", "Price_Gain", "Total_Valuation_EOD", "Sentiment"]


def _sum_counter(frame):
    """Group ``frame`` by ``overview_keys`` and sum its "counter" column."""
    grouped = frame.groupby(by=overview_keys, as_index=False)
    return grouped.agg({"counter": pd.Series.sum})


ovr_pos_tesla_df = _sum_counter(pos_tesla_df)
ovr_pos_taiwanSMC_df = _sum_counter(pos_taiwanSMC_df)
ovr_pos_apple_df = _sum_counter(pos_apple_df)

ovr_neg_tesla_df = _sum_counter(neg_tesla_df)
ovr_neg_taiwanSMC_df = _sum_counter(neg_taiwanSMC_df)
ovr_neg_apple_df = _sum_counter(neg_apple_df)

CASE 1 : TESLA

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Twin-axis chart: Tesla end-of-day valuation (left axis) against the number of
# positive tweets per day (right axis).
fig, ax1 = plt.subplots(figsize=(15, 6))
ax1.set_title("Tesla: Effect of positive Tweets on Valuation")
ax2 = ax1.twinx()

valuation_line, = ax1.plot(
    pos_tesla_df.Date_string_x,
    pos_tesla_df.Total_Valuation_EOD,
    color="y",
    label="Valuation",
)

positive_tweets = tesla_df[tesla_df['Sentiment'] > 0]
positive_tweets_per_day = positive_tweets.groupby('Date_string_x').size()

# Plot the counts against their own groupby index. groupby sorts the dates, while
# Series.unique() keeps order of appearance, so pairing the two can misalign x and y.
tweets_line, = ax2.plot(
    positive_tweets_per_day.index,
    positive_tweets_per_day.values,
    color="g",
    label="Positive Tweets",
)

ax1.set_xlabel("Time")
ax1.set_ylabel("Valuation")
ax2.set_ylabel("Positive Tweets")

# A bare plt.legend() only sees the current (twin) axes; pass both handles explicitly.
ax2.legend(handles=[valuation_line, tweets_line])
plt.show()

In [None]:
# Pairwise correlations of the condensed positive-tweet Tesla frame.
corr = ovr_pos_tesla_df.corr()

# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
heatmap_style = dict(
    annot=True, vmax=.3, vmin=-.3, center=0,
    square=True, linewidths=.5, cbar_kws={"shrink": .5},
)
# Draw the heatmap on the axes created above.
sns.heatmap(corr, mask=mask, cmap=cmap, **heatmap_style)

In [None]:
# Twin-axis chart: Tesla end-of-day valuation (left axis) against the number of
# negative tweets per day (right axis).
fig, ax1 = plt.subplots(figsize=(15, 6))
ax1.set_title("Tesla: Effect of negative Tweets on Valuation")
ax2 = ax1.twinx()

# Plot Total_Valuation_EOD so the curve matches its "Valuation" label and the
# positive-tweet chart. np.log(Price_Gain) is NaN/-inf whenever Close <= Open.
valuation_line, = ax1.plot(
    neg_tesla_df.Date_string_x,
    neg_tesla_df.Total_Valuation_EOD,
    color="y",
    label="Valuation",
)

negative_tweets = tesla_df[tesla_df['Sentiment'] < 0]
negative_tweets_per_day = negative_tweets.groupby('Date_string_x').size()

# Plot the counts against their own groupby index. groupby sorts the dates, while
# Series.unique() keeps order of appearance, so pairing the two can misalign x and y.
tweets_line, = ax2.plot(
    negative_tweets_per_day.index,
    negative_tweets_per_day.values,
    color="r",
    label="Negative Tweets",
)

ax1.set_xlabel("Time")
ax1.set_ylabel("Valuation")
ax2.set_ylabel("Negative Tweets")

# A bare plt.legend() only sees the current (twin) axes; pass both handles explicitly.
ax2.legend(handles=[valuation_line, tweets_line])
plt.show()

In [None]:
# Pairwise correlations of the condensed negative-tweet Tesla frame.
corr = ovr_neg_tesla_df.corr()

# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
heatmap_style = dict(
    annot=True, vmax=.3, vmin=-.3, center=0,
    square=True, linewidths=.5, cbar_kws={"shrink": .5},
)
# Draw the heatmap on the axes created above.
sns.heatmap(corr, mask=mask, cmap=cmap, **heatmap_style)

CASE 2 : TAIWAN SMC

In [None]:
# Twin-axis chart: Taiwan SMC end-of-day valuation (left axis) against the number of
# positive tweets per day (right axis).
fig, ax1 = plt.subplots(figsize=(15, 6))
ax1.set_title("Taiwan SMC: Effect of Positive Tweets on Valuation")
ax2 = ax1.twinx()

valuation_line, = ax1.plot(
    pos_taiwanSMC_df.Date_string_x,
    pos_taiwanSMC_df.Total_Valuation_EOD,
    color="y",
    label="Valuation",
)

positive_tweets = taiwanSMC_df[taiwanSMC_df['Sentiment'] >0]
positive_tweets_per_day = positive_tweets.groupby('Date_string_x').size()

# Plot the counts against their own groupby index. groupby sorts the dates, while
# Series.unique() keeps order of appearance, so pairing the two can misalign x and y.
tweets_line, = ax2.plot(
    positive_tweets_per_day.index,
    positive_tweets_per_day.values,
    color="g",
    label="Positive Tweets",
)

ax1.set_xlabel("Time")
ax1.set_ylabel("Valuation")
ax2.set_ylabel("Positive Tweets")

# A bare plt.legend() only sees the current (twin) axes; pass both handles explicitly.
ax2.legend(handles=[valuation_line, tweets_line])
plt.show()

In [None]:
# Pairwise correlations of the condensed positive-tweet Taiwan SMC frame.
corr = ovr_pos_taiwanSMC_df.corr()

# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
heatmap_style = dict(
    annot=True, vmax=.3, vmin=-.3, center=0,
    square=True, linewidths=.5, cbar_kws={"shrink": .5},
)
# Draw the heatmap on the axes created above.
sns.heatmap(corr, mask=mask, cmap=cmap, **heatmap_style)

In [None]:
# Twin-axis chart: Taiwan SMC end-of-day valuation (left axis) against the number of
# negative tweets per day (right axis).
fig, ax1 = plt.subplots(figsize=(15, 6))
ax1.set_title("Taiwan SMC: Effect of negative Tweets on Valuation")
ax2 = ax1.twinx()

# Plot Total_Valuation_EOD so the curve matches its "Valuation" label and the
# positive-tweet chart. np.log(Price_Gain) is NaN/-inf whenever Close <= Open.
valuation_line, = ax1.plot(
    neg_taiwanSMC_df.Date_string_x,
    neg_taiwanSMC_df.Total_Valuation_EOD,
    color="y",
    label="Valuation",
)

negative_tweets = taiwanSMC_df[taiwanSMC_df['Sentiment'] <0]
negative_tweets_per_day = negative_tweets.groupby('Date_string_x').size()

# Plot the counts against their own groupby index. groupby sorts the dates, while
# Series.unique() keeps order of appearance, so pairing the two can misalign x and y.
tweets_line, = ax2.plot(
    negative_tweets_per_day.index,
    negative_tweets_per_day.values,
    color="r",
    label="Negative Tweets",
)

ax1.set_xlabel("Time")
ax1.set_ylabel("Valuation")
ax2.set_ylabel("Negative Tweets")

# A bare plt.legend() only sees the current (twin) axes; pass both handles explicitly.
ax2.legend(handles=[valuation_line, tweets_line])
plt.show()

In [None]:
# Pairwise correlations of the condensed negative-tweet Taiwan SMC frame.
corr = ovr_neg_taiwanSMC_df.corr()

# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
heatmap_style = dict(
    annot=True, vmax=.3, vmin=-.3, center=0,
    square=True, linewidths=.5, cbar_kws={"shrink": .5},
)
# Draw the heatmap on the axes created above.
sns.heatmap(corr, mask=mask, cmap=cmap, **heatmap_style)

CASE 3 : APPLE

In [None]:
# Twin-axis chart: Apple end-of-day valuation (left axis) against the number of
# positive tweets per day (right axis).
fig, ax1 = plt.subplots(figsize=(15, 6))
ax1.set_title("Apple: Effect of Positive Tweets on Valuation")
ax2 = ax1.twinx()

valuation_line, = ax1.plot(
    pos_apple_df.Date_string_x,
    pos_apple_df.Total_Valuation_EOD,
    color="y",
    label="Valuation",
)

positive_tweets = apple_df[apple_df['Sentiment'] > 0]
positive_tweets_per_day = positive_tweets.groupby('Date_string_x').size()

# Plot the counts against their own groupby index. groupby sorts the dates, while
# Series.unique() keeps order of appearance, so pairing the two can misalign x and y.
tweets_line, = ax2.plot(
    positive_tweets_per_day.index,
    positive_tweets_per_day.values,
    color="g",
    label="Positive Tweets",
)

ax1.set_xlabel("Time")
ax1.set_ylabel("Valuation")
ax2.set_ylabel("Positive Tweets")

# A bare plt.legend() only sees the current (twin) axes; pass both handles explicitly.
ax2.legend(handles=[valuation_line, tweets_line])
plt.show()

Creating a correlation heatmap for the Apple dataframe

In [None]:
# Pairwise correlations of the condensed positive-tweet Apple frame.
corr = ovr_pos_apple_df.corr()

# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
heatmap_style = dict(
    annot=True, vmax=.3, vmin=-.3, center=0,
    square=True, linewidths=.5, cbar_kws={"shrink": .5},
)
# Draw the heatmap on the axes created above.
sns.heatmap(corr, mask=mask, cmap=cmap, **heatmap_style)

In [None]:
# Twin-axis chart: Apple end-of-day valuation (left axis) against the number of
# negative tweets per day (right axis).
fig, ax1 = plt.subplots(figsize=(15, 6))
ax1.set_title("Apple: Effect of negative Tweets on Valuation")
ax2 = ax1.twinx()

# Plot Total_Valuation_EOD so the curve matches its "Valuation" label and the
# positive-tweet chart. np.log(Price_Gain) is NaN/-inf whenever Close <= Open.
valuation_line, = ax1.plot(
    neg_apple_df.Date_string_x,
    neg_apple_df.Total_Valuation_EOD,
    color="y",
    label="Valuation",
)

negative_tweets = apple_df[apple_df['Sentiment'] < 0]
negative_tweets_per_day = negative_tweets.groupby('Date_string_x').size()

# Plot the counts against their own groupby index. groupby sorts the dates, while
# Series.unique() keeps order of appearance, so pairing the two can misalign x and y.
tweets_line, = ax2.plot(
    negative_tweets_per_day.index,
    negative_tweets_per_day.values,
    color="r",
    label="Negative Tweets",
)

ax1.set_xlabel("Time")
ax1.set_ylabel("Valuation")
ax2.set_ylabel("Negative Tweets")

# A bare plt.legend() only sees the current (twin) axes; pass both handles explicitly.
ax2.legend(handles=[valuation_line, tweets_line])
plt.show()

In [None]:

# Pairwise correlations of the condensed negative-tweet Apple frame.
corr = ovr_neg_apple_df.corr()

# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
heatmap_style = dict(
    annot=True, vmax=.3, vmin=-.3, center=0,
    square=True, linewidths=.5, cbar_kws={"shrink": .5},
)
# Draw the heatmap on the axes created above.
sns.heatmap(corr, mask=mask, cmap=cmap, **heatmap_style)

Tesla Training

SVM based prediction of stock movement based on sentiment


In [None]:
## importing libraries
#using SVM support vector machine algorithm
#to find a hyperplane that, to the best degree possible,
# separates data points of one class from those of another class
from sklearn import svm
from sklearn.model_selection import train_test_split
# train_test_split creating training and test data sets.
from sklearn.preprocessing import MinMaxScaler
#MinMaxscalerStandardizes features by scaling each feature to a given range
# Keep only the columns the model needs.
# DataFrame.drop without inplace returns a new frame, so `tesla_df` (itself a slice of
# `preprocessed_data`) is left untouched and this cell can be re-run safely.
drop=['Date_x','Stock_Name_x','Company_Name','Sentiment','Open','High','Low','Volume','Fluctuation','Price_Gain','Total_Valuation_EOD']
tesla = tesla_df.drop(columns=drop)
tesla

Getting the positive, neutral, negative and compound sentiment scores for each tweet

In [None]:
def get_sentiment_scores(tweet):
    """Return the full polarity-score mapping produced by ``analyzer`` for ``tweet``."""
    return analyzer.polarity_scores(tweet)

tesla['Sentiment_Scores'] = tesla['Tweet'].apply(get_sentiment_scores)
# Expand the score dicts into columns in one pass. The keys are selected by name in
# the order pos, neu, neg, compound because assigning a frame to a list of columns
# is positional; VADER's own key order (neg, neu, pos, compound) would otherwise put
# the negative score under 'Positive' and vice versa.
score_frame = pd.DataFrame(
    tesla['Sentiment_Scores'].tolist(),
    index=tesla.index,
    columns=['pos', 'neu', 'neg', 'compound'],
)
tesla[['Positive', 'Neutral', 'Negative', 'Compound']] = score_frame
tesla

In [None]:
# Store the compound score under 'Sentiment', then discard the intermediate
# score columns and the tweet text.
tesla['Sentiment']=tesla['Compound']
drop=['Sentiment_Scores','Positive','Neutral','Negative','Compound','Tweet']
# In-place drop: any other name bound to the same frame sees the change too.
tesla.drop(columns=drop, inplace=True)
tesla

In [None]:
# tesla2 is another name for the same frame, so the Date column is added to `tesla` too.
tesla2 = tesla

tesla2['Date'] = pd.to_datetime(tesla2['Date_string_x']).dt.date

# One row per calendar date: mean sentiment, number of tweets and the last Close seen.
final = (
    tesla2.groupby('Date')
    .agg(
        Mean_Sentiment=('Sentiment', 'mean'),
        Num_Tweets=('Date_string_x', 'count'),
        Close=('Close', 'last'),
    )
    .reset_index()
)
print(final)

Predicting the closing value based on the previous 3 days of closing prices

In [None]:
## Sliding-window feature engineering for time-series forecasting: each sample holds
## `window` consecutive rows of three feature columns, and the target is the value in
## the row immediately after the window.
def window_data(df, window, feature_col_number1, feature_col_number2, feature_col_number3, target_col_number):
    """Build sliding-window features and next-row targets from ``df``.

    Parameters
    ----------
    df : DataFrame read positionally with ``iloc``.
    window : number of consecutive rows in each sample.
    feature_col_number1, feature_col_number2, feature_col_number3 :
        positional indices of the three feature columns.
    target_col_number : positional index of the target column.

    Returns
    -------
    (X, y) : ``X`` stacks, per sample, the window of feature 1, then feature 2,
        then feature 3 side by side; ``y`` is a column vector holding the target
        value of the row that follows each window.
    """
    starts = range(len(df) - window)

    def _windows(col_number):
        # One slice of `window` rows per start position for the given column.
        return [df.iloc[begin:begin + window, col_number] for begin in starts]

    X_close = _windows(feature_col_number1)
    X_sentiment = _windows(feature_col_number2)
    X_ts = _windows(feature_col_number3)
    y = [df.iloc[begin + window, target_col_number] for begin in starts]

    return np.hstack((X_close, X_sentiment, X_ts)), np.array(y).reshape(-1, 1)

In [None]:
# Each sample uses the previous 3 rows (days) of `final` to predict the next close.
window_size = 3
# Positional column indices in `final`:
#   3 -> Close, 1 -> Mean_Sentiment, 2 -> Num_Tweets
feature_col_number1, feature_col_number2, feature_col_number3 = 3, 1, 2
# The target is also the Close column.
target_col_number = 3
X, y = window_data(
    final,
    window_size,
    feature_col_number1,
    feature_col_number2,
    feature_col_number3,
    target_col_number,
)
print(X)

In [None]:
# Chronological hold-out: keep the rows in time order so the test set is the most
# recent 10% of windows. Later cells label the test rows with the last dates of
# `final` and take consecutive differences of them, which needs an unshuffled split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)


In [None]:
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import r2_score
# Fit a support-vector regressor with default settings; fit() returns the estimator.
# y_train is flattened because it is stored as a column vector.
model = svm.SVR().fit(X_train, y_train.ravel())
# score() reports R^2 on the held-out rows.
print(model.score(X_test, y_test))

In [None]:
# Predict the held-out closes and report R^2 for those predictions.
y_pred = model.predict(X_test)
print(y_pred)
svr_r2 = r2_score(y_test, y_pred)
print(svr_r2)

In [None]:
# Put the held-out closes and their predictions side by side.
predicted_prices = y_pred.reshape(-1, 1)
real_prices = y_test.reshape(-1, 1)
stocks = pd.DataFrame(
    {"Real": real_prices.ravel(), "Predicted": predicted_prices.ravel()}
)
# Label the rows with the last len(real_prices) dates of `final`.
# NOTE(review): only valid if the test rows are the chronologically last ones - verify against the split.
n_test = len(real_prices)
stocks['Date'] = final['Date'][-n_test:].reset_index(drop=True)

stocks.set_index('Date', inplace=True)
stocks.head()

for column, colour, caption in (("Real", 'r', 'real'), ("Predicted", 'g', 'predicted')):
    plt.plot(stocks[column], color=colour, label=caption)
plt.title("Actual vs Predicted Values: SVM")
plt.xlabel('Date')
plt.legend()
plt.xticks(rotation=45)

plt.show()

Observation:
Tweet volume and sentiment score are clearly unable to predict the closing price of the stock.
But can we infer the general trend of the market from tweet volume and sentiment score? This can be checked by comparing the slopes of the predicted and real stock prices over time.

In [None]:

# Step-to-step change of each series, with a leading 0 so that the slope arrays
# keep the same length as the price arrays.
real_values = stocks['Real'].to_numpy()
predicted_values = stocks['Predicted'].to_numpy()

real_slopes = np.insert(np.diff(real_values), 0, 0)
predicted_slopes = np.insert(np.diff(predicted_values), 0, 0)


In [None]:
# Direction of each step: -1 for a fall, 0 for no change, +1 for a rise.
real_slope_signs, predicted_slope_signs = (
    np.sign(slopes) for slopes in (real_slopes, predicted_slopes)
)

The accuracy of the slope-sign comparison is 80.95%. So although tweet volume and sentiment score are not good metrics for modelling the closing price of a stock, they can be used to predict the general trend of the market to a good degree.

In [None]:
# Share of steps (as a percentage) where predicted and real direction agree.
matching_signs = np.equal(real_slope_signs, predicted_slope_signs)

accuracy = matching_signs.mean() * 100
print(f"Accuracy of slope sign comparison: {accuracy}%")

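Linear Regression
`LinearRegression` is a class from the `sklearn.linear_model` module used to create a linear regression model.
It attempts to model the relationship between a dependent variable and one or more independent variables by fitting a linear equation to the data.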

In [None]:
# Start again from the raw tweet frame, keeping only the Tesla rows.
is_tesla = tweet_data['Stock_Name'] == 'TSLA'
tesla = tweet_data[is_tesla]


In [None]:
# Discard tweets whose sentiment score is exactly zero.
has_sentiment = tesla['Sentiment'] != 0
tesla = tesla[has_sentiment]


In [None]:
# Remove the columns that are not needed for the regression.
unused_tweet_columns = ['anchor', 'Date', 'Company_Name']
tesla = tesla.drop(columns=unused_tweet_columns)


In [None]:
# Preview the first rows of the trimmed Tesla tweet frame.
tesla.head()


In [None]:
# Mean sentiment per date, computed in a single groupby pass instead of re-filtering
# the whole frame once per date. sort=False keeps the dates in order of first
# appearance, matching the order a loop over .unique() would produce.
tesla_f = (
    tesla.groupby('Date_string', sort=False)['Sentiment']
    .mean()
    .reset_index()
    .rename(columns={'Date_string': 'Date'})
)

In [None]:
# Display the per-date mean sentiment frame.
tesla_f


In [None]:
# Tesla rows of the stock frame, with every column in the list below removed.
tesla_stock = stock_data.loc[stock_data['Stock_Name'] == 'TSLA']
unused_stock_columns = [
    'anchor', 'Date_string', 'Open', 'High', 'Low', 'Adj Close',
    'Volume', 'Stock_Name', 'Fluctuation', 'Price_Gain', 'Total_Valuation_EOD',
]
stock_f = tesla_stock.drop(columns=unused_stock_columns)
stock_f.head()

In [None]:
# Keep only the dates present in both the sentiment frame and the price frame.
tesla_p = tesla_f.merge(stock_f, on='Date', how='inner')


In [None]:

# Create the three lag columns, initially as plain copies of Close.
for lag in (1, 2, 3):
    tesla_p[f'Close_{lag}'] = tesla_p['Close']

In [None]:
# shift() works on row order, so put the rows in chronological order first (Date
# holds "YYYY-MM-DD" strings, which sort chronologically). Otherwise a "previous
# close" could come from a later day. The index is reset so that the rows left
# without a full history are the ones labelled 0, 1 and 2.
tesla_p = tesla_p.sort_values('Date').reset_index(drop=True)
# Close_k is the close k rows (days with data) earlier.
for lag in (1, 2, 3):
    tesla_p[f'Close_{lag}'] = tesla_p['Close'].shift(lag)


In [None]:
# Remove the row labelled 0: it has no previous close to use as a lag feature.
tesla_p = tesla_p.drop(index=0)


In [None]:
# Remove the row labelled 1: its two- and three-step lags are missing.
tesla_p = tesla_p.drop(index=1)

In [None]:
# Remove the row labelled 2: its three-step lag is missing.
tesla_p = tesla_p.drop(index=2)

# tesla_d is a second name for the same frame, not a copy.
tesla_d = tesla_p

Linear Regression
`LinearRegression` is a class from the `sklearn.linear_model` module used to create a linear regression model.
It attempts to model the relationship between a dependent variable and one or more independent variables by fitting a linear equation to the data.

r2_score function from sklearn.metrics is used to calculate the R² (R-squared) score, which is a measure of how well the model's predictions fit the actual data. It provides insights into the proportion of variance in the target variable that is predictable from the independent variables.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


# Features: the day's mean sentiment plus the three lagged closing prices.
feature_columns = ['Sentiment', 'Close_1', 'Close_2', 'Close_3']
X = tesla_p[feature_columns]
y = tesla_p['Close']      # target: the closing price

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)

# Fit ordinary least squares; fit() returns the estimator itself.
model = LinearRegression().fit(X_train, y_train)

y_pred = model.predict(X_test)
# R^2 on the held-out rows (shown as the cell output).
r2_score(y_test, y_pred)

In [None]:
# Predicted vs actual closes; points on the dashed line are perfect predictions.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, color='blue', label='Actual vs. Predicted')
ax.plot(y_test, y_test, color='red', linestyle='--', label='Ideal Line')
ax.set_xlabel('Actual Values (y_test)')
ax.set_ylabel('Predicted Values (y_pred)')
ax.set_title('Actual vs. Predicted Values: Linear Regression')
ax.legend()
plt.show()

StandardScaler -  This is a preprocessing tool from sklearn used to standardize features by removing the mean and scaling to unit variance. Standardization can improve the performance of neural networks.

Keras: A high-level neural networks API, Keras is used to build and train deep learning models. It runs on top of TensorFlow or Theano.


Sequential: This class allows you to build a model layer by layer, making it straightforward to define the architecture of a feedforward neural network.

Dense: This represents a fully connected layer in the neural network

In [None]:
from sklearn.preprocessing import StandardScaler
# standardizing features by removing the mean and scaling to unit variance
import keras
from keras.models import Sequential
from keras.layers import Dense
# Close_1, Close_2 and Close_3 are the closing prices of the three previous rows.
X = tesla_p[['Sentiment', 'Close_1', 'Close_2', 'Close_3']]
y = tesla_p['Close']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

# Standardise the features. The scaler is fitted on the training rows only and then
# applied to both splits, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the ANN: a feed-forward network of fully connected layers. The first layer
# receives the scaled features; later layers transform them through learned weights
# and activation functions.
model = Sequential()

model.add(Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))  # for regression - no activation function
model.compile(optimizer='adam', loss='mean_squared_error')
# Model training; verbose=0 keeps 1000 epochs of progress logs out of the notebook.
model.fit(X_train_scaled, y_train, epochs=1000, batch_size=32, verbose=0)

# predict() returns an (n, 1) array; flatten it so it lines up with y_test.
y_pred = model.predict(X_test_scaled).ravel()
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print(f"R-squared score: {r2}")

In [None]:
# Predicted vs actual closes; points on the dashed line are perfect predictions.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, color='blue', label='Actual vs. Predicted')
ax.plot(y_test, y_test, color='red', linestyle='--', label='Ideal Line')
ax.set_xlabel('Actual Values (y_test)')
ax.set_ylabel('Predicted Values (y_pred)')
ax.set_title('Actual vs. Predicted Values: ANN')
ax.legend()
plt.show()