In [None]:
import os
import warnings

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Ignore the noisy "is_sparse is deprecated" FutureWarning (one filter is enough)
warnings.filterwarnings("ignore", message="is_sparse is deprecated", category=FutureWarning)

file_path = r'data/Twitter_Data.csv'
df = pd.read_csv(file_path)

# Drop rows with a missing tweet text or a missing label
df.dropna(subset=['clean_text', 'category'], inplace=True)

# Limit the data size
df = df.head(160000)  # remove this line to train on the whole dataset

# Separate the tweet text (features) from the sentiment label (target)
X = df['clean_text']
y = df['category']

# Split the data into training and testing sets (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorise the text data using TF-IDF; the vocabulary is fitted on the training split only
vectorizer = TfidfVectorizer(max_features=100000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Linear SVM trained with stochastic gradient descent (hinge loss)
svm_model = SGDClassifier(loss='hinge', alpha=1e-4, max_iter=1000, tol=1e-3, n_jobs=-1, learning_rate='optimal')

# Number of partial_fit passes over the full training set
n_iterations = 1000

# Create the output folder up front so a missing directory cannot make the
# final dump() calls fail after the whole training run has completed.
os.makedirs('160k SVM model', exist_ok=True)

# The label set is loop-invariant: compute it once instead of on every pass
classes = np.unique(y_train)

# Train while tracking progress (number of iterations) with a tqdm progress bar
for _ in tqdm(range(n_iterations), desc="Training", unit="iter", ncols=100):
    svm_model.partial_fit(X_train_vec, y_train, classes=classes)

# Predict on the testing set and evaluate the model
y_pred = svm_model.predict(X_test_vec)
print(classification_report(y_test, y_pred))

# Save the trained model and the fitted vectorizer into the output folder
dump(svm_model, '160k SVM model/svm_model.joblib')
dump(vectorizer, '160k SVM model/tfidf_vectorizer.joblib')


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from joblib import load
from tqdm import tqdm
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
import numpy as np

# Daily stock quotes and raw tweets; both tables carry a 'Stock Name' column
# that the functions below filter on.
stock_data, tweets_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/stock_yfinance_data.csv', 'data/stock_tweets.csv')
)

# Sentiment classifier and its TF-IDF vectorizer, as saved by the training cell
svm_model, vectorizer = (
    load(artifact_path)
    for artifact_path in (
        '160k SVM model/svm_model.joblib',
        '160k SVM model/tfidf_vectorizer.joblib',
    )
)

def _weekly_mean_prices(stock_name):
    """Return the weekly mean 'Adj Close' of *stock_name* as a date-indexed Series."""
    prices = stock_data[stock_data['Stock Name'] == stock_name].copy()
    prices['Date'] = pd.to_datetime(prices['Date'], format='%d/%m/%Y')
    return prices.set_index('Date')['Adj Close'].resample('W').mean()


def _weekly_mean_sentiment(stock_name):
    """Return the weekly mean predicted sentiment of the tweets about *stock_name*.

    Raises:
        ValueError: if there are no tweets for the ticker.
    """
    # Work on a copy: the filtered frame gets new columns below and must not
    # alias the module-level tweets_df (avoids chained-assignment warnings).
    tweets = tweets_df[tweets_df['Stock Name'] == stock_name].copy()
    if tweets.empty:
        raise ValueError(f"No tweets available for stock ticker {stock_name!r}.")

    # Vectorise and classify all tweets in one batch instead of row by row
    tweets['Sentiment'] = svm_model.predict(vectorizer.transform(tweets['Tweet']))

    # Parse the timestamps as UTC, then drop the timezone so the weekly index
    # is directly comparable with the timezone-naive stock price index.
    timestamps = pd.to_datetime(tweets['Date'], format='%Y-%m-%d %H:%M:%S%z', utc=True)
    tweets['Date'] = timestamps.dt.tz_convert(None)
    tweets.set_index('Date', inplace=True)

    return tweets.groupby(pd.Grouper(freq='W'))['Sentiment'].mean()


def _forecast_prices(weekly_sentiment, weekly_price, cutoff):
    """Fit an SVR on the weeks before *cutoff* and predict prices from *cutoff* on.

    Features are the weekly mean sentiment and the same value two weeks
    earlier. Returns the predictions as a Series indexed by week, shifted so
    that the first predicted value equals the actual price of that week.

    Raises:
        ValueError: if no complete week exists on either side of *cutoff*.
    """
    # Pair sentiment and price by week (not by array position) and keep only
    # weeks where every value is known. shift(2) leaves the first two weeks
    # without a lagged value, so they are dropped rather than being filled
    # with sentiment wrapped around from the end of the series.
    samples = pd.DataFrame({
        'sentiment': weekly_sentiment,
        'sentiment_lag2': weekly_sentiment.shift(2),
        'price': weekly_price,
    }).dropna()

    history = samples[samples.index < cutoff]
    future = samples[samples.index >= cutoff]
    if history.empty or future.empty:
        raise ValueError("Not enough overlapping weekly sentiment and price data to train and predict.")

    feature_columns = ['sentiment', 'sentiment_lag2']
    price_model = SVR(kernel='rbf')
    price_model.fit(history[feature_columns].to_numpy(), history['price'].to_numpy())
    predicted = price_model.predict(future[feature_columns].to_numpy())

    # Change the intercept so the prediction starts on the actual price curve
    predicted += future['price'].iloc[0] - predicted[0]
    return pd.Series(predicted, index=future.index)


# Plot stock prices and weekly average sentiment over time
def plot_stock_and_weekly_sentiment(stock_name):
    """Plot weekly stock prices, weekly tweet sentiment and an SVR price forecast.

    Reads the module-level ``stock_data``, ``tweets_df``, ``svm_model`` and
    ``vectorizer``. The last month of data (right of the dashed red line) is
    held out; an SVR trained on the earlier weeks predicts the held-out prices
    from the weekly sentiment.

    Args:
        stock_name: value of the 'Stock Name' column to analyse.

    Raises:
        ValueError: if the ticker has no price data, no tweets, or too few
            overlapping weeks to train and predict.
    """
    # Do all data work first so a failure does not leave an empty figure open
    mean_stock_prices_per_week = _weekly_mean_prices(stock_name)
    if mean_stock_prices_per_week.empty:
        raise ValueError(f"No stock price data available for stock ticker {stock_name!r}.")
    mean_sentiments_per_week = _weekly_mean_sentiment(stock_name)

    # Find the week closest to one month before the last available week.
    # Index.get_loc(..., method='nearest') no longer exists in pandas 2.x;
    # get_indexer is the supported way to do a nearest lookup.
    price_index = mean_stock_prices_per_week.index
    end_date = price_index.max() - pd.DateOffset(months=1)
    intersection_index = price_index.get_indexer([end_date], method='nearest')[0]
    red_line_date = price_index[intersection_index]

    predicted_prices = _forecast_prices(mean_sentiments_per_week, mean_stock_prices_per_week, red_line_date)

    # Graphing
    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Plot stock price
    color = 'tab:blue'
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Weekly Mean Stock Prices', color=color)
    ax1.plot(price_index, mean_stock_prices_per_week.values, color=color, linewidth=2.0, label='Weekly Mean Stock Prices')
    ax1.tick_params(axis='y', labelcolor=color)

    # Add weekly average sentiment scores on a second y-axis
    ax2 = ax1.twinx()
    color = 'tab:green'
    ax2.set_ylabel('Weekly Mean Sentiment Score', color=color)
    ax2.plot(mean_sentiments_per_week.index, mean_sentiments_per_week.values, color=color, label='Weekly Mean Sentiment Score')
    ax2.tick_params(axis='y', labelcolor=color)

    # Plot the predicted stock prices after the red line (end date)
    ax1.plot(predicted_prices.index, predicted_prices.values, color='orange', linewidth=2.0, label='Predicted Mean Weekly Stock Prices')

    # Plot the red dashed line marking the end of the training data
    ax1.axvline(x=red_line_date, color='red', linestyle='--', linewidth=2.0, label='End Date of Model Training')

    # Set graph x-axis limits (removes margins/white space)
    ax1.set_xlim(price_index[0], price_index[-1])

    # Combine legend for both axes
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc='upper left')

    # Set the title before tight_layout so the layout accounts for it
    ax1.set_title(f'Stock Price and Weekly Average Sentiment for {stock_name}')
    fig.tight_layout()
    plt.show()

# Main call function
def main():
    """Ask the user for a stock ticker and plot its prices against tweet sentiment.

    Prints the tickers found in the module-level ``stock_data``, reads one
    ticker from standard input and, if it is one of the listed tickers, calls
    ``plot_stock_and_weekly_sentiment`` for it. An unknown ticker only prints
    an error message.
    """
    unique_stock_names = stock_data['Stock Name'].unique()
    print("Possible Stock Ticker Names:", ' '.join(unique_stock_names))

    # Strip surrounding whitespace so input such as "TSLA " is not rejected
    desired_stock_ticker = input("\nEnter the name of the stock ticker you'd like to analyze: ").strip()
    if desired_stock_ticker not in unique_stock_names:
        print("Invalid stock ticker name. Please select from the list of possible stock ticker names.")
        return
    plot_stock_and_weekly_sentiment(desired_stock_ticker)


if __name__ == "__main__":
    main()
