In [4]:
from tf_idf_test import RecommendationSystem
import pandas as pd
import numpy as np
import re

In [5]:
test_data = pd.read_csv(r"../data/stock_tweets_test_on_real_data.csv")
print(test_data.columns)
print(test_data.head())


Index(['Date', 'Tweet', 'Stock Name', 'Company Name'], dtype='object')
                        Date  \
0  2022-09-29 23:41:16+00:00   
1  2022-09-29 23:24:43+00:00   
2  2022-09-29 23:18:08+00:00   
3  2022-09-29 22:40:07+00:00   
4  2022-09-29 22:27:05+00:00   

                                               Tweet Stock Name Company Name  
0  Mainstream media has done an amazing job at br...       TSLA  Tesla, Inc.  
1  Tesla delivery estimates are at around 364k fr...       TSLA  Tesla, Inc.  
2  3/ Even if I include 63.0M unvested RSUs as of...       TSLA  Tesla, Inc.  
3  @RealDanODowd @WholeMarsBlog @Tesla Hahaha why...       TSLA  Tesla, Inc.  
4  @RealDanODowd @Tesla Stop trying to kill kids,...       TSLA  Tesla, Inc.  


In [18]:
lr_path = [
    r"../model_training/tfidf_training/lr_pipeline.pkl",
    r"../model_training/tfidf_training/lr_vectorizer.pkl"
]
gb_path = [
    r"../model_training/tfidf_training/gb_pipeline.pkl",
    r"../model_training/tfidf_training/gb_vectorizer.pkl"
]
lr_model = RecommendationSystem(lr_path[0], lr_path[1])
gb_model = RecommendationSystem(gb_path[0], gb_path[1])

In [7]:
test_data

Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc."
...,...,...,...,...
80788,2021-10-07 17:11:57+00:00,Some of the fastest growing tech stocks on the...,XPEV,XPeng Inc.
80789,2021-10-04 17:05:59+00:00,"With earnings on the horizon, here is a quick ...",XPEV,XPeng Inc.
80790,2021-10-01 04:43:41+00:00,Our record delivery results are a testimony of...,XPEV,XPeng Inc.
80791,2021-10-01 00:03:32+00:00,"We delivered 10,412 Smart EVs in Sep 2021, rea...",XPEV,XPeng Inc.


In [8]:
stock_data = pd.read_csv(r"../data/stock_yfinance_data_test_on_real_data.csv")
print(stock_data.columns)

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Stock Name'],
      dtype='object')


In [19]:
from sklearn.ensemble import IsolationForest
def detect_pump_and_dump(stock_prices, tweets, lags=[1, 3, 5]):
    """Detects potential pump-and-dump schemes."""

    stock_prices['Date'] = pd.to_datetime(stock_prices['Date']) #convert to datetime
    tweets['Date'] = pd.to_datetime(pd.to_datetime(tweets['Date']).dt.date) #extract only the date

    for lag in lags:
        stock_prices[f'price_change_{lag}'] = stock_prices['Close'].pct_change(periods=lag) #Capital C
        stock_prices[f'volume_change_{lag}'] = stock_prices['Volume'].pct_change(periods=lag) #Capital V

    # Use all price and volume change columns for anomaly detection
    price_volume_features = [col for col in stock_prices.columns if 'change' in col]

    stock_prices['price_anomaly'] = IsolationForest().fit_predict(stock_prices[price_volume_features])

    merged_data = pd.merge(stock_prices, tweets, on=['Date', 'Stock Name'], how='inner')


    return merged_data

pump_dump_data = detect_pump_and_dump(stock_data, test_data, lags=[1, 3, 5])

In [20]:
pump_dump_data["price_anomaly"].value_counts()

price_anomaly
 1    57369
-1     6307
Name: count, dtype: int64

In [21]:
lr_prediction = lr_model.predict(pump_dump_data["Tweet"])
gb_prediction = gb_model.predict(pump_dump_data["Tweet"])

pump_dump_data["lr_prediction"] = lr_prediction
pump_dump_data["gb_prediction"] = gb_prediction

In [26]:
print(pump_dump_data["gb_prediction"].value_counts())
print(pump_dump_data["lr_prediction"].value_counts())

gb_prediction
0    61488
1     2188
Name: count, dtype: int64
lr_prediction
0    52860
1    10816
Name: count, dtype: int64


In [23]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
def get_accuracy(df, model_type = "lr"):
    model_col = "gb_prediction"
    if model_type == "lr":
        model_col = "lr_prediction"
    pump_dump_data = df.copy()
    pump_dump_data['price_anomaly'] = pump_dump_data["price_anomaly"] == -1
    actual = pump_dump_data['price_anomaly']
    predicted = pump_dump_data[model_col]
    print(f"Precision: {precision_score(actual, predicted)}")
    print(f"Recall: {recall_score(actual, predicted)}")
    print(f"Accuracy: {accuracy_score(actual, predicted)}")
    print(f"F1 Score: {f1_score(actual, predicted)}")

In [24]:
get_accuracy(pump_dump_data, "lr")

Precision: 0.09606139053254438
Recall: 0.16473759315046774
Accuracy: 0.76372573654124
F1 Score: 0.12135723880161187


In [25]:
get_accuracy(pump_dump_data, "gb")

Precision: 0.09963436928702012
Recall: 0.034564769303947994
Accuracy: 0.8734374018468497
F1 Score: 0.051324308416715717
