In [1]:
from tf_idf_new import PumpDetection
import pandas as pd
import numpy as np
import re

# This python notebook is to test the model on another dataset of tweets

We will be using the stock market dataset to detect any price anomalies (detecting potential pump using Isolation Forest) and test the model

We're detecting pump-and-dump schemes on Twitter by correlating NLP model predictions of manipulative tweets with stock market price anomalies identified via Isolation Forest. This analysis evaluates the model's effectiveness in capturing these anomalies, assesses false positive/negative rates, analyzes time-based patterns, and identifies key features and user behaviors associated with these schemes. We're aiming to quantify the model's impact on detecting market manipulation by comparing predicted pump-and-dump tweets to significant price fluctuations, while acknowledging potential challenges like causality vs. correlation, time lags, and data noise.

In [2]:
test_data = pd.read_csv(r"../data/stock_tweets_test_on_real_data.csv")
stock_data = pd.read_csv(r"../data/stock_yfinance_data_test_on_real_data.csv")

## Detecting Price Anomalies

In [3]:
from sklearn.ensemble import IsolationForest
def detect_pump_and_dump(stock_prices, tweets, lags=[1, 3, 5]):
    """Detects potential pump-and-dump schemes."""

    stock_prices['Date'] = pd.to_datetime(stock_prices['Date']) #convert to datetime
    tweets['Date'] = pd.to_datetime(pd.to_datetime(tweets['Date']).dt.date) #extract only the date

    for lag in lags:
        stock_prices[f'price_change_{lag}'] = stock_prices['Close'].pct_change(periods=lag) #Capital C
        stock_prices[f'volume_change_{lag}'] = stock_prices['Volume'].pct_change(periods=lag) #Capital V

    # Use all price and volume change columns for anomaly detection
    price_volume_features = [col for col in stock_prices.columns if 'change' in col]

    stock_prices['price_anomaly'] = IsolationForest().fit_predict(stock_prices[price_volume_features])

    merged_data = pd.merge(stock_prices, tweets, on=['Date', 'Stock Name'], how='inner')


    return merged_data

pump_dump_data = detect_pump_and_dump(stock_data, test_data, lags=[1, 3, 5])
pump_dump_data['price_anomaly'] = (pump_dump_data['price_anomaly'] == -1).astype(int)

In [4]:
pump_dump_data["price_anomaly"].value_counts()

price_anomaly
0    57285
1     6391
Name: count, dtype: int64

## NLP Model Prediction

In [5]:
rf_path = [
    r"model/gradient_boosting_balanced_best_w_smote_pipeline.joblib",
    r"model/gradient_boosting_balanced_best_w_smote_vectorizer.joblib"
]

model = PumpDetection(rf_path[0], rf_path[1])

In [6]:
pump_dump_data["model_prediction"] = model.predict(pump_dump_data["Tweet"])

Translation Error: No features in text.


In [7]:
pump_dump_data["model_prediction"].value_counts()

model_prediction
0    58637
1     5039
Name: count, dtype: int64

In [8]:
total_price_anomaly = pump_dump_data["price_anomaly"].sum()
predicted_pump_tweet = pump_dump_data[pump_dump_data["model_prediction"] == 1]
number_of_predicted_pump_tweet_resulted_in_price_anomaly = predicted_pump_tweet["price_anomaly"].sum()

percentage_captured = (number_of_predicted_pump_tweet_resulted_in_price_anomaly / total_price_anomaly) * 100
print(f"Total Price Anomaly: {total_price_anomaly}")
print(f"Price Anomaly Captured by Predicted Pump Tweets: {number_of_predicted_pump_tweet_resulted_in_price_anomaly}")
print(f"Percentage of Price Anomaly Captured: {percentage_captured:.2f}%")

Total Price Anomaly: 6391
Price Anomaly Captured by Predicted Pump Tweets: 538
Percentage of Price Anomaly Captured: 8.42%


In [9]:
predicted_pump_tweet.to_csv(r"../data/flagged_tweets_validation.csv")

In [10]:
predicted_pump_tweet

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock Name,price_change_1,volume_change_1,price_change_3,volume_change_3,price_change_5,volume_change_5,price_anomaly,Tweet,Company Name,model_prediction
10,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,,,,,,,0,Decided to take Uber Black to the service cent...,"Tesla, Inc.",1
16,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,,,,,,,0,Anyone @CNBC @jimcramer @Lebeaucarnews want to...,"Tesla, Inc.",1
26,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,,,,,,,0,Has anyone ever published a credible explanati...,"Tesla, Inc.",1
27,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,,,,,,,0,Tesla Giga Berlin to Receive Final Approval fo...,"Tesla, Inc.",1
30,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,,,,,,,0,$TSLA China numbers starting to leak for Septe...,"Tesla, Inc.",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63623,2022-03-07,29.190001,29.950001,26.950001,27.209999,27.209999,19520200,XPEV,-0.078564,0.790910,-0.209471,1.004333,-0.251856,0.999344,1,"On March 28, 2022, we will announce our Q4 fin...",XPeng Inc.,1
63626,2022-03-14,20.020000,21.799999,18.010000,19.750000,19.750000,31384500,XPEV,-0.137931,0.268866,-0.310646,1.572226,-0.274164,0.607796,1,Norway's central bank boosts positions in NIO ...,XPeng Inc.,1
63645,2022-04-01,30.000000,30.150000,28.230000,29.190001,29.190001,14765800,XPEV,0.057992,0.601306,0.035106,0.075793,0.079113,0.053128,0,BREAKING: China considers allowing full US acc...,XPeng Inc.,1
63650,2022-05-02,24.340000,25.719999,24.070000,25.389999,25.389999,7735200,XPEV,0.031694,-0.290375,0.070405,-0.270993,0.048740,-0.118727,0,*NIO APRIL EV DELIVERES DECLINE ALMOST 50% M/M...,XPeng Inc.,1


In [11]:
false_positives = len(pump_dump_data[(pump_dump_data["model_prediction"] == 1) & (pump_dump_data["price_anomaly"] == 0)])
false_negatives = len(pump_dump_data[(pump_dump_data["model_prediction"] == 0) & (pump_dump_data["price_anomaly"] == 1)])

print(f"False Positives: {false_positives}")
print(f"False Negatives: {false_negatives}")

False Positives: 4501
False Negatives: 5853


In [12]:
correlation_matrix = pump_dump_data[["price_anomaly", "model_prediction"]].corr()
print(correlation_matrix)

                  price_anomaly  model_prediction
price_anomaly          1.000000          0.006243
model_prediction       0.006243          1.000000


In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

precision = precision_score(pump_dump_data["price_anomaly"], pump_dump_data["model_prediction"])
recall = recall_score(pump_dump_data["price_anomaly"], pump_dump_data["model_prediction"])
f1 = f1_score(pump_dump_data["price_anomaly"], pump_dump_data["model_prediction"])
roc_auc = roc_auc_score(pump_dump_data["price_anomaly"], pump_dump_data["model_prediction"])

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")
print(f"ROC AUC: {roc_auc:.2f}")

Precision: 0.11
Recall: 0.08
F1-score: 0.09
ROC AUC: 0.50


Your model evaluation strategy centers on correlating NLP-detected pump-and-dump tweets with significant stock market price anomalies, identified through Isolation Forest. This method moves beyond standard NLP metrics by directly linking tweet predictions to real-world market impact. We quantify the model's success by measuring the percentage of total price anomalies captured by its predictions, effectively assessing its ability to identify tweets that coincide with manipulative market activity.

Furthermore, we analyze false positives and false negatives to understand the model's precision and recall in this context. Time-based and user-based patterns are examined to provide deeper insights into when and where the model performs best, and which users are most associated with potential manipulation. Standard classification metrics like precision, recall, F1-score, and ROC AUC are also employed to provide a comprehensive evaluation.

This approach evaluates the models by assessing their ability to predict tweets that directly correlate with unusual market behavior, a strong indicator of manipulation. This provides a more tangible measure of the model's effectiveness in detecting pump-and-dump schemes compared to traditional NLP metrics, as it focuses on the real-world financial impact of the predicted tweets.