In [1]:
from tf_idf_new import PumpDetection
import pandas as pd
import numpy as np
import re

# This python notebook is to test the model on another dataset of tweets

We will be using the stock market dataset to detect any price anomalies (detecting potential pump using Isolation Forest) and test the model

We're detecting pump-and-dump schemes on Twitter by correlating NLP model predictions of manipulative tweets with stock market price anomalies identified via Isolation Forest. This analysis evaluates the model's effectiveness in capturing these anomalies, assesses false positive/negative rates, analyzes time-based patterns, and identifies key features and user behaviors associated with these schemes. We're aiming to quantify the model's impact on detecting market manipulation by comparing predicted pump-and-dump tweets to significant price fluctuations, while acknowledging potential challenges like causality vs. correlation, time lags, and data noise.

In [2]:
test_data = pd.read_csv(r"../data/stock_tweets_test_on_real_data.csv")
stock_data = pd.read_csv(r"../data/stock_yfinance_data_test_on_real_data.csv")

## Detecting Price Anomalies

In [5]:
from sklearn.ensemble import IsolationForest
def detect_pump_and_dump(stock_prices, tweets, lags=[1, 3, 5]):
    """Detects potential pump-and-dump schemes."""

    stock_prices['Date'] = pd.to_datetime(stock_prices['Date']) #convert to datetime
    tweets['Date'] = pd.to_datetime(pd.to_datetime(tweets['Date']).dt.date) #extract only the date

    for lag in lags:
        stock_prices[f'price_change_{lag}'] = stock_prices['Close'].pct_change(periods=lag) #Capital C
        stock_prices[f'volume_change_{lag}'] = stock_prices['Volume'].pct_change(periods=lag) #Capital V

    # Use all price and volume change columns for anomaly detection
    price_volume_features = [col for col in stock_prices.columns if 'change' in col]

    stock_prices['price_anomaly'] = IsolationForest().fit_predict(stock_prices[price_volume_features])

    merged_data = pd.merge(stock_prices, tweets, on=['Date', 'Stock Name'], how='inner')


    return merged_data

pump_dump_data = detect_pump_and_dump(stock_data, test_data, lags=[1, 3, 5])
pump_dump_data['price_anomaly'] = (pump_dump_data['price_anomaly'] == -1).astype(int)

In [6]:
pump_dump_data["price_anomaly"].value_counts()

price_anomaly
0    57901
1     5775
Name: count, dtype: int64

## NLP Model Prediction

In [7]:
rf_path = [
    r"model/random_forest_classification_pipeline.joblib",
    r"model/random_forest_classification_vectorizer.joblib"
]

model = PumpDetection(rf_path[0], rf_path[1])

In [8]:
pump_dump_data["model_prediction"] = model.predict(pump_dump_data["Tweet"])

Translation Error: No features in text.


In [10]:
pump_dump_data["model_prediction"].value_counts()

model_prediction
0    35653
1    28023
Name: count, dtype: int64

In [12]:
total_price_anomaly = pump_dump_data["price_anomaly"].sum()
predicted_pump_tweet = pump_dump_data[pump_dump_data["model_prediction"] == 1]
number_of_predicted_pump_tweet_resulted_in_price_anomaly = predicted_pump_tweet["price_anomaly"].sum()

percentage_captured = (number_of_predicted_pump_tweet_resulted_in_price_anomaly / total_price_anomaly) * 100
print(f"Total Price Anomaly: {total_price_anomaly}")
print(f"Price Anomaly Captured by Predicted Pump Tweets: {number_of_predicted_pump_tweet_resulted_in_price_anomaly}")
print(f"Percentage of Price Anomaly Captured: {percentage_captured:.2f}%")

Total Price Anomaly: 5775
Price Anomaly Captured by Predicted Pump Tweets: 2521
Percentage of Price Anomaly Captured: 43.65%


In [13]:
false_positives = len(pump_dump_data[(pump_dump_data["model_prediction"] == 1) & (pump_dump_data["price_anomaly"] == 0)])
false_negatives = len(pump_dump_data[(pump_dump_data["model_prediction"] == 0) & (pump_dump_data["price_anomaly"] == 1)])

print(f"False Positives: {false_positives}")
print(f"False Negatives: {false_negatives}")

False Positives: 25502
False Negatives: 3254


In [14]:
import matplotlib.pyplot as plt

# Assuming 'timestamp' column exists
pump_dump_data.plot(x="timestamp", y="price_anomaly", figsize=(12, 6))
plt.title("Price Anomaly Over Time")
plt.xlabel("Timestamp")
plt.ylabel("Price Anomaly")
plt.show()

KeyError: 'timestamp'

In [16]:
correlation_matrix = pump_dump_data[["price_anomaly", "model_prediction"]].corr()
print(correlation_matrix)

                  price_anomaly  model_prediction
price_anomaly          1.000000         -0.002259
model_prediction      -0.002259          1.000000


In [19]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

precision = precision_score(pump_dump_data["price_anomaly"], pump_dump_data["model_prediction"])
recall = recall_score(pump_dump_data["price_anomaly"], pump_dump_data["model_prediction"])
f1 = f1_score(pump_dump_data["price_anomaly"], pump_dump_data["model_prediction"])
roc_auc = roc_auc_score(pump_dump_data["price_anomaly"], pump_dump_data["model_prediction"])

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")
print(f"ROC AUC: {roc_auc:.2f}")

Precision: 0.09
Recall: 0.44
F1-score: 0.15
ROC AUC: 0.50


Your model evaluation strategy centers on correlating NLP-detected pump-and-dump tweets with significant stock market price anomalies, identified through Isolation Forest. This method moves beyond standard NLP metrics by directly linking tweet predictions to real-world market impact. We quantify the model's success by measuring the percentage of total price anomalies captured by its predictions, effectively assessing its ability to identify tweets that coincide with manipulative market activity.

Furthermore, we analyze false positives and false negatives to understand the model's precision and recall in this context. Time-based and user-based patterns are examined to provide deeper insights into when and where the model performs best, and which users are most associated with potential manipulation. Standard classification metrics like precision, recall, F1-score, and ROC AUC are also employed to provide a comprehensive evaluation.

This approach evaluates the models by assessing their ability to predict tweets that directly correlate with unusual market behavior, a strong indicator of manipulation. This provides a more tangible measure of the model's effectiveness in detecting pump-and-dump schemes compared to traditional NLP metrics, as it focuses on the real-world financial impact of the predicted tweets.