In [3]:
!pip install yfinance -q
!pip install gensim -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [22]:
import numpy as np
import pandas as pd
import nltk
import spacy
import random
import requests
import time
import yfinance as yf
from datetime import timedelta


from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score,
                            precision_score,
                            recall_score,
                            f1_score)
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [5]:
nltk.download('stopwords')
nltk.download('vader_lexicon')
nlp = spacy.load("en_core_web_sm")

ENGLISH_STOP_WORDS = set( stopwords.words('english') ).union(set(ENGLISH_STOP_WORDS))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [61]:
def classification_metrics(y_train, pred_train, y_test, pred_test):
  print(f"Accuracy_train: {accuracy_score(y_train, pred_train)}")
  print(f"Precision_train: {precision_score(y_train, pred_train)}")
  print(f"Recall_train: {recall_score(y_train, pred_train)}")
  print(f"F1_train: {f1_score(y_train, pred_train)}")
  print("----------------------")
  print(f"Accuracy_test: {accuracy_score(y_test, pred_test)}")
  print(f"Precision_test: {precision_score(y_test, pred_test)}")
  print(f"Recall_test: {recall_score(y_test, pred_test)}")
  print(f"F1_test: {f1_score(y_test, pred_test)}")

# Best model

The best model is selected based on experiments conducted in https://github.com/AlexIvlev/analytics_platform/blob/main/notebooks/baseline/YP_2024_Reddit_baseline.ipynb


In [92]:
df = pd.read_parquet("/content/drive/MyDrive/datasets/reddit_parser_2024_12_06_prices.parquet")

In [93]:
def preprocess_data(df):
    df['target'] = np.where(df['price_1d'] > df['created_price'], 1, 0)

    drop_cols = ['id', 'title', 'url', 'created_utc', 'parsed_utc',
                 'text', 'parent_id', 'clean_text', 'processed_text',
                 'entities', 'tickers', 'price_1d', 'doc_embedding']

    df = df.drop(columns=drop_cols, errors='ignore')

    df.rename(columns={"processed_text_length": "text_length"}, errors='ignore', inplace=True)

    return df

df = preprocess_data(df)

In [95]:
feature_cols = [col for col in df.columns if col != 'target']
target_col = 'target'

categorical_features = ['subreddit', 'type', 'ticker']
numerical_features = [col for col in feature_cols if col not in categorical_features]

X_train, X_test, y_train, y_test = train_test_split(df[feature_cols], df[target_col], test_size=0.2, random_state=69)

In [96]:
best_parameters = {
    'n_estimators': 460,
    'learning_rate': 0.10270500950873558,
    'max_depth': 9,
    'min_samples_split': 10,
    'min_samples_leaf': 4,
    'subsample': 0.9995540338327519,
    'max_features': None
}

In [97]:
preprocessor = ColumnTransformer(
    transformers=[
        ('impute', SimpleImputer(strategy='constant', fill_value=0), ['num_comments']),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), categorical_features),
        ('num', 'passthrough', [col for col in numerical_features if col != 'num_comments'])
    ]
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(**best_parameters))
])

In [98]:
pipeline.fit(X_train, y_train)
pred_train = pipeline.predict(X_train)
pred_test = pipeline.predict(X_test)

classification_metrics(y_train, pred_train, y_test, pred_test)

Accuracy_train: 0.9873737373737373
Precision_train: 0.9885057471264368
Recall_train: 0.9858220211161388
F1_train: 0.9871620601117655
----------------------
Accuracy_test: 0.7713776722090261
Precision_test: 0.7767332549941246
Recall_test: 0.772196261682243
F1_test: 0.7744581136496778




In [99]:
import joblib
joblib.dump(pipeline, '/content/drive/MyDrive/datasets/reddit_gbc.pkl')

['/content/drive/MyDrive/datasets/reddit_gbc.pkl']

# Inference data pipeline

1. Run https://github.com/AlexIvlev/analytics_platform/blob/main/parsers/reddit/reddit_parser.py to obtain the initial set of data

In [45]:
df = pd.read_parquet("/content/drive/MyDrive/datasets/reddit_parser_2025-03-10_23-18-05.parquet")
df.shape

(16792, 11)

In [None]:
def preprocess_data(df):
  # Remove rows with irrelevant content
  df = df[~df['text'].isin(['', '[]', '[deleted]', '['])]

  df['text_length'] = df['text'].apply(len)

  # Sentiment Analysis
  from nltk.sentiment import SentimentIntensityAnalyzer
  sia = SentimentIntensityAnalyzer()
  df['sentiment_scores'] = df['text'].apply(lambda x: sia.polarity_scores(x).get('compound'))

  df['clean_text'] = df['text'].apply(lambda x: ' '.join([word.lower() for word in x.split() if word not in ENGLISH_STOP_WORDS]))

  # Text Processing (Lemmatization + Stemming)
  stemmer = PorterStemmer()

  def process_text(text):
      doc = nlp(text)
      processed_words = [stemmer.stem(token.lemma_) for token in doc if token.is_alpha]
      return " ".join(processed_words)

  df['processed_text'] = df['clean_text'].apply(process_text)

  df = df[~df['processed_text'].isin([''])]

  # Named Entity Recognition (NER)
  def get_entities(text):
      doc = nlp(text)
      return list(set(ent.text for ent in doc.ents if ent.label_ == 'ORG'))

  df['entities'] = df['text'].apply(get_entities)
  df = df[df['entities'].apply(lambda x: len(x) != 0)]

  # SEC Ticker List Filtering
  url = "https://www.sec.gov/files/company_tickers.json"
  headers = {"User-Agent": "john doe johndoe@gmail.com"}
  response = requests.get(url, headers=headers)
  response.raise_for_status()
  sec_data = response.json()

  sec_tickers = {entry['ticker'].upper() for entry in sec_data.values()}
  sec_tickers.add('^GSPC')

  def process_entities(entities):
      return [e.upper() if e != "S&P" else "^GSPC" for e in entities if e.upper() in sec_tickers]

  df['tickers'] = df['entities'].apply(process_entities)
  df = df[df['tickers'].apply(lambda x: len(x) != 0)]

  # Select random ticker from tickers as the 'main' ticker
  df['ticker'] = df['tickers'].apply(lambda x: random.choice(x))


  # Doc2Vec Embeddings
  df['id'] = df['id'].astype(str)

  tagged_data = [TaggedDocument(words=text.split(), tags=[doc_id]) for text, doc_id in zip(df['processed_text'], df['id'])]

  model = Doc2Vec(vector_size=100, window=5, min_count=2, workers=4, epochs=40)
  model.build_vocab(tagged_data)
  model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

  df['doc_embedding'] = df['id'].map(lambda x: model.dv[x])

  return df

df = preprocess_data(df)

## Add prices (do it at least 2 days after parsing)

In [48]:
df['created_utc'] = pd.to_datetime(df['created_utc'])
df = df.reset_index(drop=True)

df['created_price'] = np.nan
df['price_1d'] = np.nan

In [49]:
def get_ticker_price(ticker, target_time, price_type='Close', interval='30m', sleep_seconds=0):
  date = target_time.date()
  prices = yf.download(ticker, start=str(date), end=str(date + pd.Timedelta(days=1)), interval=interval, progress=False)

  if prices.empty:
    return 0

  prices.index = prices.index.tz_localize(None)

  prices['time_diff'] = abs(prices.index - target_time)
  closest_time = prices['time_diff'].idxmin()
  closest_row = prices.loc[closest_time]

  time.sleep(sleep_seconds)

  try:
    return closest_row[price_type, ticker]
  except KeyError:
    print("KeyError occurred, returning 0")
    return 0

In [50]:
def add_prices(df, price_col, date_col, plus_days=0):

  start_index = 0

  for i in range(start_index, df.shape[0], 100):
      print(i)
      df_tick_slice = df.iloc[i:i+100].copy()

      df_tick_slice[price_col] = df_tick_slice.apply(lambda x: get_ticker_price(x['ticker'], x[date_col] + pd.Timedelta(days=plus_days)) if pd.isnull(x[price_col]) else x[price_col], axis=1)

      df.loc[df_tick_slice.index, price_col] = df_tick_slice[price_col]

      # Save to parquet after each batch
      df.to_parquet('/content/drive/MyDrive/datasets/reddit_parser_2025_03_10_processed.parquet', index=False)


In [None]:
add_prices(df, 'created_price', 'created_utc', 0)
df = df[df['created_price'].notnull() & (df['created_price'] != 0)]
add_prices(df, 'price_1d', 'created_utc', 1)
df = df[df['price_1d'].notnull() & (df['price_1d'] != 0)]

In [54]:
df_inference = df.copy()

In [None]:
# df_2nd_stage.to_parquet('/content/drive/MyDrive/datasets/reddit_parser_2024_12_06_prices.parquet', index=False)

### Inference

In [70]:
df_inference['target'] = np.where(df_inference['price_1d'] > df_inference['created_price'], 1, 0)

In [71]:
def preprocess_data(df):
    df['target'] = np.where(df['price_1d'] > df['created_price'], 1, 0)

    drop_cols = ['id', 'title', 'url', 'created_utc', 'parsed_utc',
                 'text', 'parent_id', 'clean_text', 'processed_text',
                 'entities', 'tickers', 'price_1d', 'doc_embedding']

    df = df.drop(columns=drop_cols, errors='ignore')

    return df

df = preprocess_data(df_inference)

In [75]:
feature_cols = [col for col in df.columns if col != 'target']
target_col = 'target'

X_inf, y_inf = df[feature_cols], df[target_col]

In [100]:
pipeline = joblib.load('/content/drive/MyDrive/datasets/reddit_gbc.pkl')

In [101]:
pred_inference = pipeline.predict(X_inf)



In [102]:
  print(f"Accuracy_inference: {accuracy_score(y_inf, pred_inference)}")
  print(f"F1_inference: {f1_score(y_inf, pred_inference)}")

Accuracy_inference: 0.4121510673234811
F1_inference: 0.4371069182389937
