In [None]:
# Dependencies
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from transformers import pipeline
from scipy.stats import entropy
import torch
from gnews import GNews
import yfinance as yf
from datetime import date,datetime, timedelta
import time
import re
import os
from tqdm import tqdm
import gc

### Data Loading

##### Data Loading of Stock tweets data

In [None]:
file_path0="/content/stock_tweets.csv"

In [None]:
df0 = pd.read_csv(file_path0)
df0.head()

In [None]:
df0.shape

In [None]:
df0.info()

In [None]:
df0.describe(include="all")

In [None]:
df0.columns

In [None]:
# Ascending order (earliest first)
print("Five earliest dates:")
print(df0['Date'].sort_values().head(5))

# Descending order (latest first)
print("\nFive latest dates:")
print(df0['Date'].sort_values(ascending=False).head(5))

###### The tweet data spans September 2021 to September 2022.

##### Data Loading of Stock OHLCV data

In [None]:
file_path="/content/stock_yfinance_data.csv"

In [None]:
df = pd.read_csv(file_path)
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe(include='all')

In [None]:
df.columns

In [None]:
print("Data index")
print("\nPrint ascending order dates")
print(df['Date'].sort_values().head(5))
print("\nPrint descending order dates")
print(df['Date'].sort_values(ascending=False).head(5))

##### The OHLCV data spans September 2021 to September 2022.

### EDA

###### EDA on Stock Tweets data

In [None]:
# Convert date column into standardized form
df0['Date'] = pd.to_datetime(df0['Date'], utc=True).dt.tz_localize(None).dt.normalize()

In [None]:
df0['Company Name'] = df0['Company Name'].str.replace(' ', '')
df0['Stock Name'] = df0['Stock Name'].astype('string').str.upper()

In [None]:
df0.info()

In [None]:
# Missing
print(df0.isnull().sum())

In [None]:
# Cardinality
print("Unique dates:", df0['Date'].nunique())
print("Unique tickers:", df0['Stock Name'].nunique())
print(df0['Stock Name'].value_counts())

In [None]:
# Number of tweets per ticker (value_counts orders them most-frequent first)
ticker_counts = df0['Stock Name'].value_counts()
ticker_labels, tweet_totals = ticker_counts.index, ticker_counts.values

plt.figure(figsize=(12, 6))
plt.plot(ticker_labels, tweet_totals, marker='o')

# Labelling
plt.xticks(rotation=45)
plt.ylabel("Number of Tweets")
plt.xlabel("Stock Name")
plt.title("Tweet Count per Ticker (Line Plot)")

plt.tight_layout()
plt.show()

In [None]:
# Class imbalance: downsample every ticker to the size of the rarest ticker.
before_counts = df0['Stock Name'].value_counts()
min_count = before_counts.min()


def _downsample_ticker(ticker_rows):
    """Draw `min_count` rows from one ticker's tweets using a fixed seed."""
    return ticker_rows.sample(min_count, random_state=42)


df_balanced = df0.groupby('Stock Name', group_keys=False).apply(_downsample_ticker)
balanced_counts = df_balanced['Stock Name'].value_counts()

plt.figure(figsize=(12, 5))
plt.bar(balanced_counts.index, balanced_counts.values)
plt.xticks(rotation=45)
plt.xlabel("Stock Name")
plt.ylabel("Number of Tweets")
plt.title("Class Distribution After Balancing (Downsampling)")
plt.tight_layout()
plt.show()

In [None]:
df0['tweet_len'] = df0['Tweet'].str.len()

plt.figure(figsize=(10,4))
sns.histplot(df0['tweet_len'], bins=50, kde=True)
plt.title("Tweet length distribution")
plt.show()

In [None]:
tweets_per_day = df0.groupby('Date').size()

plt.figure(figsize=(12,4))
tweets_per_day.plot()
plt.title("Tweets per day")
plt.ylabel("Count")
plt.show()

##### EDA on Stock OHLCV data

In [None]:
# Convert date column into standardized form
df['Date'] = pd.to_datetime(df['Date'], utc=True).dt.tz_localize(None).dt.normalize()

In [None]:
#Convert Variables into standard format
df['Stock Name'] = df['Stock Name'].astype('string').str.upper()

In [None]:
df.isnull().sum()

In [None]:
# Data-quality check: flag tickers that contain an embedded space.
# `str.contains` returns a boolean mask, so filter on the mask itself; the
# previous comparison against the *string* 'True' could never match and always
# printed an empty frame. Missing tickers (NA) are treated as "no space".
df['has_space'] = df['Stock Name'].str.contains(' ', regex=False)
print(df[df['has_space'].fillna(False)])

In [None]:
# One histogram (with KDE overlay) per OHLCV column, laid out on a 3x2 grid.
ohlcv_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

plt.figure(figsize=(12, 6))
for panel, column in enumerate(ohlcv_columns, start=1):
    plt.subplot(3, 2, panel)
    sns.histplot(df[column], kde=True)
    plt.xlabel(column)
plt.tight_layout()
plt.show()

In [None]:
# Remove rows that fall outside Tukey's fences (1.5 * IQR) on any numeric column.
#
# The fences are computed once, on the unfiltered data, and applied as a single
# combined mask. Previously the quartiles were re-estimated on the frame already
# trimmed by earlier columns, so the outcome depended on column order, and the
# lower fence was exclusive while the upper fence was inclusive.
df_outliers = df.copy()
# Kept as a DataFrame of the numeric columns: a later cell calls `.corr()` on it.
numerical_columns = df_outliers.select_dtypes(include=['int64', 'float64'])

Q1 = numerical_columns.quantile(0.25)
Q3 = numerical_columns.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is kept only if every numeric value lies inside its column's fences
# (rows with NaN in a numeric column compare False and are dropped, as before).
within_fences = (
    numerical_columns.ge(lower_bound, axis=1)
    & numerical_columns.le(upper_bound, axis=1)
).all(axis=1)
df_outliers = df_outliers[within_fences]
print(f"Number of outliers removed:{len(df) - len(df_outliers)}")

In [None]:
# Same 3x2 histogram grid as before, now on the outlier-filtered frame.
filtered_panels = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

plt.figure(figsize=(12, 6))
for slot, field in enumerate(filtered_panels, start=1):
    plt.subplot(3, 2, slot)
    sns.histplot(df_outliers[field], kde=True)
    plt.xlabel(field)
plt.tight_layout()
plt.show()

In [None]:
counts = df_outliers['Stock Name'].value_counts().sort_index()

In [None]:
# Horizontal bar chart: tickers on the y-axis, row counts on the x-axis.
# The axis labels previously described the opposite orientation.
plt.figure(figsize=(6, 12))
sns.barplot(y=counts.index, x=counts.values, color='skyblue', edgecolor='black')
plt.xlabel('Count')
plt.ylabel('Stock Name')
plt.title('Stock Name Distribution')
plt.tight_layout()
plt.show()

In [None]:
corr_matrix = numerical_columns.corr()
print("Correlation matrix")
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of OHCLV Data')
plt.show()

#### The price columns are highly correlated, but we do not remove this multicollinearity: OHLC prices of the same stock are expected to be highly collinear, and they are retained for stock prediction.

In [None]:
# Sweep k = 2..10 on the adjusted close price and record both quality metrics.
n_clusters = range(2, 11)
inertia_scores, silhouette_scores = [], []
adj_close = df[['Adj Close']]

for n_cluster in n_clusters:
    kmeans = KMeans(n_clusters=n_cluster, random_state=42).fit(adj_close)
    inertia_scores += [kmeans.inertia_]
    silhouette_scores += [silhouette_score(adj_close, kmeans.labels_)]

print(f"Inertia Scores:{inertia_scores}")
print(f"Silhouette Scores:{silhouette_scores}")

In [None]:
# Side-by-side curves of both clustering metrics against the number of clusters.
metric_panels = [
    (1, silhouette_scores, 'Silhouette Scores'),
    (2, inertia_scores, 'Inertia Scores'),
]

plt.figure(figsize=(12, 6))
for position, scores, label in metric_panels:
    plt.subplot(2, 2, position)
    plt.plot(n_clusters, scores, marker='o', color='blue')
    plt.xlabel('Number of Clusters')
    plt.ylabel(label)
    plt.title(f'{label} vs Number of Clusters')

plt.tight_layout()
plt.show()

In [None]:
# TODO: decide how the clustering results should be used in the pipeline


In [None]:
# Emotion-classification setup: choose the compute device, then load both
# Hugging Face text-classification pipelines (Emtract and Hartmann).
print("=" * 70)
print("EMOTION CLASSIFICATION - DEVICE DETECTION")
print("=" * 70)

# Device index passed to `pipeline(...)`: 0 = first CUDA GPU, -1 = CPU
device = 0 if torch.cuda.is_available() else -1

if device == 0:
    gpu_name = torch.cuda.get_device_name(0)
    # total_memory is reported in bytes; convert to (decimal) gigabytes
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU DETECTED: {gpu_name}")
    print(f"  Memory: {gpu_memory:.2f} GB")
    device_type = f"GPU: {gpu_name}"
    # Release cached CUDA memory before the models are loaded
    torch.cuda.empty_cache()
else:
    print("GPU not available, using CPU")
    device_type = "CPU"

print(f"Processing device: {device_type}")
print("=" * 70)

print("\nLoading emotion classification models...")
print("-" * 70)

# Load Emtract model (DistilBERT - faster)
# top_k=None asks the pipeline for the score of every label, not just the best;
# half precision (float16) is requested only when running on the GPU.
print("Loading Emtract (DistilBERT) model...")
emtract_classifier = pipeline(
    'text-classification',
    model='vamossyd/emtract-distilbert-base-uncased-emotion',
    top_k=None,
    device=device,
    model_kwargs={'torch_dtype': torch.float16} if device == 0 else {}
)
print(f"Emtract model loaded on {device_type}")

# Load Hartmann model (RoBERTa - more detailed)
# Same configuration as above so the two models' outputs are comparable.
print("Loading Hartmann (RoBERTa-large) model...")
hartmann_classifier = pipeline(
    'text-classification',
    model='j-hartmann/emotion-english-roberta-large',
    top_k=None,
    device=device,
    model_kwargs={'torch_dtype': torch.float16} if device == 0 else {}
)
print(f"Hartmann model loaded on {device_type}")
print("-" * 70)

def get_all_scores(tweet, classifier, model_name):
    """Classify one tweet and return its top emotion plus every label score.

    Args:
        tweet: Text to classify. Anything that is not a string (None, NaN, ...)
            is reported as 'UNKNOWN' without calling the classifier.
        classifier: Callable taking the tweet and returning label/score dicts.
            The result may be a flat list of dicts, a list wrapped in another
            list, or a single dict; all three shapes are accepted.
        model_name: Prefix used for the output keys.

    Returns:
        dict with `<model_name>_Emotion` (label with the highest score) and one
        `<model_name>_<label>_Score` entry per label. If classification fails
        or yields no scores, only `<model_name>_Emotion: 'ERROR'` is returned.
    """
    emotion_key = f'{model_name}_Emotion'

    # NaN/None are never str, so this single check covers missing values too.
    if not isinstance(tweet, str):
        return {emotion_key: 'UNKNOWN'}

    try:
        results = classifier(tweet)
        # Peel off wrapper lists until we reach the list of label dicts.
        while isinstance(results, list) and results and isinstance(results[0], list):
            results = results[0]
        if isinstance(results, dict):
            results = [results]
        # Pick the best label by score instead of trusting the result order.
        top = max(results, key=lambda item: item['score'])
    except Exception as e:
        print(f"Error classifying tweet: {e}")
        return {emotion_key: 'ERROR'}

    output = {emotion_key: top['label']}
    for item in results:
        output[f'{model_name}_{item["label"]}_Score'] = item['score']

    return output

def _flatten_scores(raw):
    """Normalize one pipeline result to a flat list of {'label', 'score'} dicts."""
    # A single text's scores may arrive as a dict, a list of dicts, or a list
    # wrapped in another list depending on how the pipeline was invoked.
    while isinstance(raw, list) and raw and isinstance(raw[0], list):
        raw = raw[0]
    if isinstance(raw, dict):
        return [raw]
    return list(raw)


def _scores_to_row(scores, model_name):
    """Build one output row: the top emotion plus one score column per label."""
    top = max(scores, key=lambda item: item['score'])
    row = {f'{model_name}_Emotion': top['label']}
    for item in scores:
        row[f'{model_name}_{item["label"]}_Score'] = item['score']
    return row


def batch_classify(tweets, classifier, model_name, batch_size=96):
    """Classify every tweet and return a DataFrame of emotion scores.

    Args:
        tweets: Sequence of tweets exposing `.tolist()` (typically a pandas
            Series). Non-string entries are not sent to the classifier.
        classifier: Text-classification pipeline called with `top_k=None`.
        model_name: Prefix for the output column names.
        batch_size: Number of tweets handed to the classifier per call.

    Returns:
        DataFrame with one row per input tweet, in input order, holding
        `<model_name>_Emotion` and `<model_name>_<label>_Score` columns.
        The emotion is 'UNKNOWN' for non-string tweets and 'ERROR' for tweets
        that could not be classified. When `tweets` is a pandas Series the
        result carries the same index, so it can be concatenated column-wise.

    Notes:
        Batches are classified without truncation. If a batch fails, each of
        its tweets is retried individually with `truncation=True`, so a single
        problematic tweet no longer turns the whole batch into 'ERROR'.
        Reads the module-level `device` to decide whether to clear the CUDA
        cache.
    """
    emotion_col = f'{model_name}_Emotion'
    tweets_list = tweets.tolist()

    # (position, text) pairs for real strings only; NaN/None are never str.
    valid = [(pos, text) for pos, text in enumerate(tweets_list) if isinstance(text, str)]

    # Keyed by input position so a retry can never create duplicate rows.
    rows = {}
    fallback_count = 0

    for i in tqdm(range(0, len(valid), batch_size), desc=f"Processing {model_name}"):
        chunk = valid[i:i + batch_size]
        try:
            # no_grad: inference only
            with torch.no_grad():
                batch_results = classifier([text for _, text in chunk], top_k=None)
            chunk_rows = {
                pos: _scores_to_row(_flatten_scores(raw), model_name)
                for (pos, _), raw in zip(chunk, batch_results)
            }
            if len(chunk_rows) != len(chunk):
                raise ValueError("classifier returned fewer results than inputs")
            rows.update(chunk_rows)
        except Exception as e:
            print(f"Error processing batch: {e}")
            fallback_count += len(chunk)
            # Retry one tweet at a time, truncating over-long inputs.
            for pos, text in chunk:
                try:
                    with torch.no_grad():
                        raw = classifier(text, truncation=True, top_k=None)
                    rows[pos] = _scores_to_row(_flatten_scores(raw), model_name)
                except Exception:
                    rows[pos] = {emotion_col: 'ERROR'}

        # Periodic GPU cache cleanup (every 10th batch)
        if device == 0 and (i // batch_size) % 10 == 0:
            torch.cuda.empty_cache()

    if fallback_count > 0:
        print(f"{fallback_count} tweets were re-classified one at a time with truncation after a batch failure")

    ordered_positions = sorted(rows)
    results_df = pd.DataFrame([rows[pos] for pos in ordered_positions], index=ordered_positions)
    # Guarantee the emotion column exists even when no tweet was classifiable.
    if emotion_col not in results_df.columns:
        results_df[emotion_col] = pd.Series(dtype=object)
    results_df = results_df.reindex(range(len(tweets_list))).fillna({emotion_col: 'UNKNOWN'})

    # Align with the caller's index so a column-wise concat matches rows.
    if isinstance(tweets, pd.Series):
        results_df.index = tweets.index
    return results_df

# Apply classifications: run both models over every tweet, append their score
# columns to a copy of df0, and persist the combined frame to parquet.
print(f"\nApplying emotion classification ({device_type})...")
print("-" * 70)
print(f"Processing FULL tweets WITHOUT truncation for maximum accuracy")
print("-" * 70)

print(f"[1/2] Classifying with Emtract model (batch_size=96, full tweet processing)...")
start_time = time.time()
# Work on a copy so the EDA frame (df0) is left unchanged
df0_all_emotions = df0.copy()
emtract_results = batch_classify(df0_all_emotions['Tweet'], emtract_classifier, 'Emtract', batch_size=96)
# Column-wise concat aligns on the index, so both frames must share it
df0_all_emotions = pd.concat([df0_all_emotions, emtract_results], axis=1)
emtract_time = time.time() - start_time
print(f"Emtract completed in {emtract_time:.2f} seconds ({len(df0_all_emotions)} tweets)")
print(f"Emtract results saved in DataFrame (columns: {len(emtract_results.columns)})")

# Delete the intermediate frame to free memory; its columns now live in df0_all_emotions
del emtract_results
if device == 0:
    torch.cuda.empty_cache()

print(f"[2/2] Classifying with Hartmann model (batch_size=96, full tweet processing)...")
start_time = time.time()
hartmann_results = batch_classify(df0_all_emotions['Tweet'], hartmann_classifier, 'Hartmann', batch_size=96)
df0_all_emotions = pd.concat([df0_all_emotions, hartmann_results], axis=1)
hartmann_time = time.time() - start_time
print(f"Hartmann completed in {hartmann_time:.2f} seconds ({len(df0_all_emotions)} tweets)")
print(f"Hartmann results saved in DataFrame (columns: {len(hartmann_results.columns)})")

# Delete the intermediate frame to free memory; its columns now live in df0_all_emotions
del hartmann_results

# Save results IMMEDIATELY after processing (data is now in df0_all_emotions)
# NOTE(review): written relative to the working directory, while a later cell
# reads '/content/stock_tweets_with_all_scores.parquet' — confirm these match.
print("\nSaving combined results to file...")
df0_all_emotions.to_parquet('stock_tweets_with_all_scores.parquet', index=False)
print(f"✓ Results saved: {df0_all_emotions.shape[0]} rows × {df0_all_emotions.shape[1]} columns")

print("-" * 70)
print("\n CLASSIFICATION COMPLETE")
print(f"  Device used: {device_type}")
print(f"  Total processing time: {emtract_time + hartmann_time:.2f} seconds")
print(f"  Tweets processed: {len(df0_all_emotions)}")
print(f"  Avg time per tweet: {((emtract_time + hartmann_time) / len(df0_all_emotions)):.4f}s")
print(f"  Processing mode: FULL tweets ")
print(f"  Output file: stock_tweets_with_all_scores.parquet")
print("=" * 70)

# Final GPU cleanup (AFTER saving)
if device == 0:
    torch.cuda.empty_cache()
gc.collect()

###### Load the sentiments in the stock tweets data

###### We load the dataset containing both the Hartmann and Emtract model scores and compare the two models with two fidelity-test metrics, Shannon entropy and valence bias, to validate the performance of our Emtract model.

In [None]:
df2 = pd.read_parquet('/content/stock_tweets_with_all_scores.parquet')
def compute_entropy(df, prefix):
    """Shannon entropy (in bits) of each row's score distribution for one model.

    Args:
        df: DataFrame holding `<prefix>..._Score` columns.
        prefix: Column-name prefix selecting the model, e.g. 'Emtract_'.

    Returns:
        (entropies, prob_cols): a 1-D array with one entropy value per row, and
        the alphabetically sorted list of score columns that were used. Each
        row is normalized to sum to 1 by `scipy.stats.entropy`; rows containing
        NaN scores yield NaN.

    Raises:
        ValueError: if no column starts with `prefix` and ends with '_Score'.
    """
    prob_cols = sorted(
        c for c in df.columns if c.startswith(prefix) and c.endswith('_Score')
    )
    if not prob_cols:
        raise ValueError(f"No '_Score' columns found for prefix {prefix!r}")

    probs = df[prob_cols].to_numpy(dtype=float)
    # Vectorized over rows instead of one Python-level call per row.
    return entropy(probs, base=2, axis=1), prob_cols

# Per-tweet entropy of each model's score distribution, stored next to the scores
emtract_entropy, emtract_cols = compute_entropy(df2, 'Emtract_')
df2['Emtract_Entropy'] = emtract_entropy

hartmann_entropy, hartmann_cols = compute_entropy(df2, 'Hartmann_')
df2['Hartmann_Entropy'] = hartmann_entropy

# Compare the two models on their mean entropy
avg_emtract_h, avg_hartmann_h = (
    df2[column].mean() for column in ('Emtract_Entropy', 'Hartmann_Entropy')
)
print(f"Average entropy of emtract model:{avg_emtract_h:.2f}")
print(f"Average entropy of hartmann model:{avg_hartmann_h:.2f}")

##### The average entropy of the Hartmann model is greater than that of the Emtract model: its probability mass is spread more evenly across emotions, so its predictions are less decisive and therefore less informative.

In [None]:
strong_signal_threshold = 0.5
# Canonical emotion -> label spellings used by the different models.
EMOTION_ALIASES = {
    "Happy": ["happy", "happiness", "joy"],
    "Sad": ["sad", "sadness", "unhappy"],
    "Anger": ["anger", "angry", "rage"],
    "Fear": ["fear", "afraid", "anxiety"],
    "Disgust": ["disgust", "disgusted"]
}
def find_emotion_columns(df2, prefix, emotions):
    """Return the `<prefix>_<label>_Score` columns belonging to `emotions`.

    Args:
        df2: DataFrame whose columns are searched.
        prefix: Model prefix the column must start with (e.g. "Emtract").
        emotions: Keys of EMOTION_ALIASES to look for.

    Returns:
        Matching column names, de-duplicated, in DataFrame column order.

    The label is extracted from between the prefix and the '_Score' suffix and
    compared to the aliases exactly (case-insensitively). Substring matching
    would, for example, count an 'unhappy' column as 'happy'.
    """
    suffix = "_Score"
    wanted = {alias for emo in emotions for alias in EMOTION_ALIASES[emo]}

    cols = []
    for col in df2.columns:
        if not col.startswith(prefix):
            continue
        if not col.endswith(suffix):
            continue

        # Strip the separator so both "Emtract" and "Emtract_" work as prefix.
        label = col[len(prefix):-len(suffix)].strip("_").lower()
        if label in wanted and col not in cols:
            cols.append(col)
    return cols


def compute_valence(df2, prefix):
    """Per-row valence for one model: positive score mass minus negative score mass.

    Returns a tuple `(valence, pos_cols, neg_cols)` where `valence` is the sum
    of the "Happy" score columns minus the sum of the "Sad", "Anger", "Fear"
    and "Disgust" score columns found for `prefix`.

    Raises:
        ValueError: if either the positive or the negative column set is empty.
    """
    negative_emotions = ["Sad", "Anger", "Fear", "Disgust"]

    pos_cols = find_emotion_columns(df2, prefix, ["Happy"])
    neg_cols = find_emotion_columns(df2, prefix, negative_emotions)

    if not pos_cols or not neg_cols:
        raise ValueError(f"Emotion columns missing for {prefix}")

    valence = df2[pos_cols].sum(axis=1) - df2[neg_cols].sum(axis=1)
    return valence, pos_cols, neg_cols
# Valence comparison of the two models: computes per-tweet valence, then
# reports signal intensity, actionable rate and contradiction rate.
print("Calculating Valence Scores")

df2['Emtract_Valence'], em_pos, em_neg = compute_valence(df2, "Emtract")
df2['Hartmann_Valence'], ha_pos, ha_neg = compute_valence(df2, "Hartmann")
# Signal intensity (mean absolute valence)
emtract_intensity = df2['Emtract_Valence'].abs().mean()
hartmann_intensity = df2['Hartmann_Valence'].abs().mean()
# Actionable rate: percentage of tweets whose |valence| exceeds the threshold
emtract_actionable = (
    df2['Emtract_Valence'].abs() > strong_signal_threshold
).mean() * 100

hartmann_actionable = (
    df2['Hartmann_Valence'].abs() > strong_signal_threshold
).mean() * 100
# Contradiction rate: tweets where the two models have opposite valence signs,
# ignoring near-neutral values inside the (-0.1, 0.1) band
contradictions = df2[
    ((df2['Emtract_Valence'] > 0.1) & (df2['Hartmann_Valence'] < -0.1)) |
    ((df2['Emtract_Valence'] < -0.1) & (df2['Hartmann_Valence'] > 0.1))
]
contradiction_rate = (len(contradictions) / len(df2)) * 100

# Print results
print("\n MODEL UTILITY COMPARISON")

print("\n SIGNAL INTENSITY (Avg Absolute Valence Score: 0–1)")
print("(Higher = More decisive emotions)")
print(f"Emtract:  {emtract_intensity:.4f}")
print(f"Hartmann: {hartmann_intensity:.4f}")

print(f"\n ACTIONABLE RATE (|Valence| > {strong_signal_threshold})")
print("(Higher = More potential trading triggers)")
print(f"Emtract:  {emtract_actionable:.2f}%")
print(f"Hartmann: {hartmann_actionable:.2f}%")

print("\n CONTRADICTION RATE")
print("(How often do they completely disagree?)")
print(f"{contradiction_rate:.2f}% of tweets have opposite sentiments.")

In [None]:
# Calculating the average valence of the model
avg_emtract_valence = df2['Emtract_Valence'].mean()
avg_hartmann_valence = df2['Hartmann_Valence'].mean()

print(f"Average Emtract Valence (Bias): {avg_emtract_valence:.4f}")
print(f"Average Hartmann Valence (Bias): {avg_hartmann_valence:.4f}")

###### The average valence of the Hartmann model is more negative than that of our Emtract model, i.e. it is more negatively biased; we therefore conclude that the Emtract model is better suited for stock prediction.

In [None]:
# Work on a copy so the EDA frame (df0) is left unchanged
df0_model = df0.copy()

# Loading the emtract model for consumption in data pipeline
print("=" * 70)
print("EMOTION CLASSIFICATION - SIMPLIFIED PIPELINE")
print("=" * 70)

# Device index passed to `pipeline(...)`: 0 = first CUDA GPU, -1 = CPU
device = 0 if torch.cuda.is_available() else -1

if device == 0:
    gpu_name = torch.cuda.get_device_name(0)
    print(f" GPU DETECTED: {gpu_name}")
    torch.cuda.empty_cache()
else:
    print(" Using CPU")

# Unlike the comparison pipelines, no top_k is set at construction time;
# half precision (float16) is requested only when running on the GPU.
emotion_classifier = pipeline('text-classification',
                             model='vamossyd/emtract-distilbert-base-uncased-emotion',
                             device=device,
                             model_kwargs={'torch_dtype': torch.float16} if device == 0 else {})

def batch_classify_emotions(tweets, classifier, batch_size=128):
    """Return the top emotion label for every tweet, in input order.

    Args:
        tweets: Sequence of texts exposing `.tolist()` (typically a pandas Series).
        classifier: Text-classification pipeline; called with `top_k=None` and
            `truncation=True` so over-long texts are truncated, not rejected.
        batch_size: Number of texts handed to the classifier per call.

    Returns:
        list[str] with one entry per input: the highest-scoring label,
        'UNKNOWN' for non-string inputs, or 'ERROR' if classification failed.

    If a whole batch fails, its texts are retried one at a time so that a
    single problematic text does not mark the entire batch as 'ERROR'.
    Reads the module-level `device` to decide whether to clear the CUDA cache.
    """
    tweets_list = tweets.tolist()

    # (position, text) pairs for real strings only; NaN/None are never str.
    valid = [(pos, text) for pos, text in enumerate(tweets_list) if isinstance(text, str)]

    # Default for anything that is never classified
    results = ['UNKNOWN'] * len(tweets_list)

    def top_label(result):
        """Highest-scoring label of one pipeline result, or 'ERROR'."""
        while isinstance(result, list) and result and isinstance(result[0], list):
            result = result[0]
        if isinstance(result, dict):
            return result.get('label', 'ERROR')
        if isinstance(result, list) and result:
            return max(result, key=lambda item: item['score'])['label']
        return 'ERROR'

    print(f"\nProcessing {len(valid)} tweets in batches of {batch_size}...")
    for i in tqdm(range(0, len(valid), batch_size), desc="Classifying emotions"):
        chunk = valid[i:i + batch_size]
        try:
            with torch.no_grad():
                batch_results = classifier(
                    [text for _, text in chunk], top_k=None, truncation=True
                )
            for (pos, _), result in zip(chunk, batch_results):
                results[pos] = top_label(result)
        except Exception as e:
            print(f"⚠ Error in batch {i}: {str(e)}")
            # Retry individually so only the texts that really fail get 'ERROR'.
            for pos, text in chunk:
                try:
                    with torch.no_grad():
                        results[pos] = top_label(
                            classifier(text, top_k=None, truncation=True)
                        )
                except Exception:
                    results[pos] = 'ERROR'

        # GPU memory cleanup (every 10th batch)
        if device == 0 and (i // batch_size) % 10 == 0:
            torch.cuda.empty_cache()

    return results

# Apply emotion classification to dataset tweets, report a summary and save
# the labelled frame to parquet.
print(f"\nDataset shape: {df0_model.shape}")
print(f"Tweet column: 'Tweet'")

start_time = time.time()
df0_model['emotion'] = batch_classify_emotions(df0_model['Tweet'], emotion_classifier, batch_size=128)
classification_time = time.time() - start_time

# Calculate statistics
emotion_counts = df0_model['emotion'].value_counts()
print("\n" + "=" * 70)
print("CLASSIFICATION SUMMARY")
print("=" * 70)
print(f"\n Classification completed in {classification_time:.2f} seconds")
print(f"  Processing speed: {len(df0_model)/classification_time:.2f} tweets/second")
print(f"\nEmotion Distribution:")
print(emotion_counts)

# Save the new dataset with emotions
# NOTE(review): written relative to the working directory, while the next cell
# reads '/content/stock_tweets_with_emotions.parquet' — confirm these match.
df0_model.to_parquet('stock_tweets_with_emotions.parquet', index=False)
print(f"\n Dataset saved: stock_tweets_with_emotions.parquet")
print(f"  Rows: {len(df0_model)}, Columns: {len(df0_model.columns)}")
print("=" * 70)

# Final GPU cleanup
if device == 0:
    torch.cuda.empty_cache()
gc.collect()

In [None]:
# Loading the dataset with emotions loaded by emtract model
df0_final = pd.read_parquet("/content/stock_tweets_with_emotions.parquet")

In [None]:
# Print the new columns
df0_final.columns

In [None]:
# Print the structure of new dataset
df0_final.describe(include="all")

In [None]:
# Emotion distribution
sns.set(style="white")
plt.figure(figsize=(8,4))
df0_final['emotion'].value_counts().plot(kind='bar')
plt.title("Emotion distribution")
plt.ylabel("Tweets")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Stacked emotion counts for the most-tweeted tickers.
top_n = 5
ticker_volume = df0_final['Stock Name'].value_counts()
top_tickers = ticker_volume.head(top_n).index

# Ticker x emotion contingency table, restricted to the top tickers
emo_ticker = pd.crosstab(df0_final['Stock Name'], df0_final['emotion']).loc[top_tickers]

# Row-normalized shares (each ticker's emotions sum to 1)
row_totals = emo_ticker.sum(axis=1)
emo_ticker_pct = emo_ticker.div(row_totals, axis=0)

fig, ax = plt.subplots(figsize=(12, 6))
emo_ticker.plot(kind='bar', stacked=True, ax=ax)
plt.title("Emotions per ticker (top tickers)")
plt.ylabel("Tweets")
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
df0_final.info()

In [None]:
# Merge tweets (with emotions) and OHLCV rows on date and ticker.
# An inner join keeps only (Date, Stock Name) pairs present in both frames.
merged_df = pd.merge(
    df0_final,
    df,
    on=['Date','Stock Name'],
    how='inner'
)

# Drop helper columns that were only created for EDA. errors='ignore' keeps
# this cell runnable when an EDA cell was skipped or the cell is re-run.
merged_df = merged_df.drop(columns=['has_space','tweet_len'], errors='ignore')
# Display top 5 in the merged dataset
merged_df.head()

In [None]:
# Display the columns in merged dataset
merged_df.columns

In [None]:
# Validate the rows in the dataset
merged_df.shape

In [None]:
# Validate whether there is null values in the merged dataset
print(merged_df.isnull().sum())

In [None]:
merged_df.to_parquet('merged_stock_tweets_yfinance.parquet', index=False)

In [None]:
# Download daily OHLCV data from 2022-10-01 up to today for every ticker and
# export it as one flat CSV.
tickers = ["TSLA", "TSM", "AAPL", "AMZN", "MSFT", "PG", "NIO", "META", "AMD",
           "NFLX", "GOOG", "PYPL", "DIS", "BA", "COST", "INTC", "KO", "CRM",
           "XPEV", "ENPH", "ZS", "VZ", "BX", "F", "NOC"]

all_ticker_data = []
today = date.today().strftime('%Y-%m-%d')
for ticker in tickers:
    print(f"Downloading data for {ticker}...")
    # auto_adjust=False keeps the separate 'Adj Close' column selected below
    data = yf.download(ticker, start="2022-10-01", end=today,
                       auto_adjust=False, progress=False)

    if not data.empty:
        # Flatten multi-level columns to their first level (the field names)
        if isinstance(data.columns, pd.MultiIndex):
            data.columns = data.columns.get_level_values(0)

        data = data.reset_index()
        data['Stock Name'] = ticker

        print(f"  Columns: {data.columns.tolist()}")
        all_ticker_data.append(data)
    else:
        print(f"No data found for {ticker}")

# pd.concat fails with an opaque "No objects to concatenate" on an empty list;
# stop with an actionable message instead.
if not all_ticker_data:
    raise RuntimeError(
        "No OHLCV data was downloaded for any ticker; "
        "check network access and the ticker list."
    )

df_flat = pd.concat(all_ticker_data, ignore_index=True)

df_flat = df_flat[['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Stock Name']]

#  Export to CSV
df_flat.to_csv("ohlcv_data.csv", index=False)
print("\n SUCCESS: File saved as ohlcv_data.csv")
print(f"Shape: {df_flat.shape}")
print("\nFirst few rows:")
print(df_flat.head())

In [None]:
# --- Configuration ---
# Output file written by main()
CSV_FILE = "cleaned_stock_news_2022_2026.csv"

# Date range
# END_DATE is evaluated when this cell runs, so results change from run to run
START_DATE = datetime(2022, 10, 1)
END_DATE = datetime.now()

# NOTE(review): this list contains NVDA and PLTR, whereas the OHLCV download
# cell uses PG and NIO in those two positions. News for tickers without price
# data (and vice versa) cannot be joined later — confirm which list is intended.
tickers = ["TSLA", "TSM", "AAPL", "AMZN", "MSFT", "NVDA", "PLTR", "META", "AMD",
           "NFLX", "GOOG", "PYPL", "DIS", "BA", "COST", "INTC", "KO", "CRM",
           "XPEV", "ENPH", "ZS", "VZ", "BX", "F", "NOC"]

# Map tickers to full company names
TICKER_NAMES = {
    "TSLA": "Tesla Inc", "TSM": "Taiwan Semiconductor Manufacturing Company",
    "AAPL": "Apple Inc", "AMZN": "Amazon.com Inc", "MSFT": "Microsoft Corporation",
    "NVDA": "NVIDIA Corporation", "PLTR": "Palantir Technologies Inc",
    "META": "Meta Platforms Inc", "AMD": "Advanced Micro Devices Inc",
    "NFLX": "Netflix Inc", "GOOG": "Alphabet Inc", "PYPL": "PayPal Holdings Inc",
    "DIS": "The Walt Disney Company", "BA": "The Boeing Company",
    "COST": "Costco Wholesale Corporation", "INTC": "Intel Corporation",
    "KO": "The Coca-Cola Company", "CRM": "Salesforce Inc",
    "XPEV": "XPeng Inc", "ENPH": "Enphase Energy Inc", "ZS": "Zscaler Inc",
    "VZ": "Verizon Communications Inc", "BX": "Blackstone Inc",
    "F": "Ford Motor Company", "NOC": "Northrop Grumman Corporation"
}

# Junk keywords to filter out automated/low-quality content
# Matched as lowercase substrings, so "filing" already covers "sec filing"
JUNK_KEYWORDS = [
    "overbought", "oversold", "form 4", "insider buy", "insider sell",
    "filing", "sec filing", "nasdaq:", "nyse:", "earnings call transcript",
    "stock alert", "stock pick", "buy now", "top stocks", "click here",
    "subscribe", "sign up", "newsletter", "advertisement", "sponsored",
    "price target", "technical analysis", "moved up", "moved down",
    "unusual options", "options activity", "short interest", "stock screener",
    "benzinga", "marketbeat", "tipranks", "stock rover"
]

# Minimum word count for meaningful content
MIN_WORD_COUNT = 15

# --- Helper Functions ---

def scrub_text(text):
    """Strip HTML tags and URLs, decode HTML entities and normalize whitespace.

    Args:
        text: Raw text; None, empty or non-string values yield "".

    Returns:
        The cleaned, single-spaced, stripped string.

    Tags are removed before URLs: the URL pattern consumes everything up to the
    next whitespace, so running it first on `<a href="http://...">text</a>`
    swallows the closing part of the tag and leaves a dangling `<a href="`.
    Entities are decoded (`&amp;` -> `&`) rather than deleted, so names such as
    "AT&T" survive. Decoding happens after tag removal so an escaped `&lt;` is
    never mistaken for markup.
    """
    import html  # stdlib; local import keeps this helper self-contained

    if not text or not isinstance(text, str):
        return ""
    text = re.sub(r"<[^>]+>", "", text)
    text = re.sub(r"http\S+", "", text)
    text = html.unescape(text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def is_quality_content(text):
    """Return True when `text` is long enough and free of junk keywords.

    Empty text, text with fewer than MIN_WORD_COUNT whitespace-separated words,
    and text containing any JUNK_KEYWORDS entry (case-insensitive substring
    match) are rejected.
    """
    if not text:
        return False

    # Length gate first: it is the cheaper of the two checks
    enough_words = len(text.split()) >= MIN_WORD_COUNT
    if not enough_words:
        return False

    lowered = text.lower()
    return not any(keyword in lowered for keyword in JUNK_KEYWORDS)

def split_date_range(start_date, end_date, months_per_chunk=3):
    """Cut [start_date, end_date] into consecutive windows of ~`months_per_chunk` months.

    A month is approximated as 30 days. Each window is a `(start, end)` tuple;
    the next window starts one day after the previous window's end, and the
    last window is clipped to `end_date`. Returns an empty list when
    `start_date` is not before `end_date`.
    """
    windows = []
    cursor = start_date

    while cursor < end_date:
        span = timedelta(days=30 * months_per_chunk)
        window_end = min(cursor + span, end_date)
        windows.append((cursor, window_end))
        cursor = window_end + timedelta(days=1)

    return windows

def fetch_news_gnews(ticker, company_name, start_date, end_date):
    """Collect GNews articles for one ticker, querying one date window at a time.

    The date range is split into 3-month windows and each window is queried
    with `max_results=100`. A window that raises is reported and skipped.
    Returns the concatenated list of article items returned by GNews.
    """
    collected = []
    # Search using both ticker and company name
    query = f"{ticker} OR {company_name} stock"

    for window_start, window_end in split_date_range(start_date, end_date, months_per_chunk=3):
        try:
            client = GNews(
                language='en',
                country='US',
                start_date=(window_start.year, window_start.month, window_start.day),
                end_date=(window_end.year, window_end.month, window_end.day),
                max_results=100  # Max per window
            )
            articles = client.get_news(query)

            if articles:
                collected.extend(articles)
                print(f"    -> Found {len(articles)} articles for {window_start.date()} to {window_end.date()}")

            # Small pause between windows
            time.sleep(0.5)

        except Exception as e:
            print(f"    -> Error for chunk {window_start.date()}: {e}")

    return collected

# --- Main Logic ---

def main():
    """Scrape, clean, filter and export news for every configured ticker.

    For each ticker the articles are fetched, title and description are
    combined and cleaned, and texts passing the quality filter are collected.
    The records are de-duplicated on text, sorted chronologically and written
    to CSV_FILE. The 'date' column keeps the original published-date strings.
    """
    final_data = []

    print("Starting Google News scraper with chunked date ranges...")
    print(f"Date range: {START_DATE.date()} to {END_DATE.date()}")
    print("Splitting into 3-month chunks to get MORE than 100 articles per ticker")
    print(f"Filters: Minimum {MIN_WORD_COUNT} words, removing junk keywords\n")
    print("Note: Make sure 'gnews' is installed: pip install gnews\n")

    for ticker in tickers:
        company_name = TICKER_NAMES.get(ticker, ticker)
        print(f"\nProcessing {ticker} ({company_name})...")

        news_items = fetch_news_gnews(ticker, company_name, START_DATE, END_DATE)
        count = 0

        for item in news_items:
            # Get published date
            pub_date = item.get("published date", "")

            # `or ""` also covers keys that are present but None, which would
            # otherwise be rendered as the literal text "None".
            title = item.get("title") or ""
            description = item.get("description") or ""

            # Combine title and description
            text = f"{title}. {description}" if description else title

            cleaned_text = scrub_text(text)

            # Apply quality filters
            if is_quality_content(cleaned_text):
                final_data.append({
                    "date": pub_date,
                    "tweet": cleaned_text,
                    "stock name": ticker,
                    "company name": company_name
                })
                count += 1

        print(f"  -> Total saved: {count} records")

    # Export to CSV
    if final_data:
        news_df = pd.DataFrame(final_data)
        news_df = news_df[["date", "tweet", "stock name", "company name"]]

        # Remove duplicates based on tweet content
        news_df = news_df.drop_duplicates(subset=["tweet"])

        # Sort chronologically. The published date is a string, so sorting it
        # directly orders rows alphabetically rather than by time; sort on the
        # parsed timestamp instead (unparseable dates go last).
        news_df = news_df.sort_values(
            "date",
            key=lambda col: pd.to_datetime(col, errors="coerce", utc=True),
            kind="stable",
        )

        news_df.to_csv(CSV_FILE, index=False)
        print(f"\nSUCCESS: {len(news_df)} unique records saved to {CSV_FILE}")
        print(f"\nBreakdown by ticker:")
        ticker_counts = news_df["stock name"].value_counts().sort_index()
        for ticker, count in ticker_counts.items():
            print(f"  {ticker}: {count} articles")
        print(f"\nTotal unique articles: {len(news_df)}")
    else:
        print("\n No data found. Make sure 'gnews' is installed: pip install gnews")

if __name__ == "__main__":
    main()

In [None]:
# Scraped news articles; the text to classify is in the 'tweet' column
df_test_financial_news_with_emotions = pd.read_csv("/content/cleaned_stock_news_2022_2026.csv")
# Loading the emtract model for consumption in data pipeline
# NOTE(review): this repeats the device detection and model loading of the
# earlier "simplified pipeline" cell and rebinds `device` and `emotion_classifier`.
print("=" * 70)
print("EMOTION CLASSIFICATION - SIMPLIFIED PIPELINE")
print("=" * 70)

# Device index passed to `pipeline(...)`: 0 = first CUDA GPU, -1 = CPU
device = 0 if torch.cuda.is_available() else -1

if device == 0:
    gpu_name = torch.cuda.get_device_name(0)
    print(f" GPU DETECTED: {gpu_name}")
    torch.cuda.empty_cache()
else:
    print(" Using CPU")

# Half precision (float16) is requested only when running on the GPU
emotion_classifier = pipeline('text-classification',
                             model='vamossyd/emtract-distilbert-base-uncased-emotion',
                             device=device,
                             model_kwargs={'torch_dtype': torch.float16} if device == 0 else {})

def batch_classify_emotions(tweets, classifier, batch_size=128):
    """Label every text with its highest-scoring emotion, working in batches.

    Args:
        tweets: Texts to classify. A pandas Series (or anything exposing
            ``.tolist()``) or any other iterable. Entries that are not ``str``
            (including missing values) are never sent to the classifier.
        classifier: Callable invoked as ``classifier(batch, top_k=None)`` with a
            list of strings. It must return one result per input: either a
            list of ``{'label': ..., 'score': ...}`` dicts or a single dict
            containing ``'label'``.
        batch_size: Number of texts handed to ``classifier`` per call (>= 1).

    Returns:
        list[str]: One entry per input, in input order: the winning label,
        ``'UNKNOWN'`` for inputs that were skipped, or ``'ERROR'`` when the
        classifier raised for that batch or returned an unrecognised result.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Accept a Series/ndarray as well as plain lists and other iterables
    tweets_list = tweets.tolist() if hasattr(tweets, 'tolist') else list(tweets)

    # Keep only real strings (this also drops None/NaN) and remember their positions
    valid_indices = [idx for idx, tweet in enumerate(tweets_list) if isinstance(tweet, str)]
    valid_tweets = [tweets_list[idx] for idx in valid_indices]

    # Every position starts as 'UNKNOWN'; classified positions are overwritten below
    results = ['UNKNOWN'] * len(tweets_list)

    # Decide on GPU cache cleanup here instead of reading a notebook-level
    # `device` global, so the function also works on its own.
    use_cuda = torch.cuda.is_available()

    # NOTE(review): a Hugging Face pipeline processes a list input one item at a
    # time unless it is also given `batch_size=`, and does not truncate
    # over-long texts unless `truncation=True` is passed. Consider forwarding
    # both — confirm against the installed transformers version.
    print(f"\nProcessing {len(valid_tweets)} tweets in batches of {batch_size}...")
    for i in tqdm(range(0, len(valid_tweets), batch_size), desc="Classifying emotions"):
        batch = valid_tweets[i:i+batch_size]
        batch_indices = valid_indices[i:i+batch_size]
        try:
            with torch.no_grad():
                batch_results = classifier(batch, top_k=None)

            for target_idx, result in zip(batch_indices, batch_results):
                if isinstance(result, list) and len(result) > 0:
                    # Select by score rather than trusting the classifier's ordering;
                    # ties (or missing scores) fall back to the first entry.
                    best = max(result, key=lambda item: item.get('score', float('-inf')))
                    top_emotion = best['label']
                elif isinstance(result, dict) and 'label' in result:
                    top_emotion = result['label']
                else:
                    top_emotion = 'ERROR'

                results[target_idx] = top_emotion
        except Exception as e:
            # Best effort: one failing batch must not abort the whole run
            print(f"⚠ Error in batch {i}: {str(e)}")
            for target_idx in batch_indices:
                results[target_idx] = 'ERROR'

        # GPU memory cleanup every 10th batch
        if use_cuda and (i // batch_size) % 10 == 0:
            torch.cuda.empty_cache()

    return results

# Label every row of the news frame with its top emotion and time the run
print(f"\nDataset shape: {df_test_financial_news_with_emotions.shape}")
print(f"Tweet column: 'tweet'")

start_time = time.time()
df_test_financial_news_with_emotions['emotion'] = batch_classify_emotions(
    df_test_financial_news_with_emotions['tweet'],
    emotion_classifier,
    batch_size=128,
)
classification_time = time.time() - start_time

# Summary: elapsed time, throughput and how often each label was assigned
emotion_counts = df_test_financial_news_with_emotions['emotion'].value_counts()
print("\n" + "=" * 70)
print("CLASSIFICATION SUMMARY")
print("=" * 70)
print(f"\n Classification completed in {classification_time:.2f} seconds")
print(f"  Processing speed: {df_test_financial_news_with_emotions.shape[0]/classification_time:.2f} tweets/second")
print(f"\nEmotion Distribution:")
print(emotion_counts)

# Persist the labelled frame as parquet (path is relative to the working directory)
df_test_financial_news_with_emotions.to_parquet('stock_tweets_with_emotions_test_data.parquet', index=False)
print(f"\n Dataset saved: stock_tweets_with_emotions_test_data.parquet")
print(f"  Rows: {df_test_financial_news_with_emotions.shape[0]}, Columns: {df_test_financial_news_with_emotions.shape[1]}")
print("=" * 70)

# Release cached GPU memory (GPU runs only), then run the garbage collector
if device == 0:
    torch.cuda.empty_cache()
gc.collect()


In [None]:
# Reload the emotion-labelled news frame from parquet.
# NOTE(review): presumably this is the file saved by the classification cell,
# which writes a path relative to the working directory; the two only match
# when the working directory is /content — verify.
df_test_financial_news_with_emotions_cleaned = pd.read_parquet("/content/stock_tweets_with_emotions_test_data.parquet")
# Print the (rows, columns) of the reloaded frame
print(df_test_financial_news_with_emotions_cleaned.shape)

# Format parseable dates as DD/MM/YYYY and leave everything else untouched
def safe_format(x):
    """Return ``x`` rendered as a ``'%d/%m/%Y'`` string, or ``x`` itself on failure.

    Args:
        x: Any value; it is passed to ``pd.to_datetime`` for parsing.

    Returns:
        The formatted date string when ``x`` can be parsed and formatted,
        otherwise ``x`` unchanged (for example unparseable text, ``None``,
        or missing values).
    """
    try:
        # Convert to datetime and then to the DD/MM/YYYY string format
        return pd.to_datetime(x).strftime('%d/%m/%Y')
    except Exception:
        # Deliberate best effort for bad values. `Exception` (not a bare
        # `except`) lets KeyboardInterrupt/SystemExit propagate, so a long
        # `.apply` over the column can still be interrupted.
        return x

# Reformat the 'date' values with safe_format, then title-case every column
# label (e.g. 'stock name' -> 'Stock Name').
df_test_financial_news_with_emotions_cleaned = (
    df_test_financial_news_with_emotions_cleaned
    .assign(date=df_test_financial_news_with_emotions_cleaned['date'].apply(safe_format))
    .rename(columns=str.title)
)
df_test_financial_news_with_emotions_cleaned.head()

In [None]:
# Load the OHLCV table and rewrite 'Date' as DD/MM/YYYY strings, the same
# format safe_format produces for the news frame.
df_ohclv_test = pd.read_csv("/content/ohlcv_data.csv")
df_ohclv_test['Date'] = (
    df_ohclv_test['Date']
    .pipe(pd.to_datetime)
    .dt.strftime('%d/%m/%Y')
)
df_ohclv_test.head()

In [None]:
# Inner-join the emotion-labelled news with the OHLCV rows: only rows whose
# 'Date' and 'Stock Name' values appear in both frames are kept.
merged_test_df = df_test_financial_news_with_emotions_cleaned.merge(
    df_ohclv_test,
    how='inner',
    on=['Date', 'Stock Name'],
)

# Preview the first five merged rows
merged_test_df.head()

In [None]:
# (rows, columns) of the merged news + OHLCV frame
merged_test_df.shape

In [None]:
# Column labels present after the merge
merged_test_df.columns

In [None]:
# Count of missing values per column in the merged frame
merged_test_df.isna().sum()