In [1]:
# Stock Sentiment Analysis - Text Mining Project

## Project Structure:
# 1. Data Import and Exploration
# 2. Data Preprocessing
# 3. Feature Engineering
# 4. Model Training and Evaluation
# 5. Prediction on Test Set
# 6. Conclusions and Future Work

## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from collections import Counter
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import gensim.downloader as api
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import transformers
from transformers import AutoTokenizer, AutoModel

# Set random seeds for reproducibility
import random
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Download necessary NLTK resources.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK
# releases (3.9 and later) make word_tokenize look up 'punkt_tab'; without it
# every tokenizing helper below fails with a LookupError. Older releases that
# do not know the identifier are expected to just report it as unavailable
# and continue -- NOTE(review): confirm on the pinned NLTK version.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Set up matplotlib for better visualizations
plt.style.use('ggplot')
sns.set(style='whitegrid')

## 1. Data Import and Exploration

# Read the labelled training tweets from disk.
train_data = pd.read_csv('Project Data-20250507/train.csv')

# Quick look at size and first rows.
print("Training Data Shape:", train_data.shape)
train_data.head()

# Per-column count of missing entries.
missing_per_column = train_data.isna().sum()
print("\nMissing Values in Training Data:")
print(missing_per_column)

# How many tweets carry each label, ordered by label id.
print("\nLabel Distribution:")
label_counts = train_data['label'].value_counts().sort_index()
print(label_counts)

# Bar chart of the label distribution.
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='label', data=train_data, palette='viridis')
ax.set_title('Label Distribution in Training Data')
ax.set_xlabel('Sentiment Label (0: Bearish, 1: Bullish, 2: Neutral)')
ax.set_ylabel('Count')

# Write the exact count just above each bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{int(bar_height)}',
        (bar_center, bar_height),
        ha='center',
        va='bottom',
        xytext=(0, 5),
        textcoords='offset points',
    )
plt.tight_layout()
plt.show()

# Length of every tweet in characters and in whitespace-separated words.
train_data['text_length'] = train_data['text'].map(len)
train_data['word_count'] = train_data['text'].map(lambda tweet: len(tweet.split()))

# Side-by-side histograms of the two length measures.
histogram_panels = [
    ('text_length', 'Distribution of Text Length (Characters)', 'Number of Characters'),
    ('word_count', 'Distribution of Word Count', 'Number of Words'),
]
plt.figure(figsize=(12, 5))
for panel_number, (column, panel_title, panel_xlabel) in enumerate(histogram_panels, start=1):
    plt.subplot(1, 2, panel_number)
    sns.histplot(train_data[column], kde=True, bins=50)
    plt.title(panel_title)
    plt.xlabel(panel_xlabel)
plt.tight_layout()
plt.show()

# Word-count spread per sentiment label.
plt.figure(figsize=(12, 5))
sns.boxplot(x='label', y='word_count', data=train_data, palette='viridis')
plt.title('Word Count by Sentiment Label')
plt.xlabel('Sentiment Label (0: Bearish, 1: Bullish, 2: Neutral)')
plt.ylabel('Word Count')
plt.tight_layout()
plt.show()

# Function to extract most common words
def get_most_common_words(text_series, n=20):
    """Return the ``n`` most frequent informative words as (word, count) pairs.

    All texts are joined, lower-cased and stripped of punctuation before
    tokenizing. English stopwords and tokens of two characters or fewer are
    not counted.
    """
    corpus = re.sub(r'[^\w\s]', '', ' '.join(text_series).lower())
    tokens = word_tokenize(corpus)
    english_stopwords = set(stopwords.words('english'))

    frequencies = Counter()
    for token in tokens:
        if token in english_stopwords or len(token) <= 2:
            continue
        frequencies[token] += 1
    return frequencies.most_common(n)

# Top words across the whole training set, split into parallel tuples
# of words and their counts for plotting.
most_common_words = get_most_common_words(train_data['text'])
words, counts = zip(*most_common_words)

plt.figure(figsize=(12, 6))
common_words_axes = sns.barplot(x=list(words), y=list(counts), palette='viridis')
common_words_axes.set_title('Most Common Words in Tweets')
common_words_axes.set_xlabel('Words')
common_words_axes.set_ylabel('Count')
# Slanted tick labels keep the longer words from overlapping.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Word clouds for different sentiment labels
def create_wordcloud(text, title):
    """Render a word cloud built from ``text`` in a new figure titled ``title``."""
    cloud_settings = dict(
        width=800,
        height=400,
        background_color='white',
        max_words=100,
        contour_width=3,
        contour_color='steelblue',
    )
    cloud_image = WordCloud(**cloud_settings).generate(text)

    plt.figure(figsize=(12, 6))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')  # axes carry no meaning for a word cloud
    plt.title(title, fontsize=20)
    plt.tight_layout()
    plt.show()

# One word cloud per sentiment class, built from all tweets with that label.
sentiment_by_label = {0: 'Bearish', 1: 'Bullish', 2: 'Neutral'}
for label, sentiment in sentiment_by_label.items():
    tweets_for_label = train_data.loc[train_data['label'] == label, 'text']
    text = ' '.join(tweets_for_label)
    create_wordcloud(text, f'Word Cloud for {sentiment} Tweets')

## 2. Data Preprocessing

# Define preprocessing functions
# Compiled once at import time: clean_text runs once per row of the dataset.
# 'www' must start a word and be followed by a dot, so interjections such as
# "awwww" are no longer mistaken for a URL. 'http' is still matched anywhere
# because links are sometimes glued to the preceding word; it also covers
# 'https', which is why no separate alternative is needed.
_URL_PATTERN = re.compile(r'http\S+|\bwww\.\S+')
_MENTION_PATTERN = re.compile(r'@\w+')
_HASHTAG_PATTERN = re.compile(r'#(\w+)')
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_text(text):
    """Normalize a raw tweet for downstream tokenization.

    Steps, in order:
    1. Convert to lowercase
    2. Remove URLs
    3. Remove user mentions (@user)
    4. Drop the '#' of hashtags but keep the tag text
    5. Remove every character that is neither a word character nor
       whitespace (letters, digits and underscores are kept)
    6. Collapse runs of whitespace and trim both ends

    Args:
        text: The raw text. ``None`` and NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        str: The cleaned text, possibly empty.
    """
    # Missing values would otherwise fail on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    cleaned = str(text).lower()
    cleaned = _URL_PATTERN.sub('', cleaned)
    cleaned = _MENTION_PATTERN.sub('', cleaned)
    cleaned = _HASHTAG_PATTERN.sub(r'\1', cleaned)
    cleaned = _PUNCTUATION_PATTERN.sub('', cleaned)
    return _WHITESPACE_PATTERN.sub(' ', cleaned).strip()

# Lazily-built cache of the default English stopword set. remove_stopwords is
# applied row by row, and re-reading the corpus file for every row is wasted
# work because the list does not change between calls.
_DEFAULT_STOP_WORDS = None


def remove_stopwords(text, stop_words=None):
    """Remove stopwords from text.

    Args:
        text: Text to filter; it is split with ``word_tokenize``.
        stop_words: Optional collection of lower-case words to drop. When
            omitted, NLTK's English stopword list is used (loaded once and
            reused on later calls).

    Returns:
        str: The remaining tokens joined by single spaces. Tokens are
        compared case-insensitively but returned with their original casing.
    """
    global _DEFAULT_STOP_WORDS
    if stop_words is None:
        if _DEFAULT_STOP_WORDS is None:
            _DEFAULT_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _DEFAULT_STOP_WORDS

    return ' '.join(
        token for token in word_tokenize(text) if token.lower() not in stop_words
    )

def stem_text(text):
    """Return ``text`` with each token replaced by its English Snowball stem."""
    snowball = SnowballStemmer('english')
    return ' '.join(map(snowball.stem, word_tokenize(text)))

def lemmatize_text(text):
    """Return ``text`` with each token replaced by its WordNet lemma."""
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(wordnet.lemmatize(token))
    return ' '.join(lemmas)

# Run the preprocessing chain: clean -> drop stopwords -> stem / lemmatize.
# Stemming and lemmatization both start from the stopword-free text.
train_data['cleaned_text'] = train_data['text'].map(clean_text)
train_data['text_no_stopwords'] = train_data['cleaned_text'].map(remove_stopwords)
train_data['stemmed_text'] = train_data['text_no_stopwords'].map(stem_text)
train_data['lemmatized_text'] = train_data['text_no_stopwords'].map(lemmatize_text)

# Print three randomly chosen rows at every stage to eyeball the result.
preview_stages = [
    ('Original', 'text'),
    ('Cleaned', 'cleaned_text'),
    ('No Stopwords', 'text_no_stopwords'),
    ('Stemmed', 'stemmed_text'),
    ('Lemmatized', 'lemmatized_text'),
]
sample_indices = random.sample(range(len(train_data)), 3)
for idx in sample_indices:
    for stage_name, stage_column in preview_stages:
        print(f"{stage_name}: {train_data.loc[idx, stage_column]}")
    print("-" * 100)

# Features are the lemmatized text for now; targets are the sentiment labels.
X = train_data['lemmatized_text']
y = train_data['label']

# 80/20 split, stratified so both parts keep the label proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Sanity-check the sizes and per-label counts of both parts.
train_label_counts = y_train.value_counts().sort_index().tolist()
val_label_counts = y_val.value_counts().sort_index().tolist()
print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Training label distribution: {train_label_counts}")
print(f"Validation label distribution: {val_label_counts}")

## 3. Feature Engineering

# 3.1 Bag of Words (BoW)
# Vocabulary (capped at 5000 features) is learned from the training split
# only; the validation split is merely transformed with it.
bow_vectorizer = CountVectorizer(max_features=5000)
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_val_bow = bow_vectorizer.transform(X_val)

bow_vocabulary_size = len(bow_vectorizer.vocabulary_)
print(f"BoW vocabulary size: {bow_vocabulary_size}")
print(f"BoW feature matrix shape: {X_train_bow.shape}")

# 3.2 TF-IDF
# Same protocol as BoW: fit on the training split, transform validation.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

tfidf_vocabulary_size = len(tfidf_vectorizer.vocabulary_)
print(f"TF-IDF vocabulary size: {tfidf_vocabulary_size}")
print(f"TF-IDF feature matrix shape: {X_train_tfidf.shape}")

# 3.3 Word2Vec
# Train Word2Vec model on our corpus
def get_tokenized_text(text_series):
    """Lower-case and tokenize every text, returning one token list per text."""
    token_lists = []
    for document in text_series:
        token_lists.append(word_tokenize(document.lower()))
    return token_lists

# Fit Word2Vec on the training split only, using the same lemmatized text
# that is later embedded for X_train / X_val. Previously the model was trained
# on train_data['cleaned_text'], i.e. on every row including the validation
# rows and with different preprocessing than the lookup text. Using X_train
# keeps validation text out of the features (as is already done for BoW and
# TF-IDF) and makes the training vocabulary match the tokens that are
# actually looked up.
tokenized_texts = get_tokenized_text(X_train)

# Train the model
w2v_model = Word2Vec(sentences=tokenized_texts,
                    vector_size=100,
                    window=5,
                    min_count=2,
                    workers=4,
                    sg=1)  # sg=1 for skip-gram, sg=0 for CBOW

# Function to create document embeddings from Word2Vec
def get_document_vector(text, model, vector_size=100):
    """Embed ``text`` as the mean of its known word vectors.

    Tokens missing from ``model.wv`` are skipped. If no token is known, a
    zero vector of length ``vector_size`` is returned instead.
    """
    known_vectors = []
    for token in word_tokenize(text.lower()):
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if known_vectors:
        # Element-wise average over all known word vectors.
        return np.mean(known_vectors, axis=0)
    return np.zeros(vector_size)

# Turn every document of both splits into one averaged Word2Vec vector.
def _embed_with_w2v(documents):
    """Stack the Word2Vec document vectors of ``documents`` into one array."""
    return np.array([get_document_vector(doc, w2v_model) for doc in documents])


X_train_w2v = _embed_with_w2v(X_train)
X_val_w2v = _embed_with_w2v(X_val)

print(f"Word2Vec feature matrix shape: {X_train_w2v.shape}")

# 3.4 Pretrained GloVe embeddings
# Load pretrained GloVe embeddings
# Defined outside the try block so the helper exists even when the
# pretrained vectors cannot be loaded.
def get_glove_vector(text, model, vector_size=25):
    """Embed ``text`` as the mean of its known GloVe word vectors.

    Args:
        text: Document to embed; it is lower-cased and tokenized first.
        model: Mapping-like object supporting ``word in model`` and
            ``model[word]``.
        vector_size: Length of the zero vector returned when no token of
            ``text`` is present in ``model``.

    Returns:
        The element-wise average of the known word vectors, or a zero
        vector of length ``vector_size`` when none is known.
    """
    words = word_tokenize(text.lower())
    word_vectors = [model[word] for word in words if word in model]

    if not word_vectors:
        return np.zeros(vector_size)

    # Average word vectors to get document vector
    return np.mean(word_vectors, axis=0)


try:
    glove_model = api.load('glove-twitter-25')  # 25-dimensional embeddings

    # Create document embeddings for train and validation sets
    X_train_glove = np.array([get_glove_vector(text, glove_model) for text in X_train])
    X_val_glove = np.array([get_glove_vector(text, glove_model) for text in X_val])

    print(f"GloVe feature matrix shape: {X_train_glove.shape}")
except Exception as glove_error:
    # Still best-effort, but no longer a bare `except:`: KeyboardInterrupt and
    # SystemExit now propagate, and the real cause is shown to the user.
    print("Could not load GloVe embeddings. Please ensure gensim is installed and internet is available.")
    print(f"Underlying error: {glove_error!r}")
    X_train_glove, X_val_glove = None, None

## 4. Model Training and Evaluation

# 4.1 KNN with TF-IDF
def train_and_evaluate_knn(X_train, X_val, y_train, y_val, feature_type, n_neighbors=5):
    """Fit a k-nearest-neighbours classifier and report validation metrics.

    Prints accuracy, weighted precision/recall/F1 and the classification
    report, then shows a confusion-matrix heatmap.

    Args:
        X_train: Training features passed to ``KNeighborsClassifier.fit``.
        X_val: Validation features to predict on.
        y_train: Training labels.
        y_val: Validation labels the predictions are scored against.
        feature_type: Name of the feature representation; only used in the
            printed header and the plot title.
        n_neighbors: Number of neighbours to use (default 5, the value that
            used to be hard-coded).

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    class_ids = [0, 1, 2]
    class_names = ['Bearish', 'Bullish', 'Neutral']

    # Train KNN model
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predict
    y_pred = knn.predict(X_val)

    # Evaluate
    print(f"KNN with {feature_type} - Evaluation Metrics:")
    print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
    print(f"Weighted Precision: {precision_score(y_val, y_pred, average='weighted'):.4f}")
    print(f"Weighted Recall: {recall_score(y_val, y_pred, average='weighted'):.4f}")
    print(f"Weighted F1-score: {f1_score(y_val, y_pred, average='weighted'):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))

    # Confusion Matrix
    # Passing labels explicitly keeps the matrix 3x3 even when a class is
    # absent from both y_val and y_pred; otherwise the matrix shrinks and the
    # fixed tick labels below would name the wrong rows/columns.
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_val, y_pred, labels=class_ids)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names, yticklabels=class_names)
    plt.title(f'Confusion Matrix - KNN with {feature_type}')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.tight_layout()
    plt.show()

    return knn

# KNN on the TF-IDF representation.
knn_tfidf = train_and_evaluate_knn(
    X_train=X_train_tfidf,
    X_val=X_val_tfidf,
    y_train=y_train,
    y_val=y_val,
    feature_type="TF-IDF",
)

# KNN on the averaged Word2Vec document vectors.
knn_w2v = train_and_evaluate_knn(
    X_train=X_train_w2v,
    X_val=X_val_w2v,
    y_train=y_train,
    y_val=y_val,
    feature_type="Word2Vec",
)

# 4.2 LSTM with Embeddings
def train_and_evaluate_lstm(X_train, X_val, y_train, y_val,
                            max_words=5000, max_seq_length=50,
                            embedding_dim=100, lstm_units=100,
                            epochs=5, batch_size=64):
    """Train an Embedding + LSTM classifier on text and report validation metrics.

    The texts are tokenized with a Keras ``Tokenizer`` fitted on ``X_train``,
    padded to ``max_seq_length`` and fed to a small sequential model. The
    training curves, the usual metrics and a confusion matrix are
    printed/plotted.

    Args:
        X_train: Iterable of training texts.
        X_val: Iterable of validation texts.
        y_train: Training labels in {0, 1, 2}.
        y_val: Validation labels in {0, 1, 2}.
        max_words: Value used both as the tokenizer's ``num_words`` and as
            the embedding's ``input_dim``. These were two separate
            hard-coded 5000s that had to be kept in sync by hand.
        max_seq_length: Length every sequence is padded/truncated to.
        embedding_dim: Size of the learned word embeddings.
        lstm_units: Number of LSTM units.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        tuple: ``(model, tokenizer)``. To score new text, run it through the
        same tokenizer and pad it to the same ``max_seq_length``.
    """
    class_ids = [0, 1, 2]
    class_names = ['Bearish', 'Bullish', 'Neutral']
    num_classes = len(class_ids)

    # Tokenize text (vocabulary is learned from the training split only)
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(X_train)

    # Convert text to sequences
    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_val_seq = tokenizer.texts_to_sequences(X_val)

    # Pad sequences
    X_train_pad = pad_sequences(X_train_seq, maxlen=max_seq_length)
    X_val_pad = pad_sequences(X_val_seq, maxlen=max_seq_length)

    # One-hot encode labels
    y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
    y_val_cat = tf.keras.utils.to_categorical(y_val, num_classes=num_classes)

    # Build LSTM model
    model = Sequential()
    model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_seq_length))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(num_classes, activation='softmax'))

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train model
    history = model.fit(
        X_train_pad, y_train_cat,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val_pad, y_val_cat),
        verbose=1
    )

    # Plot training history
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('Model Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')

    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.tight_layout()
    plt.show()

    # Evaluate model: the class with the highest predicted probability wins
    y_pred_prob = model.predict(X_val_pad)
    y_pred = np.argmax(y_pred_prob, axis=1)

    print("LSTM Model - Evaluation Metrics:")
    print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
    print(f"Weighted Precision: {precision_score(y_val, y_pred, average='weighted'):.4f}")
    print(f"Weighted Recall: {recall_score(y_val, y_pred, average='weighted'):.4f}")
    print(f"Weighted F1-score: {f1_score(y_val, y_pred, average='weighted'):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))

    # Confusion Matrix
    # Explicit labels keep the matrix 3x3 so it always lines up with the
    # three fixed tick labels, even if the model never predicts some class.
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_val, y_pred, labels=class_ids)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix - LSTM Model')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.tight_layout()
    plt.show()

    return model, tokenizer

# LSTM experiment: best-effort, any failure leaves both names set to None.
lstm_model, tokenizer = None, None
try:
    lstm_model, tokenizer = train_and_evaluate_lstm(X_train, X_val, y_train, y_val)
except Exception as lstm_error:
    print(f"Error training LSTM model: {lstm_error}")

# 4.3 Transformer-based approach (DistilBERT)
def train_and_evaluate_transformer(X_train, X_val, y_train, y_val,
                                   train_sample_size=500, val_sample_size=100,
                                   max_length=128, epochs=2, batch_size=16):
    """Fine-tune DistilBERT on a random subsample and report validation metrics.

    Only a subsample of both splits is used because full fine-tuning needs
    more resources than a demonstration run should assume. Any exception
    raised along the way is reported and turned into a ``(None, None)``
    result.

    Args:
        X_train: pandas Series of training texts.
        X_val: pandas Series of validation texts.
        y_train: pandas Series of training labels in {0, 1, 2}.
        y_val: pandas Series of validation labels in {0, 1, 2}.
        train_sample_size: Maximum number of training rows to sample.
        val_sample_size: Maximum number of validation rows to sample.
        max_length: Token length every text is padded/truncated to.
        epochs: Number of fine-tuning epochs.
        batch_size: Batch size of the training and validation datasets.

    Returns:
        tuple: ``(model, tokenizer)`` on success, ``(None, None)`` on failure.
    """
    try:
        # Initialize tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

        # Tokenize data
        def tokenize_data(texts):
            return tokenizer(
                texts.tolist(),
                padding='max_length',
                truncation=True,
                max_length=max_length,
                return_tensors='tf'
            )

        # Sample subset for demonstration (full training would require more resources).
        # The sizes are clamped to the data actually available: sampling
        # without replacement raises ValueError when more rows are requested
        # than exist, which used to make the whole experiment fail on small
        # datasets.
        train_size = min(train_sample_size, len(X_train))
        indices = np.random.choice(len(X_train), size=train_size, replace=False)
        X_train_sample = X_train.iloc[indices]
        y_train_sample = y_train.iloc[indices]

        val_size = min(val_sample_size, len(X_val))
        val_indices = np.random.choice(len(X_val), size=val_size, replace=False)
        X_val_sample = X_val.iloc[val_indices]
        y_val_sample = y_val.iloc[val_indices]

        # Tokenize data
        train_encodings = tokenize_data(X_train_sample)
        val_encodings = tokenize_data(X_val_sample)

        # Convert labels to categorical
        y_train_cat = tf.keras.utils.to_categorical(y_train_sample, num_classes=3)
        y_val_cat = tf.keras.utils.to_categorical(y_val_sample, num_classes=3)

        # Create TensorFlow datasets
        train_dataset = tf.data.Dataset.from_tensor_slices((
            dict(train_encodings),
            y_train_cat
        )).batch(batch_size)

        val_dataset = tf.data.Dataset.from_tensor_slices((
            dict(val_encodings),
            y_val_cat
        )).batch(batch_size)

        # Load pretrained model (imported here so a missing TF build of
        # transformers is handled by the except below)
        from transformers import TFAutoModelForSequenceClassification

        model = TFAutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased",
            num_labels=3
        )

        # Compile the model; the loss is configured for raw logits
        optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
        loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        metrics = [tf.keras.metrics.CategoricalAccuracy('accuracy')]

        model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

        # Train the model
        model.fit(
            train_dataset,
            validation_data=val_dataset,
            epochs=epochs  # More epochs would be better but this is for demonstration
        )

        # Make predictions
        val_logits = model.predict(dict(val_encodings)).logits
        val_predictions = tf.argmax(val_logits, axis=1).numpy()

        # Evaluate
        print("Transformer Model - Evaluation Metrics:")
        print(f"Accuracy: {accuracy_score(y_val_sample, val_predictions):.4f}")
        print(f"Weighted Precision: {precision_score(y_val_sample, val_predictions, average='weighted'):.4f}")
        print(f"Weighted Recall: {recall_score(y_val_sample, val_predictions, average='weighted'):.4f}")
        print(f"Weighted F1-score: {f1_score(y_val_sample, val_predictions, average='weighted'):.4f}")
        print("\nClassification Report:")
        print(classification_report(y_val_sample, val_predictions))

        return model, tokenizer

    except Exception as e:
        print(f"Error training transformer model: {e}")
        print("The transformer approach may require additional setup or GPU resources.")
        return None, None

# Transformer experiment (may not work in all environments): best-effort,
# any failure leaves both names set to None.
transformer_model, transformer_tokenizer = None, None
try:
    transformer_model, transformer_tokenizer = train_and_evaluate_transformer(
        X_train, X_val, y_train, y_val
    )
except Exception as transformer_error:
    print(f"Could not train transformer model: {transformer_error}")

## 5. Prediction on Test Set

# Read the unlabelled test tweets.
# NOTE(review): the training file is read from 'Project Data-20250507/', but
# this path points at the working directory -- confirm which one is intended.
test_data = pd.read_csv('test.csv')
print("Test Data Shape:", test_data.shape)
test_data.head()

# Same preprocessing chain as for the training data:
# clean -> drop stopwords -> stem / lemmatize.
test_data['cleaned_text'] = test_data['text'].map(clean_text)
test_data['text_no_stopwords'] = test_data['cleaned_text'].map(remove_stopwords)
test_data['stemmed_text'] = test_data['text_no_stopwords'].map(stem_text)
test_data['lemmatized_text'] = test_data['text_no_stopwords'].map(lemmatize_text)

# Final predictions come from the KNN model on TF-IDF features (chosen for
# this example). The already-fitted vectorizer is reused so the test
# features live in the same space as the training features.
X_test_tfidf = tfidf_vectorizer.transform(test_data['lemmatized_text'])
test_predictions = knn_tfidf.predict(X_test_tfidf)

# Submission table: one predicted label per test id.
submission = test_data[['id']].assign(label=test_predictions)

# Write the predictions without the DataFrame index.
submission_path = 'pred_XX.csv'
submission.to_csv(submission_path, index=False)
print(f"Predictions saved to {submission_path}")

## 6. Conclusions and Future Work

"""
Model Comparison:
- TF-IDF with KNN performed [describe performance]
- Word2Vec with KNN performed [describe performance]
- LSTM model performed [describe performance]
- Transformer model performed [describe performance if available]

Best Model: [Identify which model performed best]

Future Improvements:
1. Fine-tune hyperparameters using Grid Search or Random Search
2. Experiment with different preprocessing techniques
3. Try other transformer models like BERT or RoBERTa
4. Implement ensemble methods combining multiple models
5. Address class imbalance issues (if present)
6. Augment training data
7. Feature engineer additional features from the tweets (e.g., sentiment scores, entity recognition)
8. Experiment with more advanced deep learning architectures
"""

# Grouped bar chart comparing the experiments. The scores are placeholders
# and must be filled in by hand after running the models.
models = ['KNN (TF-IDF)', 'KNN (Word2Vec)', 'LSTM', 'Transformer']
accuracies = [0.0, 0.0, 0.0, 0.0]  # Replace with actual values after running
f1_scores = [0.0, 0.0, 0.0, 0.0]   # Replace with actual values after running

plt.figure(figsize=(12, 6))
x = np.arange(len(models))
width = 0.35

# Each metric gets its own bar, shifted half a bar width left or right of
# the model's tick position.
metric_series = [
    (-width / 2, accuracies, 'Accuracy'),
    (width / 2, f1_scores, 'F1-Score'),
]
for shift, scores, metric_name in metric_series:
    plt.bar(x + shift, scores, width, label=metric_name)

plt.xlabel('Models')
plt.ylabel('Score')
plt.title('Model Performance Comparison')
plt.xticks(x, models)
plt.legend()
plt.ylim(0, 1.0)

# Print each score just above its bar.
for shift, scores, _ in metric_series:
    for position, score in enumerate(scores):
        plt.text(position + shift, score + 0.01, f'{score:.2f}', ha='center')

plt.tight_layout()
plt.show()

ModuleNotFoundError: No module named 'nltk'