In [1]:
import numpy as np
import pandas as pd
import sklearn
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split

In [2]:
"""pip install -U scikit-learn"""

'pip install -U scikit-learn'

In [2]:
# Load the train and test data
def load_data(file_path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Leading and trailing whitespace (including the line terminator) is stripped
    from every line. Blank lines are kept as empty strings, so the position of
    each entry matches the line position in the file.
    """
    stripped_lines = []
    with open(file_path, mode='r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

In [3]:
# Load tweets and labels
train_texts = load_data('train_text.txt')
train_labels = np.loadtxt('train_labels.txt', dtype=int)
val_texts = load_data('val_text.txt')
val_labels = np.loadtxt('val_labels.txt', dtype=int)
test_texts = load_data('test_text.txt')

In [4]:
# Text preprocessing and vectorization (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)
X_test = vectorizer.transform(test_texts)

In [5]:
# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, train_labels)

In [6]:
# Predict on validation set
val_preds = model.predict(X_val)

# Compute and print the classification report.
# NOTE(review): target_names is matched to the label values by position, so this assumes
# 0=Anger, 1=Joy, 2=Sadness, 3=Optimism — TODO confirm against the dataset's label mapping;
# if classes 2 and 3 are the other way round, those two report rows are mislabelled.
print("Validation Set Performance:\n", classification_report(val_labels, val_preds, target_names=['Anger', 'Joy', 'Sadness', 'Optimism']))

Validation Set Performance:
               precision    recall  f1-score   support

       Anger       0.61      0.88      0.72       160
         Joy       0.75      0.46      0.57        97
     Sadness       0.62      0.18      0.28        28
    Optimism       0.64      0.55      0.59        89

    accuracy                           0.64       374
   macro avg       0.66      0.52      0.54       374
weighted avg       0.66      0.64      0.62       374



In [7]:
# Compute macro-averaged F1 score for validation set
macro_f1 = f1_score(val_labels, val_preds, average='macro')
print(f"Macro-Averaged F1 Score: {macro_f1:.4f}")

Macro-Averaged F1 Score: 0.5407


In [8]:
# Predict labels for test set
test_preds = model.predict(X_test)

In [9]:
# Save test set predictions to a file
np.savetxt('test_preds.txt', test_preds, fmt='%d')

## Text Preprocessing

In [15]:
#%pip install spacy

Collecting spacy
  Obtaining dependency information for spacy from https://files.pythonhosted.org/packages/03/56/dce58155b3bce42f987dbf6cc23e820e037bc02abc99ade6ae3ad8d619a9/spacy-3.8.2-cp311-cp311-win_amd64.whl.metadata
  Downloading spacy-3.8.2-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Obtaining dependency information for spacy-legacy<3.1.0,>=3.0.11 from https://files.pythonhosted.org/packages/c3/55/12e842c70ff8828e34e543a2c7176dac4da006ca6901c9e8b43efab8bc6b/spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Obtaining dependency information for spacy-loggers<2.0.0,>=1.0.0 from https://files.pythonhosted.org/packages/33/78/d1a1a026ef3af911159398c939b1509d5c36fe524c7b644f34a5146c4e16/spacy_loggers-1.0.5-py3-none-any.whl.metadata
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collectin

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tables 3.8.0 requires blosc2~=2.0.0, which is not installed.
tables 3.8.0 requires cython>=0.29.21, which is not installed.
transformers 2.1.1 requires sentencepiece, which is not installed.
numba 0.57.0 requires numpy<1.25,>=1.21, but you have numpy 2.0.2 which is incompatible.
pandas 2.2.0 requires numpy<2,>=1.23.2; python_version == "3.11", but you have numpy 2.0.2 which is incompatible.
scikit-learn 1.4.1.post1 requires numpy<2.0,>=1.19.5, but you have numpy 2.0.2 which is incompatible.
scipy 1.10.1 requires numpy<1.27.0,>=1.19.5, but you have numpy 2.0.2 which is incompatible.


In [17]:
#pip install blis

Note: you may need to restart the kernel to use updated packages.


In [10]:
import nltk
import emoji
import spacy
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [12]:
#pip install nltk emoji spacy

In [11]:
# Download necessary NLTK data
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aser\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Download the spaCy model
#!python -m spacy download en_core_web_sm

In [None]:
#pip install nltk snowballstemmer

In [12]:
# Initialize necessary components
nlp = spacy.load('en_core_web_sm')
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english'))

In [15]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aser\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
import nltk
import emoji
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import spacy

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load necessary resources
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
nlp = spacy.load('en_core_web_sm')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aser\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Aser\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aser\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [61]:
def preprocess_text(text):
    """Normalize one raw text into a space-separated string of lemmas.

    Steps, in order:
      1. Emojis are replaced by their textual names via ``emoji.demojize``.
      2. The text is tokenized with ``nltk.word_tokenize``; only purely
         alphabetic tokens whose lowercase form is not in ``stop_words`` are kept.
      3. Each surviving token is lemmatized on its own with the spaCy pipeline ``nlp``.

    Relies on the notebook-level objects ``stop_words`` and ``nlp``.

    Args:
        text: The raw input string.

    Returns:
        The lemmas joined by single spaces (an empty string when no token
        survives the filter).
    """
    # Convert emojis to text.
    # NOTE(review): any emoji name that contains an underscore does not satisfy
    # isalpha(), so multi-word emoji names are presumably discarded by the filter
    # below — verify, and split the names into words first if emoji signal is wanted.
    text = emoji.demojize(text)

    # Tokenization and stop words removal
    tokens = [
        token
        for token in nltk.word_tokenize(text)
        if token.isalpha() and token.lower() not in stop_words
    ]

    # Lemmatization. nlp.pipe streams the tokens through the pipeline in batches
    # instead of paying the per-call overhead of nlp(token) for every word; each
    # token is still its own Doc, so it is lemmatized in isolation as before.
    lemmas = [doc[0].lemma_ for doc in nlp.pipe(tokens)]

    return ' '.join(lemmas)

In [62]:
# Preprocess the text data.
# NOTE: each list is overwritten with its preprocessed version, so re-running this cell
# feeds already-preprocessed text through preprocess_text again; re-run the loading cell
# first to start from the raw text.
train_texts = [preprocess_text(text) for text in train_texts]
val_texts = [preprocess_text(text) for text in val_texts]
test_texts = [preprocess_text(text) for text in test_texts]

In [63]:
train_texts[19]

'user user user tamra would f swung piec needstobeadmit bulli'

In [64]:
# Text vectorization (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)
X_test = vectorizer.transform(test_texts)

In [65]:
# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, train_labels)

In [66]:
# Predict on validation set
val_preds = model.predict(X_val)

In [67]:
# Compute and print the classification report
print("Validation Set Performance:\n", classification_report(val_labels, val_preds, target_names=['Anger', 'Joy', 'Sadness', 'Optimism']))

Validation Set Performance:
               precision    recall  f1-score   support

       Anger       0.65      0.91      0.76       160
         Joy       0.76      0.49      0.60        97
     Sadness       0.50      0.14      0.22        28
    Optimism       0.68      0.60      0.63        89

    accuracy                           0.67       374
   macro avg       0.65      0.54      0.55       374
weighted avg       0.67      0.67      0.65       374



In [68]:
# Compute macro-averaged F1 score for validation set
macro_f1 = f1_score(val_labels, val_preds, average='macro')
print(f"Macro-Averaged F1 Score: {macro_f1:.4f}")

Macro-Averaged F1 Score: 0.5538


In [69]:
# Predict labels for test set
test_preds = model.predict(X_test)

In [70]:
# Save the predictions to CSV
predictions_df = pd.DataFrame(test_preds, columns=['Labels'])
predictions_df.to_csv('test_predictions.csv', index=False)

### Adding emotion specific features

In [81]:
# Load the NRC Emotion Lexicon
def load_nrc_lexicon(lexicon_path, emotions=('anger', 'joy', 'optimism', 'sadness')):
    """Load a word-level emotion lexicon and keep only the requested emotions.

    The file is read as headerless, tab-separated text with exactly three
    columns, which are named ``word``, ``emotion`` and ``association``.

    Args:
        lexicon_path: Path to the tab-separated lexicon file.
        emotions: Emotion name(s) to keep, as a string or an iterable of strings.
            The default is the set that used to be hard-coded. Names that never
            occur in the file's ``emotion`` column simply match no rows and
            therefore yield no feature downstream.
            NOTE(review): 'optimism' does not appear to exist in the lexicon used
            in this notebook (the extracted feature table only has anger / joy /
            sadness columns) — pass whichever lexicon categories should stand in
            for it, if any.

    Returns:
        A DataFrame with columns ``word``, ``emotion`` and integer
        ``association``, restricted to the requested emotions. The row index of
        the full file is preserved.

    Raises:
        ValueError: If the file does not have exactly three columns or an
            association value cannot be converted to int.
    """
    # keep_default_na=False: without it pandas parses words such as "null" or
    # "nan" as missing values, silently dropping them from the vocabulary.
    lexicon = pd.read_csv(lexicon_path, sep='\t', header=None, keep_default_na=False)
    lexicon.columns = ['word', 'emotion', 'association']
    # Convert 'association' to numeric, assuming '1' for presence and '0' for absence
    lexicon['association'] = lexicon['association'].astype(int)

    # A bare string would otherwise be rejected by isin(); treat it as one emotion.
    if isinstance(emotions, str):
        emotions = [emotions]
    return lexicon[lexicon['emotion'].isin(list(emotions))]

In [84]:
# Emotion-specific feature extraction
def extract_emotion_features(text, lexicon):
    """Score one text against every emotion present in the lexicon.

    The text is lowercased and tokenized with ``nltk.word_tokenize``; the tokens
    are collapsed into a set, so a word repeated in the text counts once.

    Args:
        text: The input string.
        lexicon: DataFrame with ``word``, ``emotion`` and ``association`` columns.

    Returns:
        A ``pd.Series`` indexed by emotion (in order of first appearance in the
        lexicon). Each value is the sum of ``association`` over the lexicon rows
        of that emotion whose word occurs in the text.
    """
    tokens = set(nltk.word_tokenize(text.lower()))
    # Which lexicon rows mention a word from this text does not depend on the
    # emotion, so build that mask once instead of once per emotion.
    word_in_text = lexicon['word'].isin(tokens)
    features = {}
    for emotion in lexicon['emotion'].unique():
        # Count presence of emotion words in the text
        matches = lexicon.loc[word_in_text & (lexicon['emotion'] == emotion), 'association']
        features[emotion] = sum(matches)
    return pd.Series(features)

In [82]:
# Load emotion lexicon (update the path to your lexicon file)
nrc_lexicon = load_nrc_lexicon('NRC-Emotion-Lexicon-Wordlevel.txt')

In [83]:
nrc_lexicon.head()

Unnamed: 0,word,emotion,association
0,aback,anger,0
4,aback,joy,0
7,aback,sadness,0
10,abacus,anger,0
14,abacus,joy,0


In [85]:
# Extract emotion features
train_emotion_features = pd.DataFrame([extract_emotion_features(text, nrc_lexicon) for text in train_texts])
val_emotion_features = pd.DataFrame([extract_emotion_features(text, nrc_lexicon) for text in val_texts])
test_emotion_features = pd.DataFrame([extract_emotion_features(text, nrc_lexicon) for text in test_texts])

In [86]:
train_emotion_features.head()

Unnamed: 0,anger,joy,sadness
0,0,0,1
1,0,0,0
2,0,0,0
3,0,0,0
4,1,0,1


In [87]:
# Text vectorization (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_texts)
X_val_tfidf = vectorizer.transform(val_texts)
X_test_tfidf = vectorizer.transform(test_texts)

In [88]:
# Combine TF-IDF features and emotion-specific features.
# The TF-IDF matrices are sparse; stacking them sparsely avoids allocating a dense
# n_samples x n_features array per split just to append a few lexicon columns.
X_train = sparse.hstack([X_train_tfidf, sparse.csr_matrix(train_emotion_features.to_numpy())], format='csr')
X_val = sparse.hstack([X_val_tfidf, sparse.csr_matrix(val_emotion_features.to_numpy())], format='csr')
X_test = sparse.hstack([X_test_tfidf, sparse.csr_matrix(test_emotion_features.to_numpy())], format='csr')

In [89]:
# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, train_labels)

In [90]:
# Predict on validation set
val_preds = model.predict(X_val)

# Compute and print the classification report
print("Validation Set Performance:\n", classification_report(val_labels, val_preds, target_names=['Anger', 'Joy', 'Sadness', 'Optimism']))

Validation Set Performance:
               precision    recall  f1-score   support

       Anger       0.66      0.89      0.76       160
         Joy       0.73      0.54      0.62        97
     Sadness       0.43      0.11      0.17        28
    Optimism       0.64      0.58      0.61        89

    accuracy                           0.67       374
   macro avg       0.62      0.53      0.54       374
weighted avg       0.66      0.67      0.64       374



In [91]:
# Compute macro-averaged F1 score for validation set
macro_f1 = f1_score(val_labels, val_preds, average='macro')
print(f"Macro-Averaged F1 Score: {macro_f1:.4f}")

Macro-Averaged F1 Score: 0.5399


In [92]:
# Predict labels for test set
test_preds = model.predict(X_test)

# Save the predictions to CSV
predictions_df = pd.DataFrame(test_preds, columns=['Predictions'])
predictions_df.to_csv('test_predictions_emo.csv', index=False)

### Data augmentation for sadness

In [97]:
from nltk.corpus import wordnet
import random
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aser\AppData\Roaming\nltk_data...


True

In [164]:
# Data augmentation for a single class via WordNet synonym replacement
def augment_data(texts, labels, emotion_to_augment='sadness', augmentation_count=50,
                 target_label=2, seed=None):
    """Create synthetic examples of one class by WordNet synonym substitution.

    ``augmentation_count`` times, a random text of class ``target_label`` is
    picked and tokenized with ``nltk.word_tokenize``. Every token that has at
    least one WordNet synset is replaced by a lemma name drawn at random from all
    of that token's synsets; tokens unknown to WordNet are kept unchanged.

    Args:
        texts: Sequence of raw text strings.
        labels: Sequence of integer class labels aligned with ``texts``.
        emotion_to_augment: Descriptive name of the class being augmented. Kept
            for backward compatibility only; examples are selected by
            ``target_label``, not by this name.
        augmentation_count: Number of synthetic texts to generate.
        target_label: Integer label of the class to augment, also assigned to
            every generated text. Defaults to 2, the value that used to be
            hard-coded. NOTE(review): confirm which emotion label 2 denotes in
            the dataset's label mapping.
        seed: Optional seed for reproducible sampling. When None, the global
            ``random`` module state is used, as before.

    Returns:
        ``(augmented_texts, augmented_labels)``: two lists of length
        ``augmentation_count``. Both are empty when no text carries
        ``target_label``.
    """
    rng = random if seed is None else random.Random(seed)
    source_examples = [text for text, label in zip(texts, labels) if label == target_label]

    augmented_texts = []
    augmented_labels = []
    # Nothing to sample from: random.choice would raise IndexError on an empty list.
    if not source_examples:
        return augmented_texts, augmented_labels

    for _ in range(augmentation_count):
        original_text = rng.choice(source_examples)
        new_words = []
        for word in nltk.word_tokenize(original_text):
            # Keep the synsets in the order WordNet returns them (no set()), so
            # that a fixed seed always selects the same replacement.
            candidates = [
                lemma.name()
                for synset in wordnet.synsets(word)
                for lemma in synset.lemmas()
            ]
            if candidates:
                # WordNet spells multi-word lemmas with underscores ("give_up");
                # turn them back into spaces so they read as ordinary text.
                new_words.append(rng.choice(candidates).replace('_', ' '))
            else:
                new_words.append(word)

        augmented_texts.append(' '.join(new_words))
        augmented_labels.append(target_label)

    return augmented_texts, augmented_labels


In [186]:
# Load tweets and labels
train_texts = load_data('train_text.txt')
train_labels = np.loadtxt('train_labels.txt', dtype=int)
val_texts = load_data('val_text.txt')
val_labels = np.loadtxt('val_labels.txt', dtype=int)
test_texts = load_data('test_text.txt')


In [187]:
# Perform augmentation.
# NOTE: train_texts is extended in place and train_labels is re-bound to a longer array,
# so running this cell again without re-running the loading cell appends a further batch.
augmented_texts, augmented_labels = augment_data(train_texts, train_labels)
train_texts.extend(augmented_texts)
train_labels = np.concatenate([train_labels, augmented_labels])

In [188]:
# Preprocess the text data
train_texts_proc = [preprocess_text(text) for text in train_texts]
val_texts_proc = [preprocess_text(text) for text in val_texts]
test_texts_proc = [preprocess_text(text) for text in test_texts]

In [189]:
# Text vectorization (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts_proc)
X_val = vectorizer.transform(val_texts_proc)
X_test = vectorizer.transform(test_texts_proc)

In [140]:
param_distributions = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

In [141]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [169]:
# Hyper-parameter search over `param_distributions`, scored by macro-F1.
# The 'liblinear' and 'saga' solvers shuffle the data internally, so the estimator is
# seeded too; seeding only the search would leave the individual fits non-deterministic.
random_search = RandomizedSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_distributions,
    n_iter=10,  # Number of parameter settings to sample
    cv=5,
    scoring='f1_macro',
    random_state=42
)
random_search.fit(X_train, train_labels)

best_model = random_search.best_estimator_
print(f"Best parameters: {random_search.best_params_}")

Best parameters: {'solver': 'liblinear', 'C': 10}


In [170]:

# Predict on the validation set, then compute and print the classification report
val_preds = best_model.predict(X_val)
print("Validation Set Performance:\n", classification_report(val_labels, val_preds, target_names=['Anger', 'Joy', 'Sadness', 'Optimism']))


Validation Set Performance:
               precision    recall  f1-score   support

       Anger       0.72      0.86      0.78       160
         Joy       0.75      0.57      0.65        97
     Sadness       0.32      0.21      0.26        28
    Optimism       0.61      0.61      0.61        89

    accuracy                           0.68       374
   macro avg       0.60      0.56      0.57       374
weighted avg       0.67      0.68      0.67       374



In [171]:
# Compute macro-averaged F1 score for validation set
macro_f1 = f1_score(val_labels, val_preds, average='macro')
print(f"Macro-Averaged F1 Score: {macro_f1:.4f}")

Macro-Averaged F1 Score: 0.5727


In [172]:
# Predict labels for test set
test_preds = best_model.predict(X_test)

# Save the predictions to CSV
predictions_df = pd.DataFrame(test_preds, columns=['Predictions'])
predictions_df.to_csv('test_predictions_aug_grid.csv', index=False)

### Class distribution analysis

In [173]:
# Load tweets and labels.
# NOTE(review): this re-binds train_texts / train_labels to the raw files, discarding the
# augmented versions created earlier, while X_train is not rebuilt here. Anything later that
# pairs X_train with train_labels needs the augmentation and vectorization cells re-run first.
train_texts = load_data('train_text.txt')
train_labels = np.loadtxt('train_labels.txt', dtype=int)
val_texts = load_data('val_text.txt')
val_labels = np.loadtxt('val_labels.txt', dtype=int)
test_texts = load_data('test_text.txt')

In [176]:
unique, counts = np.unique(train_labels, return_counts=True)
counts

array([1400,  708,  294,  855])

### Adjusting class weights

In [177]:
from collections import Counter

In [191]:
# Calculate class weights manually
class_counts = Counter(train_labels)
total_samples = len(train_labels)
class_weights = {cls: total_samples / (len(class_counts) * count) for cls, count in class_counts.items()}


In [192]:
class_weights

{np.int64(2): 2.403343023255814,
 np.int64(0): 0.5905357142857143,
 np.int64(1): 1.1677259887005649,
 np.int64(3): 0.9669590643274854}

In [193]:
# Assign sample weights for each instance
sample_weights = np.array([class_weights[label] for label in train_labels])


In [194]:
sample_weights

array([2.40334302, 0.59053571, 1.16772599, ..., 2.40334302, 2.40334302,
       2.40334302])

In [182]:
# Install xgboost package

%pip install xgboost
from xgboost import XGBClassifier

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
    --------------------------------------- 2.1/124.9 MB 16.7 MB/s eta 0:00:08
   - -------------------------------------- 5.8/124.9 MB 14.7 MB/s eta 0:00:09
   -- ------------------------------------- 8.4/124.9 MB 14.1 MB/s eta 0:00:09
   --- ------------------------------------ 11.3/124.9 MB 13.5 MB/s eta 0:00:09
   ---- ----------------------------------- 14.2/124.9 MB 13.7 MB/s eta 0:00:09
   ----- ---------------------------------- 17.0/124.9 MB 13.6 MB/s eta 0:00:08
   ------ --------------------------------- 19.9/124.9 MB 13.7 MB/s eta 0:00:08
   ------- -------------------------------- 22.5/124.9 MB 13.6 MB/s eta 0:00:08
   -------- ------------------------------- 25.4/124.9 MB 13.6 MB/s eta 0:00:08
   -------- ------------------------------- 28.0/124.9 MB 13.6 M

In [195]:
# Train an XGBoost classifier with the sample weights

# The feature matrix, labels and weights are built in separate cells; check that they
# still describe the same rows before fitting, so a stale variable (e.g. labels reloaded
# after the features were built) fails here with a clear message.
n_rows = X_train.shape[0]
if not (n_rows == len(train_labels) == len(sample_weights)):
    raise ValueError(
        f"X_train has {n_rows} rows but there are {len(train_labels)} labels and "
        f"{len(sample_weights)} sample weights; re-run the augmentation, vectorization "
        "and class-weight cells in order."
    )

xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
xgb_model.fit(X_train, train_labels, sample_weight=sample_weights)

In [197]:
# Predict on validation set
val_preds = xgb_model.predict(X_val)

# Compute and print the classification report
print("Validation Set Performance:\n", classification_report(val_labels, val_preds, target_names=['Anger', 'Joy', 'Sadness', 'Optimism']))

Validation Set Performance:
               precision    recall  f1-score   support

       Anger       0.73      0.61      0.67       160
         Joy       0.69      0.47      0.56        97
     Sadness       0.16      0.61      0.25        28
    Optimism       0.67      0.51      0.58        89

    accuracy                           0.55       374
   macro avg       0.56      0.55      0.51       374
weighted avg       0.66      0.55      0.59       374



In [198]:
# Compute macro-averaged F1 score for validation set
macro_f1 = f1_score(val_labels, val_preds, average='macro')
print(f"Macro-Averaged F1 Score: {macro_f1:.4f}")

Macro-Averaged F1 Score: 0.5146
