In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import wordcloud
import matplotlib.pyplot as plt

from nltk.corpus import stopwords, opinion_lexicon
from nltk.stem import WordNetLemmatizer

# from spellchecker import SpellChecker
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from tqdm import tqdm
# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

import os
# Print the path of every file found under the competition data directory.
data_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('aalto-snlp-course-competition-2024')
    for name in names
)
for path in data_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import csv
# Load the train / dev / test splits. QUOTE_NONE tells pandas to treat quote
# characters as ordinary text rather than as field delimiters.
_csv_options = {'quoting': csv.QUOTE_NONE}
df_train = pd.read_csv('./aalto-snlp-course-competition-2024/train_2024.csv', **_csv_options)
df_dev = pd.read_csv('./aalto-snlp-course-competition-2024/dev_2024.csv', **_csv_options)
df_test = pd.read_csv(
    './aalto-snlp-course-competition-2024/test_2024.csv',
    sep=',',
    encoding='utf-8',
    **_csv_options,
)

In [None]:
# Show how many training rows carry each value of the `label` column.
df_train['label'].value_counts()

In [None]:
import re
import string
# Single lemmatizer instance shared by clean_text() for every token.
lemmatizer = WordNetLemmatizer()
# Opinion-lexicon word lists, stored as sets for fast membership tests in
# count_positive() / count_negative().
POSITIVE_WORDS = set(opinion_lexicon.positive())
NEGATIVE_WORDS = set(opinion_lexicon.negative())
# spell = SpellChecker()
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
# NOTE(review): the last range (U+24C2..U+1F251) is far wider than emoji alone
# and also strips e.g. CJK characters; it is kept unchanged so that cleaned
# text stays the same for existing data.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
_PUNCTUATION_PATTERN = re.compile('[%s]' % re.escape(string.punctuation))
_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    '''Return NLTK's English stop words as a set, reading the corpus only once.'''
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clean_text(text, set_lower=True):
    '''Normalise a raw text string for feature extraction.

    Steps, in order: optional lowercasing; removal of text in square brackets,
    links, html tags, punctuation, emoji and any token containing a digit;
    removal of English stop words; lemmatisation of the remaining tokens.

    Parameters
    ----------
    text : str
        Raw input text. Any non-string value (e.g. the NaN pandas uses for a
        missing cell) is treated as empty text.
    set_lower : bool, default True
        Lowercase the text first. With False the case is preserved, and the
        stop-word comparison (against lowercase words) becomes case-sensitive.

    Returns
    -------
    str
        The remaining lemmatised tokens joined by single spaces.
    '''
    # Missing values reach here as float NaN when applied over a DataFrame column.
    if not isinstance(text, str):
        return ''

    new_text = text.lower() if set_lower else text
    # remove text in square brackets
    new_text = re.sub(r'\[.*?\]', '', new_text)
    # remove links
    new_text = re.sub(r'https?://\S+|www\.\S+', '', new_text)
    # remove html tags
    new_text = re.sub(r'<.*?>', '', new_text)
    # remove punctuation
    new_text = _PUNCTUATION_PATTERN.sub('', new_text)
    # remove emoji
    new_text = _EMOJI_PATTERN.sub('', new_text)
    # Turn line breaks into spaces (not nothing) so that the last word of one
    # line is not glued to the first word of the next.
    new_text = new_text.replace('\n', ' ')
    # Drop every token that contains a digit; no digit survives this step.
    new_text = re.sub(r'\w*\d\w*', '', new_text)

    stop_words = _english_stop_words()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in new_text.split()
        if word not in stop_words
    )

def extract_emoticons(text):
    '''Return the last token of the cleaned, case-preserved text.

    An empty string is returned when cleaning leaves no tokens.

    NOTE(review): clean_text() strips punctuation and emoji before the token is
    picked, so despite the name the result is not necessarily an emoticon.
    '''
    tokens = clean_text(text, set_lower=False).split()
    if not tokens:
        return ''
    return tokens[-1]

def count_all_capitalized(text):
    '''Count the tokens of the cleaned, case-preserved text that are fully upper-case.'''
    tokens = clean_text(text, set_lower=False).split()
    return sum(1 for token in tokens if token.isupper())

def count_positive(text):
    '''Count the tokens of `text` that appear in POSITIVE_WORDS.

    The text is cleaned with clean_text() using its default lowercasing, so
    matching is case-insensitive ("Great" and "GREAT" both count as "great").

    Returns the number of matching tokens as an int (0 for empty text).
    '''
    # The NLTK opinion lexicon stores its entries in lower case, so matching
    # against case-preserved text would miss every capitalised sentiment word.
    tokens = clean_text(text).split()
    return sum(1 for word in tokens if word in POSITIVE_WORDS)

def count_negative(text):
    '''Count the tokens of `text` that appear in NEGATIVE_WORDS.

    The text is cleaned with clean_text() using its default lowercasing, so
    matching is case-insensitive ("Awful" and "AWFUL" both count as "awful").

    Returns the number of matching tokens as an int (0 for empty text).
    '''
    # The NLTK opinion lexicon stores its entries in lower case, so matching
    # against case-preserved text would miss every capitalised sentiment word.
    tokens = clean_text(text).split()
    return sum(1 for word in tokens if word in NEGATIVE_WORDS)
        

In [None]:
# Add a cleaned copy of the raw `text` column to each of the three splits.
for _split in (df_train, df_dev, df_test):
    _split['text_clean'] = _split['text'].apply(clean_text)

# df_train['emoticons'] = df_train['text'].apply(extract_emoticons)
# df_dev['emoticons'] = df_dev['text'].apply(extract_emoticons)
# df_test['emoticons'] = df_test['text'].apply(extract_emoticons)

# df_train['num_all_capitalized'] = df_train['text'].apply(count_all_capitalized)
# df_dev['num_all_capitalized'] = df_dev['text'].apply(count_all_capitalized)
# df_test['num_all_capitalized'] = df_test['text'].apply(count_all_capitalized)

In [None]:
# Inspect the training frame, which now carries the `text_clean` column.
df_train

## Define models

In [None]:
import joblib
import scipy

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier
from sklearn import svm
from sklearn.metrics import f1_score, confusion_matrix, classification_report
# Preprocessing object saved by an earlier step (presumably the fitted TF-IDF
# vectoriser, going by the file name - confirm).
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
preprocessing_pipe = joblib.load('tfidf.pkl')

# Training features and labels are stored as sparse matrices: densify the
# features and take the first row of the label matrix as the label vector.
# NOTE(review): densifying a large sparse feature matrix can use a lot of memory.
x_train = scipy.sparse.load_npz('output/x_train_smote.npz').toarray()
y_train = scipy.sparse.load_npz('output/y_train_smote.npz').toarray()[0]

# Dev features go through the same preprocessing object; labels come straight
# from the frame.
x_dev = preprocessing_pipe.transform(df_dev['text_clean'])
y_dev = df_dev['label']

In [None]:
# Base learners of the stacked ensemble, as (name, estimator) pairs.
# Candidates currently switched off: Logistic Regression, Ada Boost,
# Gradient Boost, LightGBM, CatBoost and SVM.
estimators = [
    ("Naive Bayes", MultinomialNB()),
    ("Random Forest", RandomForestClassifier()),
    ("XGBoost", XGBClassifier()),
]

# A logistic regression combines the base learners' predictions.
stacking_classifier = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    n_jobs=4,
)
stacking_classifier

## Train

In [None]:
# The model trained and used for the submission below is a plain multinomial
# Naive Bayes; `stacking_classifier` defined earlier is not fitted in these cells.
model = MultinomialNB()

In [None]:
# Fit the classifier on the training features and labels loaded from the
# `*_smote.npz` files.
model.fit(x_train, y_train)

In [None]:
# Training-set performance: F1 score, confusion matrix, then the per-class report.
y_pred_class = model.predict(x_train)

for metric in (f1_score, confusion_matrix, classification_report):
    print(metric(y_train, y_pred_class))

In [None]:
# Dev-set performance: F1 score, confusion matrix, then the per-class report.
y_pred_class = model.predict(x_dev)

for metric in (f1_score, confusion_matrix, classification_report):
    print(metric(y_dev, y_pred_class))

In [None]:

# Persist the fitted model to disk.
joblib.dump(model, "model-nb.pkl") 

# To reload it in a later session (same file name as the dump above):
# model = joblib.load("model-nb.pkl")

## Submission

In [None]:
# The model was fitted on vectorised features, not raw strings, so the test
# text must pass through the same fitted preprocessing object used for x_dev
# before it can be scored.
x_test = preprocessing_pipe.transform(df_test['text_clean'])
df_test['label'] = model.predict(x_test)

In [None]:
# Keep only the predicted label and write it out.
# NOTE(review): to_csv also writes the frame's index as an unnamed first
# column - confirm that this matches the id column the competition expects.
submission = df_test.loc[:, ['label']]
submission.to_csv(path_or_buf='./submission.csv')

In [None]:
# Sanity check: (number of rows, number of columns) of the submission frame.
submission.shape