In [None]:
import pandas as pd
import spacy
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report
import random
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tqdm.autonotebook import tqdm

# Custom imports
from helper_functions import evaluate_clf
from EDAprep import SynonymReplacementTransformer, RandomInsertionTransformer, RandomSwapTransformer, RandomDeletionTransformer

In [None]:
# Name of the label column used as the classification target.
FEATURE = "political_leaning"
# Location of the CSV file that holds the posts and their labels.
DATA_PATH = "lai-data/political_leaning.csv"

In [None]:
# Parse only the first 3000 rows instead of loading the whole CSV and slicing it
# afterwards; set nrows=None to run on the full dataset.
df_politics = pd.read_csv(DATA_PATH, nrows=3000)

In [None]:
# Hold out 30% of the rows for evaluation. random_state pins the shuffle so the
# baseline and the EDA model are scored on the same split on every re-run.
X_train, X_test, y_train, y_test = train_test_split(
    df_politics['post'],
    df_politics[FEATURE],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Load the small English spaCy model.
nlp = spacy.load(name="en_core_web_sm")

## 1. BoW SVC model

In [None]:
# Baseline classifier: bag-of-words counts (English stop words removed by the
# vectorizer) fed into a linear SVM.
clf = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer(stop_words='english')),
        ('svc', LinearSVC()),
    ]
)

In [None]:
# Train the baseline on the un-augmented training split, then score it on the
# held-out test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

## 2. BoW SVC with EDA
### 2.1 Data augmentation

In [None]:
# p is the share of each sentence to be augmented
# (good values: 1%, 2%, 5%, 10%, 20%); all four transformers use p=0.2 here.
(
    synonym_replacement_transformer,
    random_insertion_transformer,
    random_swap_transformer,
    random_deletion_transformer,
) = (
    transformer_cls(p=0.2)
    for transformer_cls in (
        SynonymReplacementTransformer,
        RandomInsertionTransformer,
        RandomSwapTransformer,
        RandomDeletionTransformer,
    )
)

In [None]:
stop_words = set(stopwords.words('english'))

n_aug = 4 # The number of augmented instances of a sentence (good values - 2, 4, 8, 16)

# Seed the stdlib RNG so random.choice below picks the same operations every time
# this cell is re-run. NOTE(review): the EDA transformers may draw from their own
# RNG, which this seed does not control - confirm in EDAprep.
random.seed(42)

# Every operation code needs its own comma-separated entry: adjacent string
# literals such as 'RI' 'RS' are silently concatenated into 'RIRS', which matches
# no branch below, so that draw would produce no augmented sentence at all and
# random swap would never be selected.
eda_operations = ('SR', 'RI', 'RS', 'RD')

X_train_augmented = []
y_train_augmented = []

# zip() has no length, so pass total explicitly to get a real progress bar.
for sentence, label in tqdm(zip(X_train, y_train), total=len(X_train)):
    # augmented_sentences = [sentence]  # Keep the original sentence
    augmented_sentences = []
    # Drop NLTK English stop words before any augmentation is applied.
    sentence = ' '.join([word for word in sentence.split() if word.lower() not in stop_words])

    # Produce n_aug variants per sentence, each from one randomly chosen operation.
    for _ in range(n_aug):
        chosen_operation = random.choice(eda_operations)

        if chosen_operation == 'SR':
            augmented_sentences.append(synonym_replacement_transformer.transform([sentence])[0])
        elif chosen_operation == 'RI':
            # Random insertion is disabled because it takes too long; the
            # stop-word-stripped sentence is passed through unchanged instead.
            # augmented_sentences.append(random_insertion_transformer.transform([sentence])[0])
            augmented_sentences.append(sentence)
        elif chosen_operation == 'RS':
            augmented_sentences.append(random_swap_transformer.transform([sentence])[0])
        elif chosen_operation == 'RD':
            augmented_sentences.append(random_deletion_transformer.transform([sentence])[0])

    # Each augmented variant inherits the label of the sentence it came from.
    X_train_augmented.extend(augmented_sentences)
    y_train_augmented.extend([label] * len(augmented_sentences))


### 2.2 BoW SVC with EDA model training and evaluation

In [None]:
# Same bag-of-words + linear SVM pipeline as the baseline, but trained on the
# augmented training set.
clfEDA = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer(stop_words='english')),
        ('svc', LinearSVC()),
    ]
)

# Train on the augmented sentences and their propagated labels.
clfEDA.fit(X=X_train_augmented, y=y_train_augmented)

In [None]:
# Score the EDA-trained model on the original (un-augmented) test split.
y_pred = clfEDA.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))