In [1]:
import pandas as pd
import spacy
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from nltk.corpus import wordnet
import random
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from tqdm.autonotebook import tqdm
import nltk

  from tqdm.autonotebook import tqdm


In [None]:
DATA_PATH = "lai-data/political_leaning.csv"
FEATURE = "political_leaning"

In [2]:
df_politics = pd.read_csv(DATA_PATH).iloc[:1000] # remove iloc to test full dataset

In [4]:
df_politics

Unnamed: 0,auhtor_ID,post,political_leaning
0,t2_7ramzeng,"You can ""buy"" the show and stream it through t...",right
1,t2_7ramzeng,"me want to play Q*bert Holy shit, based Alex J...",right
2,t2_7ramzeng,Shouldn't rely on any external services or per...,right
3,t2_7ramzeng,PR to a specific person. Usually that just mea...,right
4,t2_7ramzeng,This article's intention is clear that they wa...,right
...,...,...,...
995,t2_4vpin,"which added a conformal spine fuel tank, those...",center
996,t2_4vpin,China truly was following its no first use dec...,center
997,t2_4vpin,essentially sonic booms as they operate) and i...,center
998,t2_4vpin,"you do, the weight of ice along with the body ...",center


In [5]:
X_train, X_test, y_train, y_test = train_test_split(df_politics['post'], df_politics[FEATURE],test_size= 0.4)

In [6]:
nlp = spacy.load("en_core_web_sm")

## 1. BoW SVC model

In [7]:
clf = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('svc', LinearSVC())
])

In [8]:
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

      center       1.00      0.97      0.99       306
        left       1.00      0.73      0.84        11
       right       0.88      0.99      0.93        83

    accuracy                           0.97       400
   macro avg       0.96      0.90      0.92       400
weighted avg       0.97      0.97      0.97       400



## 2. BoW SVC with EDA
### 2.1 Data augmentation

In [9]:
nltk.download('wordnet')

class SynonymReplacementTransformer(TransformerMixin):
    def __init__(self, p=0.2):
        self.p = p

    def synonym_replacement(self, sentence):
        words = sentence.split()

        self.n = int(self.p * len(sentence))
        for _ in range(self.n):
            idx = random.randint(0, len(words) - 1)
            word = words[idx]
            synonyms = [syn.name() for syn in wordnet.synsets(word)]
            if synonyms:
                replacement = random.choice(synonyms)
                words[idx] = replacement
        return ' '.join(words)

    def transform(self, X, y=None):
        return [self.synonym_replacement(sentence) for sentence in X]
    
class RandomInsertionTransformer(TransformerMixin):
    def __init__(self, p=0.2):
        self.p = p

    def random_insertion(self, sentence):
        words = sentence.split()
        self.n = int(self.p * len(sentence))
        for _ in range(self.n):
            idx = random.randint(0, len(words) - 1)
            word = words[idx]
            
            # Get synonyms of the word that are not stop words
            synonyms = [syn.name() for syn in wordnet.synsets(word) if syn.name() not in nltk.corpus.stopwords.words('english')]
            
            if synonyms:
                synonym = random.choice(synonyms)
                words.insert(random.randint(0, len(words)), synonym)
        return ' '.join(words)

    def transform(self, X, y=None):
        return [self.random_insertion(sentence) for sentence in X]

class RandomSwapTransformer(TransformerMixin):
    def __init__(self, p=0.2):
        self.p = p

    def random_swap(self, sentence):
        words = sentence.split()
        self.n = int(self.p * len(sentence))
        for _ in range(self.n):
            idx1, idx2 = random.sample(range(len(words)), 2)
            words[idx1], words[idx2] = words[idx2], words[idx1]
        return ' '.join(words)

    def transform(self, X, y=None):
        return [self.random_swap(sentence) for sentence in X]

class RandomDeletionTransformer(TransformerMixin):
    def __init__(self, p=0.2):
        self.p = p

    def random_deletion(self, sentence):
        words = sentence.split()
        words = [word for word in words if random.uniform(0, 1) > self.p]
        return ' '.join(words)

    def transform(self, X, y=None):
        return [self.random_deletion(sentence) for sentence in X]

[nltk_data] Downloading package wordnet to /home/egor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
# p - % of the sentence to be augmented (good values - 1%, 2%, 5%, 10%, 20%)

synonym_replacement_transformer = SynonymReplacementTransformer(p=0.2)
random_insertion_transformer = RandomInsertionTransformer(p=0.2)
random_swap_transformer = RandomSwapTransformer(p=0.2)
random_deletion_transformer = RandomDeletionTransformer(p=0.2)

In [11]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

n_aug = 4 # The number of augmented instances of a sentence (good values - 2, 4, 8, 16)

X_train_augmented = []
y_train_augmented = []
for sentence, label in tqdm(zip(X_train, y_train)):
    # augmented_sentences = [sentence]  # Keep the original sentence
    augmented_sentences = []
    sentence = ' '.join([word for word in sentence.split() if word.lower() not in stop_words])

    for _ in range(n_aug):
        chosen_operation = random.choice(['SR', 'RI' 'RS', 'RD'])  

        if chosen_operation == 'SR':
            augmented_sentences.append(synonym_replacement_transformer.transform([sentence])[0])
        elif chosen_operation == 'RI':
            # augmented_sentences.append(random_insertion_transformer.transform([sentence])[0]) # Insertions take too long
            augmented_sentences.append(sentence)
        elif chosen_operation == 'RS':
            augmented_sentences.append(random_swap_transformer.transform([sentence])[0])
        elif chosen_operation == 'RD':
            augmented_sentences.append(random_deletion_transformer.transform([sentence])[0])

    X_train_augmented.extend(augmented_sentences)
    y_train_augmented.extend([label] * len(augmented_sentences))


0it [00:00, ?it/s]

### 2.2 BoW SVC with EDA model training and evaluation

In [12]:
# Your pipeline with data augmentation
clfEDA = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('svc', LinearSVC())
])

# Fit the model with augmented data
clfEDA.fit(X_train_augmented, y_train_augmented)



In [13]:
y_pred = clfEDA.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      center       1.00      0.97      0.98       306
        left       1.00      0.73      0.84        11
       right       0.86      0.99      0.92        83

    accuracy                           0.96       400
   macro avg       0.95      0.89      0.92       400
weighted avg       0.97      0.96      0.97       400

