In [None]:
!pip install spacy



In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import re
import string
import spacy

In [None]:
# Load the 'en_core_web_sm' spaCy pipeline once at module level;
# syntactic_features_en reads this global `nlp` for every tweet it parses.
nlp = spacy.load('en_core_web_sm')

In [None]:
# feature extractor: turns one tweet into a fixed-length list of syntactic features
def syntactic_features_en(tweet):
    """Extract five syntactic features from a single English tweet.

    The text is parsed with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    tweet : str
        Raw tweet text. A missing value (``None`` or a float ``NaN``, which is
        how pandas represents an empty CSV cell) is treated as an empty
        string, so it produces an all-zero feature row instead of being handed
        to the pipeline as a non-string.

    Returns
    -------
    list
        ``[clause_per_sentence, imperative_count, passive_count,
        gendered_pronoun_ratio, neg_count]`` in exactly this order.
    """
    # NaN is the only float that is not equal to itself.
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        tweet = ""

    doc = nlp(tweet)
    # Materialise the sentences once; they are needed for two features below.
    sentences = list(doc.sents)

    # 1. clauses per sentence: tokens heading a clausal dependency, divided by
    #    the number of sentences (0 when there are no sentences)
    clause_deps = {"csubj", "ccomp", "advcl", "acl", "relcl"}
    clause_count = sum(1 for token in doc if token.dep_ in clause_deps)
    clause_per_sentence = clause_count / len(sentences) if sentences else 0

    # 2. count of imperative sentences: heuristic, the sentence's first token
    #    must be a verb carrying the fine-grained tag "VB"
    imperative_count = sum(
        1
        for sent in sentences
        if len(sent) > 0 and sent[0].pos_ == "VERB" and sent[0].tag_ == "VB"
    )

    # 3. count of passive voice usage: a passive subject whose head also
    #    governs a passive auxiliary
    passive_count = sum(
        1
        for token in doc
        if token.dep_ == "nsubjpass"
        and any(child.dep_ == "auxpass" for child in token.head.children)
    )

    # 4. ratio of women-related gendered pronouns to total pronouns
    #    (0 when the tweet contains no pronouns)
    pronouns = [token.text.lower() for token in doc if token.pos_ == "PRON"]
    women_gendered_pronouns = {'she', 'her', 'hers'}
    gendered_count = sum(1 for pronoun in pronouns if pronoun in women_gendered_pronouns)
    gendered_pronoun_ratio = gendered_count / len(pronouns) if pronouns else 0

    # 5. count of negations
    neg_count = sum(1 for token in doc if token.dep_ == "neg")

    return [clause_per_sentence,
            imperative_count,
            passive_count,
            gendered_pronoun_ratio,
            neg_count]

In [None]:
# read the English training split and pull tweets and labels out as plain lists
en_training_dataset = pd.read_csv("train_en_dataset.csv")
en_training_text = en_training_dataset.loc[:, "tweet"].to_list()
en_training_label = en_training_dataset.loc[:, "value"].to_list()

In [None]:
# one feature row per training tweet; the labels are used as they were loaded
en_X_train = list(map(syntactic_features_en, en_training_text))
en_Y_train = en_training_label

In [None]:
# fit a logistic regression on the syntactic features;
# class_weight='balanced' is used because the dataset is slightly imbalanced
LR = LogisticRegression(class_weight='balanced', max_iter=100000)
LR.fit(X=en_X_train, y=en_Y_train)

In [None]:
# read the English test split and pull tweets and labels out as plain lists
en_test_dataset = pd.read_csv("test_en_dataset.csv")
en_test_text = en_test_dataset.loc[:, "tweet"].to_list()
en_test_label = en_test_dataset.loc[:, "value"].to_list()

In [None]:
# one feature row per test tweet, built with the same extractor as the training data
en_X_test = list(map(syntactic_features_en, en_test_text))
en_Y_test = en_test_label

In [None]:
# score the fitted model on the test split: accuracy and binary F1
y_pred = LR.predict(en_X_test)
acc = accuracy_score(y_true=en_Y_test, y_pred=y_pred)
f1 = f1_score(y_true=en_Y_test, y_pred=y_pred, average='binary')

In [None]:
# display (accuracy, F1) for the test split
(acc, f1)

(0.6153846153846154, 0.5124282982791587)

In [None]:
# Learned weights, one per feature, in the order returned by syntactic_features_en:
# [clause_per_sentence, imperative_count, passive_count, gendered_pronoun_ratio, neg_count]
LR.coef_

array([[ 0.09062515, -0.01115761, -0.01282267,  1.56755249,  0.22927649]])