In [None]:
!pip install vaderSentiment



In [None]:
!pip install --upgrade pip



In [None]:
!pip install numpy==1.26.3



In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import string


In [None]:
# Fetch the NLTK data packages this notebook asks for. The punkt packages are
# presumably what the word/sentence tokenizers below rely on; the tagger
# packages are not used by any cell visible here.
for resource in ('punkt_tab', 'averaged_perceptron_tagger_eng', 'punkt'):
    nltk.download(resource)
# Left as the final expression so the cell still displays its return value.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Shared VADER analyzer instance; stylistic_features_en() calls its
# polarity_scores() on every tweet, so it is built once here.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# English lexicon read from 'hurtlex_EN.tsv' (tab-separated). Only its 'lemma'
# column is kept, as a plain list, for the lexicon-based features below.
en_lex_df = pd.read_csv('hurtlex_EN.tsv', sep='\t')
en_lex = en_lex_df.loc[:, 'lemma'].tolist()

## Using stylistic features to detect sexist intent in English tweets

In [None]:
# Extract the hand-crafted stylistic features that may help detect sexism.
def stylistic_features_en(tweet):
    """Build the 17-element stylistic feature vector for one English tweet.

    Relies on two module-level objects: ``analyzer`` (its ``polarity_scores``
    method is called on the tweet) and ``en_lex`` (the lexicon lemma list).

    Args:
        tweet: Tweet text. Mentions are counted as the literal '@username'
            and links as the literal '#URL' -- presumably placeholders
            inserted by upstream preprocessing (confirm against the dataset).

    Returns:
        list: the features, in this order:
             1. token count
             2. average characters per token
             3. sentence count
             4. hashtag count (the '#URL' marker is not counted)
             5. mention count
             6. link count
             7. VADER compound sentiment score
             8. number of tokens found in ``en_lex``
             9. lexicon-token ratio
            10. punctuation character count
            11. punctuation ratio
            12. exclamation mark count
            13. exclamation mark ratio
            14. question mark count
            15. question mark ratio
            16. emoji count
            17. emoji ratio
        Feature 2 and every ratio are divided by the token count and are 0
        when the tweet yields no tokens.
    """
    words = nltk.word_tokenize(tweet)
    n_tokens = len(words)

    def per_token(amount):
        # Normalise by the token count; a token-less tweet falls back to 0.
        if n_tokens > 0:
            return amount / n_tokens
        return 0

    # Average token length, as a rough proxy for word complexity.
    mean_token_len = per_token(sum(map(len, words)))

    n_sentences = len(nltk.sent_tokenize(tweet))

    # '#URL' is the link marker, so the lookahead keeps an exact '#URL'
    # out of the hashtag count.
    n_hashtags = len(re.findall(r'#(?!URL\b)\w+', tweet))
    n_mentions = tweet.count('@username')
    n_links = tweet.count('#URL')

    # Compound polarity score reported by VADER for the whole tweet.
    compound = analyzer.polarity_scores(tweet)['compound']

    # Lower-cased tokens that appear in the lexicon.
    # NOTE(review): en_lex is built as a list, so every lookup is a linear
    # scan; a set would be faster if nothing else needs list semantics.
    n_lexicon_hits = sum(1 for word in words if word.lower() in en_lex)

    # Character-level counts over the raw tweet text.
    n_punctuation = len([ch for ch in tweet if ch in string.punctuation])
    n_exclamations = tweet.count('!')
    n_questions = tweet.count('?')

    # Emojis are counted as ':...:' spans.
    # NOTE(review): the pattern accepts any text between two colons, so
    # ordinary colons in prose can be counted too -- confirm the emoji format.
    n_emojis = len(re.findall(r':[^:]+?:', tweet))

    return [
        n_tokens,
        mean_token_len,
        n_sentences,
        n_hashtags,
        n_mentions,
        n_links,
        compound,
        n_lexicon_hits,
        per_token(n_lexicon_hits),
        n_punctuation,
        per_token(n_punctuation),
        n_exclamations,
        per_token(n_exclamations),
        n_questions,
        per_token(n_questions),
        n_emojis,
        per_token(n_emojis),
    ]


In [None]:
# Read the English training split and pull out the tweet texts ('tweet'
# column) and their labels ('value' column) as plain lists.
en_training_dataset = pd.read_csv('train_en_dataset.csv')
en_training_text, en_training_label = (
    en_training_dataset[column].tolist() for column in ('tweet', 'value')
)

In [None]:
# Featurise every training tweet; the labels are used unchanged.
en_X_train = list(map(stylistic_features_en, en_training_text))
en_Y_train = en_training_label

In [None]:
# Fit a logistic regression on the stylistic features.
# class_weight='balanced' is used because the dataset is slightly imbalanced;
# max_iter is set very high to give the solver room to converge.
LR = LogisticRegression(
    class_weight='balanced',
    max_iter=100000,
)
LR.fit(X=en_X_train, y=en_Y_train)

In [None]:
# Read the English test split and pull out the tweet texts ('tweet' column)
# and their labels ('value' column) as plain lists.
en_test_dataset = pd.read_csv('test_en_dataset.csv')
en_test_text, en_test_label = (
    en_test_dataset[column].tolist() for column in ('tweet', 'value')
)

In [None]:
# Featurise every test tweet with the same extractor used for training;
# the labels are used unchanged.
en_X_test = list(map(stylistic_features_en, en_test_text))
en_Y_test = en_test_label

In [None]:
# Score the fitted model on the test split: predict labels, then compute
# accuracy and the binary-averaged F1 score against the true labels.
y_pred = LR.predict(en_X_test)
acc = accuracy_score(y_true=en_Y_test, y_pred=y_pred)
f1 = f1_score(y_true=en_Y_test, y_pred=y_pred, average='binary')

In [None]:
# Display the accuracy and F1 score computed on the test split.
acc, f1

(0.6003016591251885, 0.5691056910569106)

In [None]:
# Learned coefficients, one per feature, in the order returned by
# stylistic_features_en(): token count, avg chars per token, sentence count,
# hashtags, mentions, links, sentiment compound, lexicon-word count and ratio,
# punctuation count and ratio, exclamation count and ratio, question count and
# ratio, emoji count and ratio.
# NOTE(review): the visible cells feed the features to the model unscaled, so
# coefficient magnitudes are not directly comparable across features.
LR.coef_

array([[-0.00422917, -0.31443202, -0.0442698 , -0.0576277 , -0.02647436,
        -0.25067744, -0.38769354,  0.10772924,  0.54416285,  0.01497085,
         0.3551094 , -0.04678978, -0.31184163,  0.09956017,  0.20518468,
         0.01917784,  0.30154435]])