In [None]:
!pip install jieba



In [None]:
!pip install snownlp

Collecting snownlp
  Downloading snownlp-0.12.3.tar.gz (37.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.6/37.6 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: snownlp
  Building wheel for snownlp (setup.py) ... [?25l[?25hdone
  Created wheel for snownlp: filename=snownlp-0.12.3-py3-none-any.whl size=37760946 sha256=fb16ed0b93edb8299910754fef49840e1067ae1e7a18697b7b98e4b91e3f6fdb
  Stored in directory: /root/.cache/pip/wheels/43/f3/70/8990fc249efeb396007766676706f71dd3d1ca3c023ce522ce
Successfully built snownlp
Installing collected packages: snownlp
Successfully installed snownlp-0.12.3


In [None]:
import jieba
import re
from snownlp import SnowNLP
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Load the Chinese sexist lexicon, treating every non-blank line of the file as
# one entry. The entries are kept in a set so that duplicates collapse and the
# per-token `in SexHateLexicon` membership test does not scan the whole lexicon;
# blank lines are skipped so the empty string never becomes a lexicon entry.
with open('SexHateLex.txt', 'r', encoding='utf-8') as file:
    SexHateLexicon = {term for term in map(str.strip, file) if term}

In [None]:
# Regex used later to count punctuation marks. It is a single character class
# covering full-width (Chinese) and ASCII punctuation, so every matching
# character is counted on its own (e.g. "……" contributes two matches).
punctuation_pattern = r"[，。？！：；、“”‘’（）《》【】——……—·,.\?!:;'\"()<>]"

In [None]:
# get the hand-crafted stylistic features
def stylistic_features_zh(text):
    """Compute the hand-crafted stylistic features of one Chinese Weibo post.

    Relies on the module-level ``jieba`` tokenizer, ``SnowNLP`` sentiment
    scorer, ``SexHateLexicon`` collection and ``punctuation_pattern`` regex.

    Args:
        text: The post text. ``None`` and NaN (what pandas yields for an empty
            CSV cell) are treated as an empty post; any other non-string value
            is converted with ``str()``.

    Returns:
        A list of 17 numbers, in this order:
            1. token count
            2. average characters per token
            3. sentence count
            4. hashtag count
            5. mention count
            6. link count
            7. sentiment score (0 to 1)
            8. lexicon-word count
            9. lexicon-word ratio
            10. punctuation count
            11. punctuation ratio
            12. exclamation-mark count
            13. exclamation-mark ratio
            14. question-mark count
            15. question-mark ratio
            16. emoji count
            17. emoji ratio
        Every ratio is relative to the token count and is 0 for a post with
        no tokens.
    """
    # Normalise missing / non-string input up front; jieba and re only accept
    # strings. NaN is the only value that is not equal to itself.
    if not isinstance(text, str):
        text = "" if text is None or text != text else str(text)

    # tokenize the Chinese weibo using the jieba tokenization package
    tokens = list(jieba.cut(text))

    # 1. word number in each tweet
    token_num_per_tweet = len(tokens)

    def per_token(count):
        # Shared guard for all ratios: an empty post has no tokens to divide by.
        return count / token_num_per_tweet if token_num_per_tweet > 0 else 0

    # 2. avg word length in each tweet
    char_num_per_tweet = sum(len(token) for token in tokens)
    avg_char_num_per_token = per_token(char_num_per_tweet)

    # 3. number of sentences in each tweet
    # Split on full-width and ASCII sentence enders. The ASCII '.' is left out
    # on purpose because it also appears in numbers and abbreviations.
    sentences = [s for s in re.split(r'[。！？!?]', text) if s.strip()]
    sentence_num = len(sentences)

    # 4. number of hashtags ('#URL' is the link placeholder, not a hashtag)
    hashtag_num = len(re.findall(r'#(?!URL\b)\w+', text))

    # 5. number of mentions
    mention_num = text.count('@username')

    # 6. number of links
    link_num = text.count('#URL')

    # 7. sentiment analysis, from 0 to 1
    # SnowNLP cannot score an empty document (it divides by the document
    # length), so an empty post gets the neutral midpoint instead.
    sentiment_compound = SnowNLP(text).sentiments if text else 0.5

    # 8. sexism word frequency statistics
    sexwords_count = sum(1 for token in tokens if token in SexHateLexicon)

    # 9. ratio of sexist words in a tweet
    sexwords_ratio = per_token(sexwords_count)

    # 10. number of all punctuations of each tweet
    punctuation_count = len(re.findall(punctuation_pattern, text))

    # 11. ratio of punctuations in relation to the number of words
    punctuation_ratio = per_token(punctuation_count)

    # 12. number of exclamation marks (full-width and ASCII, matching what
    #     punctuation_pattern already treats as punctuation)
    exclamation_count = text.count('！') + text.count('!')

    # 13. ratio of exclamation marks
    exclamation_ratio = per_token(exclamation_count)

    # 14. number of question marks (full-width and ASCII)
    question_count = text.count('？') + text.count('?')

    # 15. ratio of question marks
    question_ratio = per_token(question_count)

    # 16. count of emojis in each tweet: shortest runs of text enclosed in a
    #     pair of colons (e.g. ":smile:")
    emoji_count = len(re.findall(r':[^:]+?:', text))

    # 17. emoji ratio
    emoji_ratio = per_token(emoji_count)

    return [token_num_per_tweet,
            avg_char_num_per_token,
            sentence_num,
            hashtag_num,
            mention_num,
            link_num,
            sentiment_compound,
            sexwords_count,
            sexwords_ratio,
            punctuation_count,
            punctuation_ratio,
            exclamation_count,
            exclamation_ratio,
            question_count,
            question_ratio,
            emoji_count,
            emoji_ratio]


In [None]:
# Read the Chinese training CSV and pull the post texts and their labels out
# of it as plain Python lists.
zh_training_dataset = pd.read_csv('train_zh_dataset.csv')
zh_training_tweet = zh_training_dataset.loc[:, 'comment_text'].to_list()
zh_training_label = zh_training_dataset.loc[:, 'label'].to_list()

In [None]:
# Turn every training post into its stylistic feature vector; the labels are
# used as they were loaded.
zh_X_train = list(map(stylistic_features_zh, zh_training_tweet))
zh_Y_train = zh_training_label

In [None]:
# Fit a logistic regression classifier on the stylistic feature vectors.
# class_weight='balanced' is used because the dataset is slightly imbalanced.
LR = LogisticRegression(
    class_weight='balanced',
    max_iter=100_000,
)
LR.fit(zh_X_train, zh_Y_train)

In [None]:
# Read the Chinese test CSV and pull the post texts and their labels out of it
# as plain Python lists.
zh_test_dataset = pd.read_csv('test_zh_dataset.csv')
zh_test_tweet = zh_test_dataset.loc[:, 'comment_text'].to_list()
zh_test_label = zh_test_dataset.loc[:, 'label'].to_list()

In [None]:
# Build the stylistic feature vector of every test post with the same
# extractor that produced the training features.
zh_X_test = list(map(stylistic_features_zh, zh_test_tweet))
zh_Y_test = zh_test_label

In [None]:
# Predict labels for the test posts, then score the predictions with accuracy
# and the binary F1 measure.
y_pred = LR.predict(zh_X_test)
acc = accuracy_score(y_true=zh_Y_test, y_pred=y_pred)
f1 = f1_score(y_true=zh_Y_test, y_pred=y_pred, average='binary')

In [None]:
# Display the test-set accuracy and F1 score as an (accuracy, F1) tuple.
acc, f1

(0.6644370122630993, 0.5244865718799369)

In [None]:
# Print the fitted coefficients; they follow the order of the feature list
# returned by stylistic_features_zh.
print(LR.coef_)

[[-0.00191387 -0.37013425 -0.00561829 -0.34020926  0.          0.
   0.12427802  0.45559793  1.8346602   0.03787762 -0.97371502 -0.02772638
  -1.02073478 -0.01630056 -0.60504461  0.21352659  0.42168216]]
