In [70]:
import string
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, \
    TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [27]:
# Load the labelled tweets and preview a handful of randomly chosen rows.
DATA_PATH = 'cyberbullying_tweets.csv'
data = pd.read_csv(DATA_PATH)
data.sample(n=5)

Unnamed: 0,tweet_text,cyberbullying_type
21338,@hassanrahman11 @ToAllahWeReturn One Christ te...,religion
19561,Do you think I hate the muslims? I as the 90% ...,religion
44000,"@fuck_illuminiti Message To ALL y'all, STUPID ...",ethnicity
9563,GOP Congressman calls a female Congresswoman a...,gender
17929,Since when did religious freedom become radica...,religion


In [6]:
# (rows, columns) of the loaded frame.
data.shape

(47692, 2)

In [4]:
# Number of tweets per label, to see how evenly the classes are represented.
data['cyberbullying_type'].value_counts()

cyberbullying_type
religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: count, dtype: int64

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the tweets for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['tweet_text'],
    data['cyberbullying_type'],
    test_size=0.3,
    random_state=42,
)

In [29]:
# Bag-of-words vectorizer with default settings, used on the raw tweets.
vectorizer = CountVectorizer()
# NOTE(review): train_corpus is not referenced anywhere else in the visible notebook.
train_corpus = ''

vectorizer.fit(X_train)  # fitted on the training split only; X_test is not passed here

In [30]:
# Number of feature names exposed by the fitted vectorizer.
len(vectorizer.get_feature_names_out())

48382

In [31]:
# Encode both splits with the same fitted vectorizer so they share one feature space.
X_train_bow = vectorizer.transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [32]:
# Baseline: logistic regression on the raw bag-of-words counts.
# max_iter=1000 is presumably raised from the default to help the solver
# converge on this many features -- confirm.
classifier = LogisticRegression(max_iter=1000).fit(X_train_bow, y_train)
y_pred = classifier.predict(X_test_bow)

In [35]:
# Score the baseline predictions; average='weighted' combines the per-class
# scores weighted by how many true samples each class has.
pres = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# One print call, one metric per line.
print(
    'precision BoW Score = {0}'.format(pres),
    'recall BoW Score = {0}'.format(recall),
    'F1 BoW Score = {0}'.format(f1),
    sep='\n',
)

precision BoW Score = 0.8283853763342308
recall BoW Score = 0.8240844282918647
F1 BoW Score = 0.8259655798807278


# Очистка

In [62]:
def _cleaning_resources():
    """Return the shared ``(tokenize, lemmatizer, stop_words, non_letters)`` tuple.

    The tuple is built on first use and cached on the function object, so the
    NLTK stop-word list is read and the lemmatizer constructed only once
    instead of once per cleaned document. The NLTK imports are kept local so
    that merely defining these functions does not require NLTK.
    """
    cached = getattr(_cleaning_resources, '_cache', None)
    if cached is None:
        import re
        from nltk.tokenize import word_tokenize
        from nltk.corpus import stopwords
        from nltk import WordNetLemmatizer

        cached = (
            word_tokenize,
            WordNetLemmatizer(),
            frozenset(stopwords.words('english')),
            re.compile('[^a-zA-Z]'),
        )
        _cleaning_resources._cache = cached
    return cached


def clean_text(text):
    """Normalize one raw document into a space-separated string of lemmas.

    Steps:
      1. replace every character that is not an ASCII letter with a space;
      2. lowercase and tokenize with NLTK's English ``word_tokenize``;
      3. lemmatize each token with ``WordNetLemmatizer``;
      4. drop lemmas of two characters or fewer and English stop words.

    Args:
        text: The raw document as a ``str``. ``None`` and float NaN (missing
            values) are treated as an empty document.

    Returns:
        The kept lemmas joined by single spaces; ``''`` if nothing survives.
    """
    # Missing values would otherwise make the regex substitution raise.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    tokenize, lemmatizer, stop_words, non_letters = _cleaning_resources()

    # The text is lowercased once here; tokens produced from it contain only
    # lowercase letters, so no per-token lower()/strip() is needed.
    letters_only = non_letters.sub(' ', text).lower()

    kept = []
    for token in tokenize(letters_only, language='english'):
        lemma = lemmatizer.lemmatize(token)
        # The stop-word check runs on the lemma, not on the surface form.
        if len(lemma) > 2 and lemma not in stop_words:
            kept.append(lemma)
    return ' '.join(kept)

In [65]:
# Run the same cleaning over both splits.
train_cleaned = X_train.apply(clean_text)
test_cleaned = X_test.apply(clean_text)

# Before/after view of the first training tweet.
X_train.iloc[0], train_cleaned.iloc[0]

("@MarleyGotBoobs gays r not the sissi's st8 men r. We run from the prison rape ! It's just a good day for them ##joke",
 'marleygotboobs gay sissi men run prison rape good day joke')

In [67]:
# Fresh bag-of-words vocabulary, learned from the cleaned training tweets only
# and then reused to encode the cleaned test tweets.
vectorizer = CountVectorizer()

train_bow = vectorizer.fit_transform(train_cleaned)
test_bow = vectorizer.transform(test_cleaned)

# Feature count = number of columns of the document-term matrix.
train_bow.shape[1]

42970

In [68]:
# Same model as the baseline, now trained on the cleaned bag-of-words features.
classifier = LogisticRegression(max_iter=1000).fit(train_bow, y_train)
y_pred = classifier.predict(test_bow)

# Support-weighted averages across the classes.
pres = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(
    'precision BoW Score = {0}'.format(pres),
    'recall BoW Score = {0}'.format(recall),
    'F1 BoW Score = {0}'.format(f1),
    sep='\n',
)

precision BoW Score = 0.8273891929275429
recall BoW Score = 0.8228263908303047
F1 BoW Score = 0.8246784982180299


# TF-IDF

In [95]:
# TF-IDF over unigrams and bigrams of the cleaned tweets.
TFIDF_NGRAM_RANGE = (1, 2)
# NOTE(review): a float min_df is interpreted as a fraction of documents; with
# roughly 33k training tweets this presumably amounts to "appears in at least
# two tweets" -- confirm against the scikit-learn docs before changing it.
TFIDF_MIN_DF = 0.00003

tfidf = TfidfVectorizer(ngram_range=TFIDF_NGRAM_RANGE, min_df=TFIDF_MIN_DF)
train_tfidf = tfidf.fit_transform(train_cleaned)
test_tfidf = tfidf.transform(test_cleaned)

# Feature count = number of columns of the TF-IDF matrix.
train_tfidf.shape[1]

56832

In [96]:
# Same logistic-regression setup, trained on the TF-IDF features.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(train_tfidf, y_train)

y_pred = classifier.predict(test_tfidf)

# Support-weighted averages across the classes.
pres = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# These scores belong to the TF-IDF model, so label them as such rather than
# reusing the "BoW" label from the earlier cells.
print('precision TF-IDF Score = {0}'.format(pres))
print('recall TF-IDF Score = {0}'.format(recall))
print('F1 TF-IDF Score = {0}'.format(f1))

precision BoW Score = 0.8318488491751882
recall BoW Score = 0.8296757058987979
F1 BoW Score = 0.830020749866719
