In [1]:
import pandas as pd
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the data
df = pd.read_csv('data\BengalihatespeechFull.csv')
df.head()

Unnamed: 0,sentence,hate,category
0,যত্তসব পাপন শালার ফাজলামী!!!!!,1,sports
1,পাপন শালা রে রিমান্ডে নেওয়া দরকার,1,sports
2,জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ হবে এটা...,1,sports
3,শালা লুচ্চা দেখতে পাঠার মত দেখা যায়,1,sports
4,তুই তো শালা গাজা খাইছচ।তুর মার হেডায় খেলবে সাকিব,1,sports


In [3]:
import spacy

In [4]:
nlp = spacy.blank("bn")

In [5]:
def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.text)
    
    return " ".join(filtered_tokens) 

In [6]:
stop_words = nlp.Defaults.stop_words

In [7]:
df['preprocessed_sentence'] = df['sentence'].apply(preprocess) 

In [8]:
import re

def remove_emojis(text):
    # Define a regex pattern for emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251" 
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text).strip()

In [9]:
# Function to remove HTML tags
def remove_html_tag(text):
    br_pattern = re.compile(r'<\s*br\s*/?>,*,?')
    return br_pattern.sub('', text)

In [10]:
df['preprocessed_sentence'] = df['preprocessed_sentence'].apply(remove_emojis)

In [11]:
df['preprocessed_sentence'] = df['preprocessed_sentence'].apply(remove_html_tag)

In [12]:
df = df.drop(['sentence', 'category'], axis=1)
df.head(3)

Unnamed: 0,hate,preprocessed_sentence
0,1,যত্তসব পাপন শালার ফাজলামী
1,1,পাপন শালা রে রিমান্ডে দরকার
2,1,জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ ভাবতে প...


In [13]:
df.isnull().sum()

hate                     0
preprocessed_sentence    0
dtype: int64

In [14]:
df['hate'] = df['hate'].fillna(-1)

In [15]:
df.isnull().sum()

hate                     0
preprocessed_sentence    0
dtype: int64

In [16]:
X = df['preprocessed_sentence']
y = df['hate']

In [17]:
len(X), len(y)

(30000, 30000)

In [18]:
y[15000:15010]

15000    0
15001    0
15002    0
15003    0
15004    0
15005    0
15006    0
15007    0
15008    0
15009    0
Name: hate, dtype: int64

In [21]:
from sklearn.utils import shuffle

In [22]:
X, y = shuffle(X, y, random_state=42)

In [23]:
y[15000:15010]

12842    0
2861     1
28840    0
14100    0
10723    0
10375    0
8801     1
17295    0
3322     1
9357     1
Name: hate, dtype: int64

In [24]:
y[15001]

0

In [25]:
y_true = y.copy()
y[5000:] = -1

In [26]:
y[20:130]

17636    0
3931     1
8387     1
8028     1
9482     1
        ..
19637    0
18294    0
2876     1
2982     1
17410    0
Name: hate, Length: 110, dtype: int64

In [30]:
y[15000:15010]

12842   -1
2861    -1
28840   -1
14100   -1
10723   -1
10375   -1
8801    -1
17295   -1
3322    -1
9357    -1
Name: hate, dtype: int64

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle

In [33]:
import numpy as np

In [34]:
x_values = np.arange(0.4, 1.05, 0.05)
x_values = np.append(x_values, 0.99999)

In [35]:
base_classifier = SVC(probability=True, gamma=0.001, random_state=42)

In [None]:
for i, threshold in enumerate(x_values):
    self_training_clf = SelfTrainingClassifier(base_classifier, threshold=threshold)
    
    st_clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('st', self_training_clf)         
    ])

    # We need manual cross validation so that we don't treat -1 as a separate
    # class when computing accuracy
    skfolds = StratifiedKFold(n_splits=3)
    for fold, (train_index, test_index) in enumerate(skfolds.split(X, y)):
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]
        y_test_true = y_true[test_index]

        st_clf.fit(X_train, y_train)

        # The amount of labeled samples that at the end of fitting
#         amount_labeled[i, fold] = (
#             total_samples
#             - np.unique(self_training_clf.labeled_iter_, return_counts=True)[1][0]
#         )
        # The last iteration the classifier labeled a sample in
#         amount_iterations[i, fold] = np.max(self_training_clf.labeled_iter_)

        y_pred = st_clf.predict(X_test)
#         scores[i, fold] = accuracy_score(y_test_true, y_pred)
#         print(fold)
#         print(accuracy_score(y_test_true, y_pred))
#         print(len(y_train))
        print(fold)
        print(accuracy_score(y_test_true, y_pred))

0
0.3333
1
0.8333
2
0.8334
0
0.3333
1
0.8333
2
0.8334
0
0.3333
1
0.8333
2
0.8334
0
0.3333
1
0.8333
2
0.8334
0
0.3333
