In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
TV = TfidfVectorizer(max_features = 1500) 

In [2]:
import datasets

train_dataset = datasets.load_dataset('social_bias_frames',split="train[:2000]")
valid_dataset = datasets.load_dataset('social_bias_frames',split="validation[:1600]")
test_dataset = datasets.load_dataset('social_bias_frames',split="test[:1700]")

In [3]:
train_df = train_dataset.to_pandas()
train_df = train_df[train_df['offensiveYN'] != '']
train_df.loc[train_df['offensiveYN'] == '0.5', 'offensiveYN'] = '1.0'

train_dataset = datasets.Dataset.from_pandas(train_df)

val_df = valid_dataset.to_pandas()
val_df = val_df[val_df['offensiveYN'] != '']
val_df.loc[val_df['offensiveYN'] == '0.5', 'offensiveYN'] = '1.0'

val_dataset = datasets.Dataset.from_pandas(val_df)

test_df = test_dataset.to_pandas()
test_df = test_df[test_df['offensiveYN'] != '']
test_df.loc[test_df['offensiveYN'] == '0.5', 'offensiveYN'] = '1.0'

test_dataset = datasets.Dataset.from_pandas(test_df)

In [4]:
X_train = TV.fit_transform(train_dataset['post']).toarray()
X_val = TV.fit_transform(val_dataset['post']).toarray()
X_test = TV.fit_transform(test_dataset['post']).toarray()

In [5]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train=label_encoder.fit_transform(train_dataset['offensiveYN'])

label_encoder = LabelEncoder()
y_val=label_encoder.fit_transform(val_dataset['offensiveYN'])

label_encoder = LabelEncoder()
y_test=label_encoder.fit_transform(test_dataset['offensiveYN'])

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
dt = RandomForestClassifier()
dt.fit(X_train, y_train)

In [7]:
from sklearn.metrics import classification_report
y_pred = dt.predict(X_train)
print(classification_report(y_train , y_pred))

              precision    recall  f1-score   support

           0       0.77      0.69      0.73       383
           1       0.93      0.95      0.94      1575

    accuracy                           0.90      1958
   macro avg       0.85      0.82      0.83      1958
weighted avg       0.90      0.90      0.90      1958



In [9]:
y_pred = dt.predict(X_val)
print(classification_report(y_val, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.55      0.03      0.06       540
           1       0.66      0.99      0.79      1033

    accuracy                           0.66      1573
   macro avg       0.60      0.51      0.43      1573
weighted avg       0.62      0.66      0.54      1573



In [10]:
y_pred = dt.predict(X_test)
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.22      0.02      0.04       604
           1       0.63      0.96      0.76      1065

    accuracy                           0.62      1669
   macro avg       0.42      0.49      0.40      1669
weighted avg       0.48      0.62      0.50      1669



In [11]:
from sklearn. ensemble import AdaBoostClassifier
adb = AdaBoostClassifier(DecisionTreeClassifier())
adb.fit(X_train,y_train)

In [12]:
y_pred = adb.predict(X_train)
print(classification_report(y_train , y_pred))

              precision    recall  f1-score   support

           0       0.76      0.70      0.73       383
           1       0.93      0.95      0.94      1575

    accuracy                           0.90      1958
   macro avg       0.85      0.83      0.84      1958
weighted avg       0.90      0.90      0.90      1958



In [13]:
y_pred = adb.predict(X_val)
print(classification_report(y_val , y_pred))

              precision    recall  f1-score   support

           0       0.44      0.13      0.20       540
           1       0.67      0.92      0.77      1033

    accuracy                           0.65      1573
   macro avg       0.55      0.52      0.48      1573
weighted avg       0.59      0.65      0.57      1573



In [14]:
y_pred = adb.predict(X_test)
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.38      0.15      0.22       604
           1       0.64      0.86      0.73      1065

    accuracy                           0.60      1669
   macro avg       0.51      0.51      0.48      1669
weighted avg       0.55      0.60      0.55      1669



In [15]:
from sklearn import svm
from sklearn.metrics import classification_report

clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)

In [16]:
y_pred = clf.predict(X_train)
report = classification_report(y_train, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.84      0.40      0.54       383
           1       0.87      0.98      0.92      1575

    accuracy                           0.87      1958
   macro avg       0.86      0.69      0.73      1958
weighted avg       0.87      0.87      0.85      1958



In [17]:
y_pred = clf.predict(X_val)
report = classification_report(y_val, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.22      0.03      0.05       540
           1       0.65      0.94      0.77      1033

    accuracy                           0.63      1573
   macro avg       0.43      0.49      0.41      1573
weighted avg       0.50      0.63      0.52      1573



In [18]:
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.30      0.07      0.11       604
           1       0.63      0.91      0.75      1065

    accuracy                           0.61      1669
   macro avg       0.46      0.49      0.43      1669
weighted avg       0.51      0.61      0.52      1669

