In [1]:
import pandas as pd

In [3]:
df = pd.read_csv("cleaned_english_dataset.csv")

In [18]:
df.head()

Unnamed: 0,English Text,Bias,Text Embedding
0,chief of staff general herzi halevy the battle...,Unclear,"[-0.011526022, 0.02747922, -0.038142346, -0.05..."
1,israel is on high alert following the assassi...,Unbiased,"[0.026821168, -0.0059844996, -0.03425517, -0.0..."
2,special coverage of zeenews from ground zero ...,Biased against Palestine,"[0.015704844, -0.002337436, -0.022777194, -0.0..."
3,highlights of macrons speech during his meeti...,Unclear,"[-0.010238404, -0.009839685, -0.05177646, -0.0..."
4,i need the west to understand something bombs ...,Unbiased,"[-0.03662164, -0.04248401, -0.05110409, -0.047..."


In [19]:
df['Text Embedding'].head(10)
df['Text Embedding'].apply(type).value_counts()

Text Embedding
<class 'str'>    14456
Name: count, dtype: int64

In [20]:
import ast
import numpy as np

def parse_embedding_safe(x):
    # Must be string AND look like a list
    if not isinstance(x, str):
        return None
    
    x = x.strip()
    if not x.startswith('[') or not x.endswith(']'):
        return None

    try:
        vec = ast.literal_eval(x)
        return np.array(vec, dtype=np.float32)
    except Exception:
        return None

df['Text Embedding'] = df['Text Embedding'].apply(parse_embedding_safe)

In [21]:
before = len(df)
df = df[df['Text Embedding'].notnull()]
after = len(df)

print(f"Removed {before - after} rows with invalid embeddings")
print(f"Remaining rows: {after}")


Removed 36 rows with invalid embeddings
Remaining rows: 14420


In [22]:
type(df['Text Embedding'].iloc[0]), df['Text Embedding'].iloc[0].shape

(numpy.ndarray, (768,))

In [23]:
df.head()

Unnamed: 0,English Text,Bias,Text Embedding
0,chief of staff general herzi halevy the battle...,Unclear,"[-0.011526022, 0.02747922, -0.038142346, -0.05..."
1,israel is on high alert following the assassi...,Unbiased,"[0.026821168, -0.0059844996, -0.03425517, -0.0..."
2,special coverage of zeenews from ground zero ...,Biased against Palestine,"[0.015704844, -0.002337436, -0.022777194, -0.0..."
3,highlights of macrons speech during his meeti...,Unclear,"[-0.010238404, -0.009839685, -0.05177646, -0.0..."
4,i need the west to understand something bombs ...,Unbiased,"[-0.03662164, -0.04248401, -0.05110409, -0.047..."


In [30]:
X = np.stack(df['Text Embedding'].values).astype(np.float32)
Y = df['Bias'].values

In [31]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

num_classes = len(label_encoder.classes_)
print("Classes:", label_encoder.classes_)
print("Number of classes:", num_classes)

Classes: ['Biased against Israel' 'Biased against Palestine'
 'Biased against both Palestine and Israel' 'Biased against others'
 'Not Applicable' 'Unbiased' 'Unclear']
Number of classes: 7


In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)


In [38]:
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train
)
class_weight_dict = dict(zip(classes, class_weights))



In [39]:
sample_weights = np.array([class_weight_dict[label] for label in y_train])


In [40]:
type(df['Text Embedding'].iloc[0]), len(df['Text Embedding'].iloc[0])

(numpy.ndarray, 768)

In [41]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    objective='multi:softprob',
    num_class=num_classes,
    n_estimators=300,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(
    X_train,
    y_train,
    sample_weight=sample_weights
)


0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'multi:softprob'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.8
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [42]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = xgb_model.predict(X_test)

print(classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
    digits=4
))


                                          precision    recall  f1-score   support

                   Biased against Israel     0.5354    0.2804    0.3681       189
                Biased against Palestine     0.6412    0.6487    0.6449       686
Biased against both Palestine and Israel     0.0000    0.0000    0.0000         2
                   Biased against others     0.4000    0.0488    0.0870        41
                          Not Applicable     0.6000    0.1500    0.2400        20
                                Unbiased     0.5768    0.7452    0.6503      1205
                                 Unclear     0.3492    0.2470    0.2893       741

                                accuracy                         0.5492      2884
                               macro avg     0.4432    0.3029    0.3256      2884
                            weighted avg     0.5282    0.5492    0.5265      2884



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# BIAS CLASSIFICATION WITH FASTTEXT-LIKE APPROACH
# Using Pre-existing Text Embeddings
# ============================================================================

import pandas as pd
import numpy as np
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (classification_report, confusion_matrix, 
                             accuracy_score, f1_score, precision_score, recall_score)


# Models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

print("✓ All libraries imported successfully")

✓ All libraries imported successfully


In [3]:
df_raw = pd.read_csv("New_data_embeddings.csv")

In [32]:
# CELL 1: DATA PREPARATION
# ============================================================================

print("\n" + "="*70)
print("PHASE 1: DATA PREPARATION")
print("="*70)

# Parse embeddings from your df_clean
def parse_embedding(emb):
    """Convert string embeddings to numpy arrays"""
    if isinstance(emb, str):
        if emb.startswith("FAILED"):
            return None
        try:
            val = eval(emb)
            if len(val) == 0:
                return None
            return np.array(val)
        except:
            return None
    if emb is not None and len(emb) == 0:
        return None
    return np.array(emb) if emb is not None else None

print("\n[1/5] Parsing embeddings...")
df_raw['Text Embedding'] = df_raw['Text Embedding'].apply(parse_embedding)

# Drop rows with failed embeddings
df_raw = df_raw.dropna(subset=['Text Embedding']).reset_index(drop=True)

# Filter by dimension consistency
expected_dim = df_raw['Text Embedding'].iloc[0].shape[0]
print(f"   Expected embedding dimension: {expected_dim}")

valid_mask = df_raw['Text Embedding'].apply(lambda x: x.shape[0] == expected_dim)
n_dropped = (~valid_mask).sum()
if n_dropped > 0:
    print(f"   WARNING: Dropping {n_dropped} rows with incorrect dimensions")
    df_raw = df_raw[valid_mask].reset_index(drop=True)

print(f"\n[2/5] Final valid samples: {len(df_raw)}")



PHASE 1: DATA PREPARATION

[1/5] Parsing embeddings...
   Expected embedding dimension: 768

[2/5] Final valid samples: 7581


In [33]:
# CELL 2: PREPARE DATA FOR MODELING
# ============================================================================

print("\n[3/5] Creating feature matrix and labels...")

# Convert embeddings to matrix
X = np.stack(df_raw['Text Embedding'].values)
print(f"   Feature matrix shape: {X.shape}")

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_raw['final_label'])
target_names = label_encoder.classes_

print(f"\n   Label encoding:")
for idx, label in enumerate(target_names):
    count = (y == idx).sum()
    print(f"   {idx}: {label} ({count} samples)")


[3/5] Creating feature matrix and labels...
   Feature matrix shape: (7581, 768)

   Label encoding:
   0: Biased against Israel (78 samples)
   1: Biased against Palestine (1255 samples)
   2: Biased against both Palestine and Israel (10 samples)
   3: Biased against others (75 samples)
   4: Unbiased (3938 samples)
   5: Unclear (2225 samples)


In [6]:
# CELL 3: TRAIN-TEST SPLIT
# ============================================================================

print(f"\n[4/5] Splitting data (80% train, 20% test)...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

print(f"   Training set: {len(X_train)} samples")
print(f"   Test set: {len(X_test)} samples")
print(f"\n   Training set distribution:")
train_counts = Counter(y_train)
for idx in sorted(train_counts.keys()):
    print(f"   {target_names[idx]}: {train_counts[idx]}")


[4/5] Splitting data (80% train, 20% test)...
   Training set: 6064 samples
   Test set: 1517 samples

   Training set distribution:
   Biased against Israel: 62
   Biased against Palestine: 1004
   Biased against both Palestine and Israel: 8
   Biased against others: 60
   Unbiased: 3150
   Unclear: 1780


In [21]:
from imblearn.over_sampling import SMOTE

# CELL 4: APPLY BORDERLINE-SMOTE
# ============================================================================

print(f"\n[5/5] Applying Borderline-SMOTE for class balance...")
print(f"   Original training distribution: {dict(Counter(y_train))}")

# Determine appropriate k_neighbors based on smallest class
min_class_size = min(Counter(y_train).values())
k_neighbors = min(5, min_class_size - 1)

if k_neighbors < 1:
    print(f"   ⚠ Warning: Smallest class too small for SMOTE. Using original data.")
    X_train_balanced = X_train
    y_train_balanced = y_train
else:
    smote = SMOTE(random_state=42, k_neighbors=min(5, min_class_size - 1))
    
    try:
        X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
        print(f"   Balanced training distribution: {dict(Counter(y_train_balanced))}")
        print(f"   New training set size: {len(X_train_balanced)}")
    except Exception as e:
        print(f"   ⚠ SMOTE failed: {e}")
        print(f"   Using original unbalanced data")
        X_train_balanced = X_train
        y_train_balanced = y_train

print("\n✓ Data preparation complete!")

# Replace BorderlineSMOTE with regular SMOTE for a more aggressive approach


[5/5] Applying Borderline-SMOTE for class balance...
   Original training distribution: {np.int64(5): 1780, np.int64(4): 3150, np.int64(1): 1004, np.int64(3): 60, np.int64(0): 62, np.int64(2): 8}
   Balanced training distribution: {np.int64(5): 3150, np.int64(4): 3150, np.int64(1): 3150, np.int64(3): 3150, np.int64(0): 3150, np.int64(2): 3150}
   New training set size: 18900

✓ Data preparation complete!


In [22]:
# CELL 5: MODEL 1 - SVM (SUPPORT VECTOR MACHINE)
# ============================================================================

print("\n" + "="*70)
print("PHASE 2: MODEL TRAINING AND EVALUATION")
print("="*70)
print("\nMODEL 1: SUPPORT VECTOR MACHINE (SVM)")
print("-" * 70)

print("\n[Training SVM...]")
svm_model = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    class_weight='balanced',
    random_state=42,
    verbose=0
)

svm_model.fit(X_train_balanced, y_train_balanced)
y_pred_svm = svm_model.predict(X_test)

# Evaluation
print("\n" + "="*50)
print("SVM RESULTS:")
print("="*50)
print(classification_report(y_test, y_pred_svm, target_names=target_names, zero_division=0))

# Calculate metrics
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_f1_macro = f1_score(y_test, y_pred_svm, average='macro', zero_division=0)
svm_f1_weighted = f1_score(y_test, y_pred_svm, average='weighted', zero_division=0)

print(f"\nOverall Metrics:")
print(f"   Accuracy: {svm_accuracy:.4f}")
print(f"   F1-Score (Macro): {svm_f1_macro:.4f}")
print(f"   F1-Score (Weighted): {svm_f1_weighted:.4f}")


PHASE 2: MODEL TRAINING AND EVALUATION

MODEL 1: SUPPORT VECTOR MACHINE (SVM)
----------------------------------------------------------------------

[Training SVM...]

SVM RESULTS:
                                          precision    recall  f1-score   support

                   Biased against Israel       0.25      0.06      0.10        16
                Biased against Palestine       0.28      0.18      0.22       251
Biased against both Palestine and Israel       0.00      0.00      0.00         2
                   Biased against others       0.00      0.00      0.00        15
                                Unbiased       0.55      0.81      0.65       788
                                 Unclear       0.35      0.16      0.22       445

                                accuracy                           0.50      1517
                               macro avg       0.24      0.20      0.20      1517
                            weighted avg       0.44      0.50      0.44      

In [23]:
# CELL 6: MODEL 2 - RANDOM FOREST
# ============================================================================

print("\n" + "-"*70)
print("MODEL 2: RANDOM FOREST CLASSIFIER")
print("-" * 70)

print("\n[Training Random Forest...]")
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
    verbose=0
)

rf_model.fit(X_train_balanced, y_train_balanced)
y_pred_rf = rf_model.predict(X_test)

# Evaluation
print("\n" + "="*50)
print("RANDOM FOREST RESULTS:")
print("="*50)
print(classification_report(y_test, y_pred_rf, target_names=target_names, zero_division=0))

# Calculate metrics
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_f1_macro = f1_score(y_test, y_pred_rf, average='macro', zero_division=0)
rf_f1_weighted = f1_score(y_test, y_pred_rf, average='weighted', zero_division=0)

print(f"\nOverall Metrics:")
print(f"   Accuracy: {rf_accuracy:.4f}")
print(f"   F1-Score (Macro): {rf_f1_macro:.4f}")
print(f"   F1-Score (Weighted): {rf_f1_weighted:.4f}")

# Feature importance (top 20)
print("\n[Feature Importance Analysis...]")
importances = rf_model.feature_importances_
top_features_idx = np.argsort(importances)[-20:][::-1]
print(f"   Top 20 most important embedding dimensions:")
for i, idx in enumerate(top_features_idx[:10]):
    print(f"   Dimension {idx}: {importances[idx]:.6f}")



----------------------------------------------------------------------
MODEL 2: RANDOM FOREST CLASSIFIER
----------------------------------------------------------------------

[Training Random Forest...]

RANDOM FOREST RESULTS:
                                          precision    recall  f1-score   support

                   Biased against Israel       0.00      0.00      0.00        16
                Biased against Palestine       0.24      0.14      0.18       251
Biased against both Palestine and Israel       0.00      0.00      0.00         2
                   Biased against others       0.00      0.00      0.00        15
                                Unbiased       0.55      0.75      0.63       788
                                 Unclear       0.36      0.23      0.28       445

                                accuracy                           0.48      1517
                               macro avg       0.19      0.19      0.18      1517
                            we

In [29]:
# CELL 7: MODEL 3 - XGBOOST
# ============================================================================

print("\n" + "-"*70)
print("MODEL 3: XGBOOST CLASSIFIER")
print("-" * 70)

print("\n[Training XGBoost...]")

xgb_model = XGBClassifier(
    n_estimator=250,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    random_state=42,
    n_jobs=-1,
    verbosity=0
)

xgb_model.fit(X_train_balanced, y_train_balanced)
y_pred_xgb = xgb_model.predict(X_test)

# Evaluation
print("\n" + "="*50)
print("XGBOOST RESULTS:")
print("="*50)
print(classification_report(y_test, y_pred_xgb, target_names=target_names, zero_division=0))

# Calculate metrics
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_f1_macro = f1_score(y_test, y_pred_xgb, average='macro', zero_division=0)
xgb_f1_weighted = f1_score(y_test, y_pred_xgb, average='weighted', zero_division=0)

print(f"\nOverall Metrics:")
print(f"   Accuracy: {xgb_accuracy:.4f}")
print(f"   F1-Score (Macro): {xgb_f1_macro:.4f}")
print(f"   F1-Score (Weighted): {xgb_f1_weighted:.4f}")

print("\n✓ All models trained and evaluated!")



----------------------------------------------------------------------
MODEL 3: XGBOOST CLASSIFIER
----------------------------------------------------------------------

[Training XGBoost...]

XGBOOST RESULTS:
                                          precision    recall  f1-score   support

                   Biased against Israel       0.11      0.06      0.08        16
                Biased against Palestine       0.26      0.18      0.21       251
Biased against both Palestine and Israel       0.00      0.00      0.00         2
                   Biased against others       0.00      0.00      0.00        15
                                Unbiased       0.54      0.66      0.60       788
                                 Unclear       0.31      0.26      0.28       445

                                accuracy                           0.45      1517
                               macro avg       0.20      0.19      0.20      1517
                            weighted avg       0

In [30]:
## CELL 8: MODEL COMPARISON AND VISUALIZATION
# ============================================================================

print("\n" + "="*70)
print("PHASE 3: MODEL COMPARISON")
print("="*70)

# Compile results
results_df = pd.DataFrame({
    'Model': ['SVM', 'Random Forest', 'XGBoost'],
    'Accuracy': [svm_accuracy, rf_accuracy, xgb_accuracy],
    'F1-Macro': [svm_f1_macro, rf_f1_macro, xgb_f1_macro],
    'F1-Weighted': [svm_f1_weighted, rf_f1_weighted, xgb_f1_weighted]
})

print("\nComparative Results:")
print(results_df.to_string(index=False))

# Find best model
best_model_idx = results_df['F1-Weighted'].idxmax()
best_model_name = results_df.loc[best_model_idx, 'Model']
best_f1 = results_df.loc[best_model_idx, 'F1-Weighted']

print(f"\n🏆 Best Model: {best_model_name} ")



PHASE 3: MODEL COMPARISON

Comparative Results:
        Model  Accuracy  F1-Macro  F1-Weighted
          SVM  0.496374  0.199333     0.442931
Random Forest  0.479235  0.181941     0.439703
      XGBoost  0.450231  0.195632     0.429366

🏆 Best Model: SVM 


In [31]:
# FINAL SUMMARY
# ============================================================================

print("\n" + "="*70)
print("PIPELINE COMPLETE!")
print("="*70)
print(f"\nSummary:")
print(f"   Total samples processed: {len(df_raw)}")
print(f"   Training samples (balanced): {len(X_train_balanced)}")
print(f"   Test samples: {len(X_test)}")
print(f"   Number of classes: {len(target_names)}")
print(f"\n   Best model: {best_model_name}")
print(f"   Best F1-Score (Weighted): {best_f1:.4f}")
print(f"   Best Accuracy: {results_df.loc[best_model_idx, 'Accuracy']:.4f}")
print(f"\n   Models trained: SVM, Random Forest, XGBoost")
print(f"   Evaluation metrics: Accuracy, Precision, Recall, F1-Score")
print(f"   Visualization saved: model_comparison_final.png")
# Class distribution summary
print(f"\n   Class Distribution in Test Set:")
for idx, label in enumerate(target_names):
    count = (y_test == idx).sum()
    print(f"      {label}: {count}")

print("\n" + "="*70)


PIPELINE COMPLETE!

Summary:
   Total samples processed: 7581
   Training samples (balanced): 18900
   Test samples: 1517
   Number of classes: 6

   Best model: SVM
   Best F1-Score (Weighted): 0.4429
   Best Accuracy: 0.4964

   Models trained: SVM, Random Forest, XGBoost
   Evaluation metrics: Accuracy, Precision, Recall, F1-Score
   Visualization saved: model_comparison_final.png

   Class Distribution in Test Set:
      Biased against Israel: 16
      Biased against Palestine: 251
      Biased against both Palestine and Israel: 2
      Biased against others: 15
      Unbiased: 788
      Unclear: 445

