<a href="https://colab.research.google.com/github/FunmiSomoye-schl/fake_news/blob/main/Using_LLM_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence-transformers



In [2]:
import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, PrecisionRecallDisplay, precision_recall_curve, \
    roc_curve, roc_auc_score, average_precision_score, recall_score, precision_score

In [7]:
# Load the full scored-posts sheet and keep only the headline text and label columns.
main_df = pd.read_excel('scored_posts.xlsx')
data = main_df[['title', 'trustworthiness']]

# Load the train/validation splits; the first CSV column becomes the index.
# NOTE(review): presumably that index holds row labels of main_df (clean_text merges on it) — confirm.
train_df = pd.read_csv('train_data.csv', index_col=0)
val_df = pd.read_csv('val_data.csv', index_col=0)

In [5]:
def clean_text(df, main_data):
    """Attach the titles from ``main_data`` to a split and add an ASCII-only copy.

    The two frames are inner-joined on their indexes. Both must contain
    ``title`` and ``trustworthiness`` columns: the code relies on the
    ``_x``/``_y`` suffixes pandas adds to overlapping column names, taking the
    title from ``main_data`` (``title_y``) and the label from ``df``
    (``trustworthiness_x``).

    Parameters
    ----------
    df : pandas.DataFrame
        Data split whose index selects rows of ``main_data``.
    main_data : pandas.DataFrame
        Frame holding the original ``title`` and ``trustworthiness`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``title_y``, ``trustworthiness_x`` and ``title`` (the title with
        every non-ASCII character dropped), on a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If the merge does not produce ``title_y`` / ``trustworthiness_x``.
    """
    def _to_ascii(text):
        # Missing titles (NaN/None) have no .encode(); map them to an empty
        # string instead of crashing the whole cleaning step.
        if pd.isna(text):
            return ""
        return str(text).encode("ascii", errors="ignore").decode()

    merged = pd.merge(df, main_data, left_index=True, right_index=True)
    # .copy() so the column assignment below writes to an independent frame
    # rather than to a slice of ``merged`` (avoids SettingWithCopyWarning).
    cleaned = merged[['title_y', 'trustworthiness_x']].copy()
    cleaned['title'] = cleaned['title_y'].map(_to_ascii)
    return cleaned.reset_index(drop=True)

In [7]:
# Rebuild the train/validation frames with ASCII-only titles taken from `data`.
new_train = clean_text(train_df, data)
new_val = clean_text(val_df, data)

In [9]:
# init embedding models
# NOTE(review): the saved output of this cell is an OSError raised while loading the
# tokenizer — re-run on a fresh kernel and confirm the model loads before relying on
# the cells below.
bce_model = SentenceTransformer("maidalun1020/bce-embedding-base_v1")
# Second candidate model, currently disabled.
#bge_model = SentenceTransformer('BAAI/bge-large-zh-v1.5')



OSError: Can't load tokenizer for 'maidalun1020/bce-embedding-base_v1'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'maidalun1020/bce-embedding-base_v1' is the correct path to a directory containing all relevant files for a XLMRobertaTokenizerFast tokenizer.

In [None]:
# models = [bce_model, bge_model]
# model_names = ['bce', 'bge']

In [None]:
# Models to run and the short names used to build their embedding column names
# (get_embeddings pairs the two lists by position).
models = [bce_model]
model_names = ['bce']

In [None]:
def get_embeddings(df, models, model_names):
    """Return a copy of ``df`` with one embedding column per model.

    Every value of ``df.title`` is encoded by each model in ``models``; the
    vectors of model ``i`` are stored, one per row, in a column named
    ``'embeddings_' + model_names[i]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``title`` column of sentences to embed. It is not modified.
    models : sequence
        Objects exposing ``encode(sentences, normalize_embeddings=True)`` that
        return one vector per sentence.
    model_names : sequence of str
        Column-name suffixes, paired with ``models`` by position.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the embedding columns of *all* models. With an empty
        ``models`` list this is an unchanged copy of ``df``.
    """
    sentences = list(df.title)

    # Accumulate into one frame. Re-joining onto the original ``df`` inside the
    # loop would discard the columns of every model but the last one.
    new_df = df.copy()

    for index, model in enumerate(models):
        print(f'Running model {index}')

        embeddings = model.encode(sentences, normalize_embeddings=True) #extract embeddings

        # Assign row by row in order, on df's own index, so the vectors stay
        # attached to the right sentence even when the index is not 0..n-1.
        new_df['embeddings_' + model_names[index]] = pd.Series(
            list(embeddings), index=new_df.index
        )

    return new_df

In [22]:
# Embed every training title and persist the result.
# NOTE(review): the embedding column holds one array per row; CSV stores it as text, so it
# will presumably not load back as numeric arrays — consider a binary format. TODO confirm.
train_embedded = get_embeddings(new_train, models, model_names)
train_embedded.to_csv('train_embedded.csv')

Running model 0


In [23]:
# Show the embedded training frame (displayed as the cell's last expression).
train_embedded

Unnamed: 0,title_y,trustworthiness_x,title,embeddings_bce
0,I love how everyone pretends the bank crisis i...,1,I love how everyone pretends the bank crisis i...,"[0.011041831, 0.007619881, -0.019065885, -0.00..."
1,NS PC gov plan to cut municipal transfers to C...,1,NS PC gov plan to cut municipal transfers to C...,"[0.011692199, 0.04249098, -0.038172554, 0.0103..."
2,"UPDATE: Highway 97 between Quesnel, Williams L...",1,"UPDATE: Highway 97 between Quesnel, Williams L...","[0.016038975, -0.026142053, -0.0061300644, 0.0..."
3,Autopsy: 'Cop City' protester had hands raised...,1,Autopsy: 'Cop City' protester had hands raised...,"[-0.0026178525, -0.03128939, 0.009863625, 0.02..."
4,J&Jâ€™s Robotic Bronchoscopy System Could Help...,1,J&Js Robotic Bronchoscopy System Could Help Ph...,"[0.01371516, 0.03488297, -0.05995533, -0.00881..."
...,...,...,...,...
147419,Santa Rosa is among the worst cities in the US...,1,Santa Rosa is among the worst cities in the US...,"[0.05363857, -0.012566487, -0.0095930975, 0.00..."
147420,Dog attack suspends mail delivery to one commu...,1,Dog attack suspends mail delivery to one commu...,"[0.01620212, 0.016872272, 0.0021904628, 0.0537..."
147421,Ice Pick Headaches are not true migraines but ...,1,Ice Pick Headaches are not true migraines but ...,"[0.01188999, 0.027421158, -0.02140186, -0.0314..."
147422,TSA has Groped and Assaulted Us for 20 Years w...,0,TSA has Groped and Assaulted Us for 20 Years w...,"[-0.013036913, -0.011813243, -0.0024818473, 0...."


In [None]:
# Embed every validation title with the same models and persist the result.
# NOTE(review): the embedding column holds one array per row; CSV stores it as text, so it
# will presumably not load back as numeric arrays — consider a binary format. TODO confirm.
val_embedded = get_embeddings(new_val, models, model_names)
val_embedded.to_csv('val_embedded.csv')

In [None]:
# Show the embedded validation frame (displayed as the cell's last expression).
val_embedded

# ML

In [26]:
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import xgboost as xgb

In [27]:
def get_x_y(data, x_col, y_col):
    """Pull the feature selection and the target selection out of ``data``.

    Parameters
    ----------
    data : indexable (e.g. a pandas DataFrame)
        Container that supports ``data[key]`` lookups.
    x_col : key
        Key (or list of keys) selecting the features.
    y_col : key
        Key selecting the target.

    Returns
    -------
    tuple
        ``(data[x_col], data[y_col])``.
    """
    return data[x_col], data[y_col]

In [29]:
# Down-sample to 40% of each class so the class proportions are preserved.
# A fixed seed makes the sample (and every metric computed from models fitted on it)
# reproducible across kernel restarts; without it each run trains on different rows.
stratified_embedded = train_embedded.groupby('trustworthiness_x').apply(
    lambda group: group.sample(frac=0.40, random_state=42)
)
stratified_embedded.shape

(58969, 4)

In [30]:
# Split features/labels for the sampled training set and for the validation set.
str_cap_x, str_y = get_x_y(stratified_embedded, 'embeddings_bce', 'trustworthiness_x')
val_cap_x, val_y = get_x_y(val_embedded, 'embeddings_bce', 'trustworthiness_x')

# Each cell of the embedding column is one vector; stack them into a single
# array with one row per sample for scikit-learn.
str_cap_x_stacked = np.stack(str_cap_x)
val_x_stacked = np.stack(val_cap_x)

### SVC

In [31]:
# class_weight='balanced' reweights classes inversely to their frequency; the labels are
# heavily skewed towards class 1 (see the support counts in the training report).
svc = SVC(random_state=42, class_weight='balanced')

In [32]:
# Scale the embeddings, then fit the SVC, as a single pipeline.
svc_clf = Pipeline([
    ('scaler', RobustScaler()),
    ('SVC', svc)
])

svc_clf.fit(str_cap_x_stacked, str_y)

# Report on the same rows the model was fitted on; this measures fit, not generalization.
print('TRAIN')
str_y_pred = svc_clf.predict(str_cap_x_stacked)
print(classification_report(str_y, str_y_pred))

TRAIN
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      2442
           1       1.00      0.99      0.99     56527

    accuracy                           0.99     58969
   macro avg       0.89      0.99      0.94     58969
weighted avg       0.99      0.99      0.99     58969



In [None]:
# Evaluate the fitted SVC pipeline on the validation embeddings.
print("")
print('VAL')
val_y_pred = svc_clf.predict(val_x_stacked)
print(classification_report(val_y, val_y_pred))

### XGBoost

In [33]:
# XGBoost classifier constructed with no arguments, i.e. library-default hyperparameters.
xgb_model = xgb.XGBClassifier()

In [34]:
# Fit the encoder on the training labels only, then apply the same mapping to both
# splits so train and validation labels share one integer encoding.
le = LabelEncoder()
le = le.fit(str_y)

le_tr_y = le.transform(str_y)
le_val_y = le.transform(val_y)

In [35]:
# Same scaler + classifier layout as the SVC pipeline, with XGBoost as the estimator.
xgb_clf = Pipeline([
    ('scaler', RobustScaler()),
    ('XGB', xgb_model)
])

xgb_clf.fit(str_cap_x_stacked, le_tr_y)

# Report on the same rows the model was fitted on; this measures fit, not generalization.
# NOTE(review): this reuses the name `str_y_pred`, replacing the SVC training predictions.
print('TRAIN')
str_y_pred = xgb_clf.predict(str_cap_x_stacked)
print(classification_report(le_tr_y, str_y_pred))

TRAIN
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2442
           1       1.00      1.00      1.00     56527

    accuracy                           1.00     58969
   macro avg       1.00      1.00      1.00     58969
weighted avg       1.00      1.00      1.00     58969



In [None]:
# Evaluate the fitted XGBoost pipeline on the validation embeddings.
# NOTE(review): this reuses the name `val_y_pred`, replacing the SVC validation predictions.
print("")
print('VAL')
val_y_pred = xgb_clf.predict(val_x_stacked)
print(classification_report(le_val_y, val_y_pred))