# Wczytanie danych z ClearML


In [35]:
from google.colab import userdata


ClearML Template

In [36]:
!pip install clearml

from clearml import Task, Dataset

import pandas as pd
import os
import time


# API credentials are read from the Colab user secrets named 'a' and 's'
# and exported one after the other.
for _env_var, _secret_name in (
    ("CLEARML_API_ACCESS_KEY", 'a'),
    ("CLEARML_API_SECRET_KEY", 's'),
):
    os.environ[_env_var] = userdata.get(_secret_name)

# ClearML server endpoints (API, web UI, file storage).
os.environ.update({
    "CLEARML_API_HOST": "https://api.clear.ml",
    "CLEARML_WEB_HOST": "https://app.clear.ml",
    "CLEARML_FILES_HOST": "https://files.clear.ml",
})



In [37]:
# Look up the ClearML dataset by id, fetch a local copy and show where it is.
_dataset = Dataset.get(dataset_id="a5958fdaf8cd449bbadc2bf57e5e797b")
testDataPath = _dataset.get_local_copy()
print(testDataPath)

/root/.clearml/cache/storage_manager/datasets/ds_a5958fdaf8cd449bbadc2bf57e5e797b


In [38]:
# List the dataset folder and work out which file holds which split.
test_val_train_folder = os.listdir(testDataPath)

# Print every entry instead of indexing [0], [1], [2]: the folder may contain
# fewer (or more) than three files, which would raise IndexError or hide files.
for entry in test_val_train_folder:
    print(testDataPath + "/" + entry)

# Start from None so a missing split is reported here, instead of surfacing
# later as a NameError (or as a stale value left over from a previous run).
train_file = valid_file = test_file = None

for file in test_val_train_folder:
    if "train" in file:
        train_file = file
    elif "valid" in file:
        valid_file = file
    elif "test" in file:
        test_file = file

_missing_splits = [
    split_name
    for split_name, found in (("train", train_file), ("valid", valid_file), ("test", test_file))
    if found is None
]
if _missing_splits:
    raise FileNotFoundError(
        f"No file for split(s) {_missing_splits} in {testDataPath}: {test_val_train_folder}"
    )

/root/.clearml/cache/storage_manager/datasets/ds_a5958fdaf8cd449bbadc2bf57e5e797b/train_filtered.tsv
/root/.clearml/cache/storage_manager/datasets/ds_a5958fdaf8cd449bbadc2bf57e5e797b/valid_filtered.tsv
/root/.clearml/cache/storage_manager/datasets/ds_a5958fdaf8cd449bbadc2bf57e5e797b/test_filtered.tsv


In [39]:
# Load every split into its own DataFrame
columns = [
    "id", "label", "statement", "subjects", "speaker", "speaker_job", "state_info",
    "party_affiliation", "barely_true", "false", "half_true",
    "mostly_true", "pants_on_fire", "context"
]


def _read_split(file_name):
    """Read one headerless, tab-separated split file from the dataset folder."""
    return pd.read_csv(f"{testDataPath}/{file_name}", sep='\t', names=columns, header=None)


df_train = _read_split(train_file)
df_valid = _read_split(valid_file)
df_test = _read_split(test_file)

# Quick shape check of the three frames.
for _split_label, _split_frame in (("train:", df_train), ("valid:", df_valid), ("test:", df_test)):
    print(_split_label, _split_frame.shape)


train: (3681, 14)
valid: (432, 14)
test: (461, 14)


In [40]:
# Show the first row of the test split to sanity-check the parsed columns.
df_test.head(1)

Unnamed: 0,id,label,statement,subjects,speaker,speaker_job,state_info,party_affiliation,barely_true,false,half_true,mostly_true,pants_on_fire,context
0,11972.json,True,Building a wall on the U.S.-Mexico border will take literally years.,immigration,rick-perry,Governor,Texas,republican,30,30,42,23,18,Radio interview


# Preprocessing


### Importy

In [41]:
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import spacy

import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer


# Download the NLTK resources used by the tokenizer, stop-word list and
# lemmatizer set up in the following cells.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Stop words

In [42]:
# English stop words as a set; custom_tokenizer tests token membership against it.
stop_words = set(stopwords.words('english'))

In [43]:
#2 wersja z użyciem spacy

# nlp = spacy.load('en_core_web_sm')

#można też połączyć te dwie metody, robiąc listę z każdej, a potem .concat()

### Czyszczenie tekstu

In [44]:
def clean_text(text):
    """Normalize a raw statement for downstream tokenization / embedding.

    Steps, in order:
      1. strip HTML tags,
      2. pull out emoticons such as ``:)``, ``;-(``, ``=D``, ``:P``,
      3. lowercase, drop digits and punctuation, collapse whitespace,
      4. append the emoticons (without their ``-`` nose) after the text.

    HTML tags and emoticons are handled first because both consist of
    punctuation: once punctuation has been removed (or ``D``/``P`` have been
    lowercased) their patterns can no longer match.

    Args:
        text: the raw string. Any non-string value (e.g. NaN from an empty
            DataFrame cell) is treated as missing.

    Returns:
        The cleaned string; ``''`` for non-string input. Cleaning an already
        cleaned string returns it unchanged.
    """
    if not isinstance(text, str):
        return ''

    # Replace tags with a space so words separated only by a tag
    # (e.g. "a<br>b") do not get glued together.
    text = re.sub(r'<[^>]*>', ' ', text)

    emoticon_pattern = r'(?::|;|=)(?:-)?(?:\)|\(|D|P)'
    emojis = [found.replace('-', '') for found in re.findall(emoticon_pattern, text)]
    # Cut the emoticons out of the body so no stray "d"/"p" letters remain.
    text = re.sub(emoticon_pattern, ' ', text)

    text = text.lower()  # lowercase
    text = re.sub(r'\d+', '', text)  # remove all digits from text
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # collapse runs of whitespace

    return ' '.join([text] + emojis).strip()

### Lematyzacja

In [45]:
# Single lemmatizer instance; custom_tokenizer uses it when cfg["lemmatization"] is set.
lemmatizer = WordNetLemmatizer()

### Stemming

In [46]:
# Single stemmer instance; custom_tokenizer uses it when cfg["stemming"] is set.
porter = PorterStemmer()

### Tokenizacja + poprzednie funkcje

In [47]:
def custom_tokenizer(text, cfg):
    """Clean and tokenize `text` according to the switches in `cfg`.

    Args:
        text: raw input string; it is passed through `clean_text` first.
        cfg: dict with the keys
            "remove_stopwords"   - drop tokens found in `stop_words`,
            "lemmatization"      - lemmatize tokens with `lemmatizer`,
            "stemming"           - stem tokens with `porter`,
            "min_token_length"   - keep only tokens at least this long,
            "concat_to_sentence" - return one space-joined string instead
                                   of a list.

    Returns:
        A list of tokens, or a single string when cfg["concat_to_sentence"]
        is truthy.
    """
    text = clean_text(text)
    tokens = word_tokenize(text)

    # Stop words are filtered BEFORE lemmatization/stemming: the stop-word
    # list holds surface forms, and normalization can alter them (the Porter
    # stemmer turns "this" into "thi"), after which they would no longer
    # match the list and slip through.
    if cfg["remove_stopwords"]:
        tokens = [t for t in tokens if t not in stop_words]
    if cfg["lemmatization"]:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    if cfg["stemming"]:
        tokens = [porter.stem(token) for token in tokens]

    # The length filter runs last so it sees the final (normalized) tokens.
    tokens = [t for t in tokens if len(t) >= cfg["min_token_length"]]

    if cfg["concat_to_sentence"]:
        return " ".join(tokens)
    return tokens


# Metoda Embedding - SBERT

In [48]:
# Loaded SentenceTransformer models keyed by model name, so that embedding the
# train and test frames does not load the same model twice.
_SBERT_MODELS = {}


def embedding(df, cfg, model_name='all-MiniLM-L6-v2'):
  """Encode the 'content' column of `df` into sentence embeddings.

  Args:
    df: frame with a 'content' column of strings.
    cfg: dict whose 'embeding_type' entry selects the method; only 'SBERT'
      is implemented.
    model_name: SentenceTransformer model to use (defaults to the model
      that was previously hard-coded).

  Returns:
    Whatever `SentenceTransformer.encode` returns for the list of texts.

  Raises:
    ValueError: for an unsupported 'embeding_type'. Previously the function
      fell through and returned None, which only failed later in the
      similarity computation.
  """
  embedding_type = cfg['embeding_type']
  if embedding_type != 'SBERT':
    raise ValueError(
        f"Unsupported embeding_type {embedding_type!r}; only 'SBERT' is implemented"
    )

  if model_name not in _SBERT_MODELS:
    _SBERT_MODELS[model_name] = SentenceTransformer(model_name)
  model = _SBERT_MODELS[model_name]

  embeddings = model.encode(df['content'].tolist())
  return embeddings

# Budowa grafu

In [49]:
def buildgraph(cfg, train_df, embeddings=None):
  """Build the similarity graph over the training samples.

  Every row of `train_df` becomes a node (id = row position) carrying its
  'target' value as the `label` attribute. Two nodes are connected when the
  cosine similarity of their embeddings is at least cfg["cosine_treshold"];
  the similarity is stored as the edge `weight`.

  Args:
    cfg: dict providing "cosine_treshold".
    train_df: frame with a 'target' column, one row per training sample.
    embeddings: one embedding per row of `train_df`, in the same order.
      When omitted, the notebook-global `train_embeddings` is used so that
      existing two-argument calls keep working.

  Returns:
    The populated `networkx.Graph`.

  Raises:
    ValueError: if the number of embeddings differs from the number of rows.
  """
  if embeddings is None:
    # Backward compatibility with calls that rely on the global variable.
    embeddings = train_embeddings
  if len(embeddings) != len(train_df):
    raise ValueError(
        f"Got {len(embeddings)} embeddings for {len(train_df)} training rows"
    )

  G = nx.Graph()

  # nodes
  for i, label in enumerate(train_df['target'].tolist()):
    G.add_node(i, label=label)

  # Cosine similarity + threshold
  similarities = cosine_similarity(embeddings)
  threshold = cfg["cosine_treshold"]

  for i, row in enumerate(similarities):
    # Only columns j > i are inspected: each undirected pair is visited once
    # and self-loops are skipped. The threshold test is done per row on the
    # array instead of element by element in Python.
    for offset in (row[i + 1:] >= threshold).nonzero()[0]:
      j = i + 1 + int(offset)
      G.add_edge(i, j, weight=row[j])
  return G

# Funkcja klasyfikacji

In [50]:
def predict(emb, G, train_embeddings):
    """Classify one embedding by majority vote around its nearest graph node.

    The training sample most similar to `emb` (cosine similarity) is looked
    up, and that node together with its graph neighbours vote with their
    `label` attribute.

    Returns:
        1 when strictly more than half of the voters have a label equal to
        True, otherwise 0 (so a tie yields 0).
    """
    # 1. Most similar training node
    nearest = cosine_similarity([emb], train_embeddings)[0].argmax()

    # 2. The node itself plus its neighbours in the graph take part in the vote
    voters = [nearest, *G.neighbors(nearest)]
    votes_for_true = sum(
        1 for node in voters if G.nodes[node]['label'] == True  # noqa: E712
    )

    # 3. Strict majority decides
    return 1 if votes_for_true > len(voters) / 2 else 0



# Wywołanie Taska

In [51]:
# Settings of this run; connected to the ClearML task right after it is created.
cfg_preprocessing = {
    # token-level switches read by custom_tokenizer
    "tokenizer": True,  # NOTE(review): not read by any code visible in this notebook
    "lemmatization": True,
    "stemming": False,
    "remove_stopwords": True,
    "min_token_length": 3,  # drops tokens shorter than 3 characters
    "concat_to_sentence": True,  # True: join tokens back into one string, False: keep the token list
    # graph / embedding settings
    "cosine_treshold": 0.7,  # minimum cosine similarity for an edge in buildgraph
    "embeding_type": 'SBERT',
}

task_name = "GRAPH+SBERT"
task = Task.init(project_name="FakeNewsDetection", task_name=task_name)
task.connect(cfg_preprocessing)

# preprocessing
# Pick the label/statement columns by NAME and rename them to target/content.
# Unlike the positional iloc[:, [1, 2]], this also works when the cell is
# re-run on the already reduced frames (rename is then a no-op), and .copy()
# ensures the assignments below write to an independent frame, not a slice.
_column_map = {'label': 'target', 'statement': 'content'}
df_train = df_train.rename(columns=_column_map)[['target', 'content']].copy()
df_test = df_test.rename(columns=_column_map)[['target', 'content']].copy()

# text cleaning
df_train['content'] = df_train['content'].apply(clean_text)
df_test['content'] = df_test['content'].apply(clean_text)

# tokenization
# NOTE(review): train_tokens / test_tokens are not used by the embedding step
# below, which encodes the cleaned 'content' column - so the tokenizer
# switches in cfg_preprocessing do not influence the reported metrics.
# Confirm whether that is intended.
train_tokens = df_train['content'].apply(lambda x: custom_tokenizer(x, cfg_preprocessing))
test_tokens = df_test['content'].apply(lambda x: custom_tokenizer(x, cfg_preprocessing))

# embedding
train_embeddings = embedding(df_train, cfg_preprocessing)
test_embeddings = embedding(df_test, cfg_preprocessing)




# method
start_time = time.time()

# Build the graph from the training frame, then classify every test embedding.
G = buildgraph(cfg_preprocessing, df_train)

y_pred = [predict(emb, G, train_embeddings) for emb in test_embeddings]
# Ground truth as 0/1: 1 only for labels equal to True.
y_true = [int(label == True) for label in df_test['target'].tolist()]  # noqa: E712

end_time = time.time()
training_duration = end_time - start_time

# Report
# labels=[0, 1] pins class 0 -> 'FAKE' and class 1 -> 'TRUE'. Without it,
# classification_report derives the classes from the data and raises when
# only one class occurs in y_true/y_pred, because target_names has two entries.
_report_kwargs = {"labels": [0, 1], "target_names": ['FAKE', 'TRUE']}
report = classification_report(y_true, y_pred, output_dict=True, **_report_kwargs)

# Log the weighted-average metrics to ClearML, one scalar each.
logger = task.get_logger()
for _title, _metric_key in (("Precision", "precision"), ("Recall", "recall"), ("F1", "f1-score")):
    logger.report_scalar(_title, _title, value=report['weighted avg'][_metric_key], iteration=0)

print(classification_report(y_true, y_pred, **_report_kwargs))
print(f"Training duration: {training_duration} seconds")
# Close the task
task.close()

ClearML Task: created new task id=39fc7dd37b28425b87e429cffaf2adcd
ClearML results page: https://app.clear.ml/projects/25cc2e9801f0421ba3bf3ef6bcb791c7/experiments/39fc7dd37b28425b87e429cffaf2adcd/output/log
              precision    recall  f1-score   support

        FAKE       0.62      0.67      0.64       250
        TRUE       0.57      0.52      0.54       211

    accuracy                           0.60       461
   macro avg       0.60      0.59      0.59       461
weighted avg       0.60      0.60      0.60       461

Training duration: 10.863515377044678 seconds
