# The training set is used to build the graph; the test set supplies the queries.
# When experimenting with similarity measures, change them in both the graph construction and the prediction function.

#Importy

In [37]:
import pandas as pd
import numpy as np
import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt
from string import punctuation
import re


import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


from sentence_transformers import SentenceTransformer

import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity


from sklearn.metrics import classification_report


# Loading the data and keeping only the content and target (true/false) columns

In [20]:
# Read both tab-separated splits and keep only the columns at positions 1 and 2,
# which are renamed below to 'target' and 'content'.
# NOTE(review): the train path is relative while the test path is absolute
# under /content — confirm both resolve to the same directory.
train_df = pd.read_csv('train_filtered.tsv', sep='\t').iloc[:, [1, 2]]
test_df = pd.read_csv('/content/test_filtered.tsv', sep='\t').iloc[:, [1, 2]]

for frame in (train_df, test_df):
    frame.columns = ['target', 'content']

# Print a handful of training statements as a quick sanity check.
for row_label in (1, 10, 100, 1000):
    print(train_df.loc[row_label, 'content'])

The Chicago Bears have had more starting quarterbacks in the last 10 years than the total number of tenured (UW) faculty fired during the last two decades.
Austin is a city that has basically doubled in size every 25 years or so since it was founded.
Says he never said he would keep education funding the same.
For more than 30 years after World War II, there was a steady reduction in U.S. debt as a percentage of gross domestic product.


In [21]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3680 entries, 0 to 3679
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   target   3680 non-null   bool  
 1   content  3680 non-null   object
dtypes: bool(1), object(1)
memory usage: 32.5+ KB


In [22]:
train_df.head()

Unnamed: 0,target,content
0,False,Health care reform legislation is likely to ma...
1,True,The Chicago Bears have had more starting quart...
2,False,When Mitt Romney was governor of Massachusetts...
3,True,"""McCain opposed a requirement that the governm..."
4,False,Women and men both are making less when you ad...


# Czyszczenie tekstu

In [23]:
# Patterns are compiled once at definition time. Raw strings keep `\d` / `\W`
# from being parsed as (invalid) string escapes, and re.escape makes every
# punctuation character literal inside the character class.
_DIGIT_RE = re.compile(r'\d')
_HTML_TAG_RE = re.compile(r'<[^>]*>')
_NON_WORD_RE = re.compile(r'\W+')
_PUNCTUATION_RE = re.compile('[{}]'.format(re.escape(punctuation)))


def clean_text(text):
    """Normalise a raw statement into lowercase, space-separated word characters.

    Steps, in order: lowercase, drop digits, drop ``<...>`` tags, replace every
    run of non-word characters with a single space, remove remaining
    punctuation (in practice only underscores survive the previous step, so
    ``a_b`` becomes ``ab``), and strip leading/trailing whitespace.

    Emoticons are not preserved. The earlier version collected them and
    appended them to the text, but the punctuation removal that followed
    erased them again, so that step had no effect on the result and has been
    dropped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    cleaned = _DIGIT_RE.sub('', text.lower())
    cleaned = _HTML_TAG_RE.sub('', cleaned)
    cleaned = _NON_WORD_RE.sub(' ', cleaned)
    cleaned = _PUNCTUATION_RE.sub('', cleaned)
    return cleaned.strip()

# Clean the statement text of both splits in place.
for frame in (train_df, test_df):
    frame['content'] = frame['content'].apply(clean_text)


# Show the same sample rows again, now after cleaning.
for row_label in (1, 10, 100, 1000):
    print(train_df.loc[row_label, 'content'])

the chicago bears have had more starting quarterbacks in the last years than the total number of tenured uw faculty fired during the last two decades
austin is a city that has basically doubled in size every years or so since it was founded
says he never said he would keep education funding the same
for more than years after world war ii there was a steady reduction in u s debt as a percentage of gross domestic product


# Dalszy preprocesing

In [86]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re

# Download the NLTK data packages, in this order.
for resource in ('stopwords', 'wordnet', 'punkt', 'omw-1.4', 'punkt_tab'):
    nltk.download(resource)

# Pipeline: text -> tokens -> stop-word removal -> stemming -> text.
# (Lemmatisation would be an alternative to stemming; use one or the other.)
_PREPROCESSING_CACHE = {}


def _preprocessing_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use."""
    if not _PREPROCESSING_CACHE:
        # Built lazily so the stop-word corpus is only read when first needed,
        # and only once instead of once per processed row.
        _PREPROCESSING_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESSING_CACHE['stemmer'] = PorterStemmer()
    return _PREPROCESSING_CACHE['stop_words'], _PREPROCESSING_CACHE['stemmer']


def preprocesing(text):
    """Tokenise ``text``, drop English stop words and Porter-stem the rest.

    Args:
        text: The string to process.

    Returns:
        The stemmed tokens joined by single spaces, with surrounding
        whitespace stripped.
    """
    stop_words, stemmer = _preprocessing_resources()

    tokens = nltk.word_tokenize(text)
    stemmed = [stemmer.stem(token) for token in tokens if token not in stop_words]

    # Collapse any whitespace runs and trim the ends.
    return re.sub(r'\s+', ' ', ' '.join(stemmed)).strip()

# Run the tokenise / stop-word / stemming step on both splits in place.
for frame in (train_df, test_df):
    frame['content'] = frame['content'].apply(preprocesing)

# Show the sample rows once more, now fully preprocessed.
for row_label in (1, 10, 100, 1000):
    print(train_df.loc[row_label, 'content'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


chicago bear start quarterback last year total number tenur uw faculti fire last two decad
austin citi basic doubl size everi year sinc found
say never said would keep educ fund
year world war ii steadi reduct u debt percentag gross domest product


# Embeding

In [87]:
# A simplified BERT in its sentence variant (SBERT): one embedding is
# computed per whole statement.
model = SentenceTransformer('all-MiniLM-L6-v2')

train_sentences = train_df['content'].tolist()
train_embeddings = model.encode(train_sentences, show_progress_bar=True)

test_sentences = test_df['content'].tolist()
test_embeddings = model.encode(test_sentences, show_progress_bar=True)

Batches:   0%|          | 0/115 [00:00<?, ?it/s]

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

In [88]:
# Inspect the raw embedding matrices (train first, then test).
for embedding_matrix in (train_embeddings, test_embeddings):
    print(embedding_matrix)

[[-0.07421422  0.08181737 -0.04104767 ... -0.11797588  0.03728567
  -0.00047243]
 [ 0.00606985 -0.03707303 -0.04901601 ... -0.04378171  0.03483156
  -0.03031084]
 [-0.05112187 -0.04253879  0.09698122 ... -0.03999893  0.03693427
   0.0420335 ]
 ...
 [-0.02296975  0.13573018  0.07294773 ... -0.03684257 -0.01168303
   0.04074784]
 [-0.06762283  0.02258461  0.02360234 ... -0.08929332 -0.05846737
   0.02759581]
 [-0.01471788  0.05923892  0.00274643 ... -0.04983423  0.01120474
  -0.07089802]]
[[-0.08768135  0.02064923 -0.0145565  ... -0.08786563 -0.0129094
  -0.05189918]
 [ 0.02313784  0.11372625 -0.01650107 ... -0.01663158  0.05880687
  -0.03697668]
 [-0.04170997  0.01226357  0.0523757  ... -0.04361208  0.05641093
  -0.00990743]
 ...
 [-0.00174266  0.10170395  0.02003054 ... -0.01035058  0.0349028
   0.07595128]
 [ 0.11687487 -0.0585607  -0.01706447 ... -0.03287379 -0.00528191
   0.02340796]
 [ 0.00942336  0.00499753  0.06510504 ... -0.04689918  0.06042946
   0.00768124]]


# Budowa grafu

In [89]:

G = nx.Graph()

# Nodes: one per training row, keyed by row position and carrying that row's
# 'target' value as the 'label' attribute.
G.add_nodes_from(
    (node, {'label': label})
    for node, label in enumerate(train_df['target'].tolist())
)

# Edges: pairwise cosine similarity between training embeddings, thresholded.
similarities = cosine_similarity(train_embeddings)
threshold = 0.7

# Select qualifying pairs with NumPy instead of a Python-level double loop.
# The strict upper triangle (k=1) yields each unordered pair (i < j) exactly
# once and excludes self-similarity on the diagonal.
rows, cols = np.nonzero(np.triu(similarities >= threshold, k=1))
G.add_weighted_edges_from(
    zip(rows.tolist(), cols.tolist(), similarities[rows, cols].tolist())
)

# Predykcja

In [90]:
# Takes a query embedding, finds the most similar training node, and lets that
# node together with its direct graph neighbours decide true/false by majority.

def predict(emb, G, train_embeddings):
    """Classify one embedding by majority vote around its nearest graph node.

    Args:
        emb: Embedding vector of the query statement.
        G: Graph whose nodes carry a 'label' attribute.
        train_embeddings: Embeddings compared against ``emb``; the index of
            the best match is used as a node id in ``G``.

    Returns:
        1 when strictly more than half of the voting nodes have a label equal
        to True, otherwise 0 (so a tie yields 0).
    """
    # Step 1: the training node most similar to the query.
    similarity_row = cosine_similarity([emb], train_embeddings)[0]
    anchor = similarity_row.argmax()

    # Step 2: the anchor and its direct neighbours form the electorate.
    voters = [anchor, *G.neighbors(anchor)]
    labels = [G.nodes[node]['label'] for node in voters]

    # Step 3: count the votes; the equality comparison against True is kept
    # deliberately so the counting rule is unchanged.
    votes_for_true = sum(1 for label in labels if label == True)

    return 1 if votes_for_true > len(labels) / 2 else 0


print(predict(test_embeddings[1], G, train_embeddings))


[]
0


#Ewaluacja

In [91]:
from sklearn.metrics import classification_report

# Ground truth as 0/1 integers (1 where the target equals True).
y_true = [int(flag == True) for flag in test_df['target'].tolist()]

# One prediction per test embedding, in test-set order.
y_pred = []
for query_embedding in test_embeddings:
    y_pred.append(predict(query_embedding, G, train_embeddings))

print(y_pred)

report = classification_report(y_true, y_pred, target_names=['FAKE', 'TRUE'])
print(report)

# Class balance of the ground truth.
print(pd.Series(y_true).value_counts())

[3666]
[]
[]
[]
[677, 2815, 3134]
[]
[1922]
[]
[2148]
[1279, 3412]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[641]
[]
[1148]
[]
[3431]
[1924]
[96]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[1415, 2040, 2561]
[]
[]
[]
[]
[]
[]
[12, 600]
[]
[]
[893, 3069]
[]
[]
[]
[]
[]
[]
[756, 1197]
[]
[]
[]
[]
[211]
[]
[1861]
[]
[1885, 3504, 3567]
[]
[]
[]
[]
[]
[]
[2548]
[]
[]
[2023, 3434]
[]
[]
[]
[]
[1970]
[2548]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[1628]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[2911]
[]
[]
[]
[]
[]
[]
[]
[]
[2752, 2866]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[2642, 2776]
[1148]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[1132]
[]
[]
[]
[]
[]
[]
[910]
[]
[]
[]
[]
[]
[1236]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[3451]
[]
[]
[]
[]
[]
[]
[]
[2264]
[]
[2437]
[]
[]
[3098]
[]
[]
[2371]
[]
[2148]
[]
[]
[]
[]
[]
[908, 2497]
[]
[]
[]
[]
[]
[]
[599]
[]
[]
[]
[]
[]
[]
[]
[1737, 3592]
[433]
[1648]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[690, 3041]
[3572]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[