# Setting up virtual environment

Install Mambaforge https://mamba.readthedocs.io/en/latest/installation.html
If you're on MacOS you can run `brew install mambaforge`

Create a virtual environment for the project
`mamba create -n fakenews python=3.10`

Activate the env `mamba activate fakenews`

Install dependencies we'll need for the project
`mamba install -c huggingface transformers=4.26.0 datasets evaluate jupyterlab scikit-learn`



Run jupyterlab:
`jupyter lab`

In [13]:
conda install pytorch

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\Erick\anaconda3\envs\fakenews

  added / updated specs:
    - pytorch


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    future-0.18.2              |  py310haa95532_1         672 KB
    libuv-1.40.0               |       he774522_0         255 KB
    ninja-1.10.2               |       haa95532_5          14 KB
    ninja-base-1.10.2          |       h6d14046_5         255 KB
    pytorch-1.12.1             |cpu_py310h5e1f01c_0        80.7 MB
    ------------------------------------------------------------
                                           Total:        81.9 MB

The following NEW packages will be INSTALLED:

  future             pkgs/main/win-64::future-0.18.2-py310haa95532_1
  libuv              pkgs/main/win-64::libuv



  current version: 4.12.0
  latest version: 23.1.0

Please update conda by running

    $ conda update -n base -c defaults conda




In [1]:
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

ModuleNotFoundError: No module named 'torch'

In [1]:
from datasets import load_dataset, load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("liar")

Found cached dataset liar (C:/Users/Erick/.cache/huggingface/datasets/liar/default/1.0.0/479463e757b7991eed50ffa7504d7788d6218631a484442e2098dabbf3b44514)
100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 251.43it/s]


In [3]:
fake_csv = load_dataset("csv", data_files="news.csv")

Using custom data configuration default-16166e5dc31fa63c
Found cached dataset csv (C:/Users/Erick/.cache/huggingface/datasets/csv/default-16166e5dc31fa63c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 125.26it/s]


In [4]:
fake_csv

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 6335
    })
})

In [5]:
fake_csv_split = fake_csv["train"].train_test_split(test_size=0.1)

In [6]:
fake_csv_split

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 5701
    })
    test: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 634
    })
})

In [7]:
from transformers import AutoTokenizer

# Load DistilBERT tokenizer and tokenize (encode) the texts
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [8]:
fake_csv_split = fake_csv_split.map(lambda x: {"label": 1 if x["label"] == "FAKE" else 0})

100%|████████████████████████████████████████████████████████████████████████████| 5701/5701 [00:01<00:00, 5433.81ex/s]
100%|██████████████████████████████████████████████████████████████████████████████| 634/634 [00:00<00:00, 6832.71ex/s]


In [9]:
from transformers import AutoTokenizer
def tokenize(batch):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    tokenized_batch = tokenizer(batch['text'], padding=True, truncation=True, max_length=128)
    return tokenized_batch

In [10]:
fake_csv_split.map(tokenize, remove_columns=['Unnamed: 0', 'title', 'text'], batched=True, num_proc=4)

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 5701
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 634
    })
})

In [11]:
# Tokenize and encode the dataset


dataset_enc = fake_csv_split.map(tokenize, remove_columns=['Unnamed: 0', 'title', 'text'], batched=True, num_proc=4)

# Set dataset format for PyTorch
dataset_enc.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Check the output
print(dataset_enc["train"].column_names)

Loading cached processed dataset at C:\Users\Erick\.cache\huggingface\datasets\csv\default-16166e5dc31fa63c\0.0.0\6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-50286481a9f1f581.arrow
Loading cached processed dataset at C:\Users\Erick\.cache\huggingface\datasets\csv\default-16166e5dc31fa63c\0.0.0\6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-b1870c906afc97a0.arrow
Loading cached processed dataset at C:\Users\Erick\.cache\huggingface\datasets\csv\default-16166e5dc31fa63c\0.0.0\6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-a4dc9cc34e37af29.arrow
Loading cached processed dataset at C:\Users\Erick\.cache\huggingface\datasets\csv\default-16166e5dc31fa63c\0.0.0\6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-0e9cd019af4684c8.arrow
Loading cached processed dataset at C:\Users\Erick\.cache\huggingface\datasets\csv\default-16166e5dc31fa63c\0.0.0\6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e

['label', 'input_ids', 'attention_mask']


In [12]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

# Instantiate a data collator with dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification

# Load model from checkpoint\n",
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased",
                                                           num_labels=2)

Downloading (…)"pytorch_model.bin";: 100%|██████████████████████████████████████████| 268M/268M [00:16<00:00, 16.0MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassif

In [14]:
model.to("mps")

RuntimeError: PyTorch is not linked with support for mps devices

In [15]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(output_dir="test_trainer",  evaluation_strategy="epoch")

In [16]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_enc["train"],
    eval_dataset=dataset_enc["test"],
    compute_metrics=compute_metrics,
)

In [1]:
import json
from bs4 import BeautifulSoup
import requests
import lxml.html as lx

In [2]:
url = "https://www.foxnews.com/politics/ahead-biden-state-union-address-country-dissatisfied-multiple-crises"
response = requests.get(url)

In [3]:
response.raise_for_status

<bound method Response.raise_for_status of <Response [200]>>

In [4]:
text = response.text

In [21]:
soup = BeautifulSoup(response.content, "html.parser")
headline = soup.find_all(class_ = "headline")[0]
headline = str(headline)
headline_clean = headline.replace('<h1 class="headline">',"").replace("</h1>","")
headline_clean

'Ahead of Biden State of the Union address, country dissatisfied with state of the union after multiple crises'

In [98]:
text2 = soup.find_all(class_= "article-body")


In [65]:
html = lx.fromstring(response.text)
html

<Element html at 0x1e975523680>

In [66]:
string = html.xpath('//p')

In [67]:
len(string)
            


32

In [95]:
lst  = []   
for i in range(0, len(string)):
    article_part = html.xpath('//p')[i].text_content()
    article_part = str(article_part)
    article_part = article_part.replace("\n ","").replace("  ", "").replace("\'" , " ").replace("\xa0","")
    lst.append(article_part)           

In [156]:
#lst

In [97]:
article_full = "".join(lst)
article_full

' This material may not be published, broadcast, rewritten, or redistributed. ©2023 FOX News Network, LLC. All rights reserved. Quotes displayed in real-time or delayed by at least 15 minutes. Market data provided by Factset. Powered and implemented by FactSet Digital Solutions. Legal Statement. Mutual Fund and ETF data provided by Refinitiv Lipper. Former House Speaker Newt Gingrich joins  Fox & Friends  to discuss President Biden s political future and performance in office ahead of his State of the Union Address.President Biden will have a tough audience among the American people on Tuesday night when he delivers the annual State of the Union address.Speaking from the Capitol, Biden is expected to make the case to the American people in his second State of the Union speech that the country is in a strong and prosperous position now and moving forward.New polling finds that high percentages of the American people feel financially worse off since Biden took office and are unhappy with

In [110]:
url = "https://link.h-cdn.com/get"
header = {'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41'}
response = requests.get(url)
response.raise_for_status

<bound method Response.raise_for_status of <Response [403]>>

In [115]:
response = requests.get("https://link.h-cdn.com/get", params = {
    "Host": "link.h-cdn.com",
    "Origin" : "https://www.foxnews.com",
    })

response.raise_for_status

<bound method Response.raise_for_status of <Response [404]>>

In [117]:
response = requests.get("https://www.foxnews.com/politics")
response.raise_for_status

<bound method Response.raise_for_status of <Response [200]>>

In [155]:
#response.text

In [157]:
html = lx.fromstring(response.text)
lst_links = html.xpath('//h4/a/@href')
for i in range(0,len(lst_links)):
    if '/politics/' not in  lst_links[i]:
        lst_links[i] = ''
    else:
        continue



In [158]:
for j in lst_links[:]:
    if j == '':
        lst_links.remove(j)
lst_links

['/politics/faa-closes-airspace-montana-support-department-defense-activities',
 '/politics/dems-cite-ginni-thomas-mitch-mcconnell-reasons-impose-code-conduct-supreme-court-justices',
 '/politics/alaska-sen-murkowski-us-must-send-message-we-dont-tolerate-violation-us-airspace',
 '/politics/washington-post-corrects-comical-story-gop-rep-anna-paulina-luna-second-time',
 '/politics/johnson-warns-left-infiltrated-major-us-institutions-gop-warns-weaponization-of-government',
 '/politics/pentagon-says-us-detected-third-flying-object-alaska-day-shooting-canada',
 '/politics/republicans-react-third-object-being-shot-canada-unprecedented-challenge',
 '/politics/dems-cite-ginni-thomas-mitch-mcconnell-reasons-impose-code-conduct-supreme-court-justices',
 '/politics/china-mocks-biden-knocking-down-balloons-fighter-jets-hysterical-laughably-juvenile',
 '/politics/border-patrol-nabs-illegal-immigrants-smuggler-spotting-suv-driving-erratically-shredded-tire',
 '/politics/money-talks-rumored-president