### Data Loading and Label Mapping

In [None]:
#IMPORTS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification,pipeline
from googlesearch import search
from bs4 import BeautifulSoup
import requests
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import csv
import os
import time
from random import uniform, choice
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from googlesearch import search


In [None]:
# Load the test dataset from CSV into a DataFrame.
df = pd.read_csv('Test_dataset(FINAL).csv')
# Bare expression: presumably rendered as the cell output when run in a notebook.
df

In [None]:
# Count how many rows carry each distinct value of the 'Label' column.
df['Label'].value_counts()

In [None]:
# Plot the per-label row counts as a horizontal bar chart.
df['Label'].value_counts().plot(kind='barh')

### Articles Pipeline

In [None]:
def get_top_links(query, num_links=1):
    """Return the top search-result links for ``query``.

    Args:
        query: Search string handed to ``search``.
        num_links: Number of results requested from ``search``.

    Returns:
        A list of the links yielded by ``search``. If the search raises,
        the error is printed and whatever was collected so far (possibly
        nothing) is returned.
    """
    collected = []
    try:
        # extend() consumes the results one by one, so links gathered before
        # a mid-iteration failure are still kept.
        collected.extend(search(query, num_results=num_links))
    except Exception as e:
        print(f"An error occurred: {e}")
    return collected


def get_title_and_content(search_query_results, timeout=10):
    """Download each URL and collect its heading and paragraph text.

    Args:
        search_query_results: Iterable of URLs to fetch. A falsy value
            (``None`` or an empty list) means nothing is fetched.
        timeout: Seconds to wait on each HTTP request before giving up, so
            one unresponsive site cannot stall the whole run.

    Returns:
        A ``(article_titles, article_content)`` tuple of two lists with the
        same length. Entry *i* of each list comes from the same page: the
        concatenated text of its ``<h1>`` tags and of its ``<p>`` tags.
        Pages that fail to download or parse are reported on stdout and
        skipped in both lists.
    """
    article_titles = []
    article_content = []
    if not search_query_results:
        return article_titles, article_content

    for url in search_query_results:
        try:
            # Send a request to the URL and get the HTML content
            response = requests.get(url, timeout=timeout)
            soup = BeautifulSoup(response.content, 'lxml')

            body_text = "".join(p.text for p in soup.find_all('p'))
            heading_text = "".join(h1.text for h1 in soup.find_all('h1'))
        except Exception as e:
            print(f"An error occurred: {e}")
            continue

        # Append both parts only after the page parsed completely, so the two
        # lists can never get out of step with each other.
        article_titles.append(heading_text)
        article_content.append(body_text)
    return article_titles, article_content


# Search for a query and tabulate the scraped titles and contents
def make_data(search_query_results):
    """Build a DataFrame of scraped articles for a search query.

    Despite its name, ``search_query_results`` is the query passed to
    ``get_top_links``; the links it returns are then scraped by
    ``get_title_and_content``.

    Returns:
        A DataFrame with a 'Title' and a 'Content' column, one row per
        scraped page.
    """
    links = get_top_links(search_query_results)
    titles, contents = get_title_and_content(links)
    return pd.DataFrame({'Title': titles, 'Content': contents})

In [None]:
# Scrape articles for a headline, for use by the FactCC step
def dataframegen(text_input):
    """Return the scraped Title/Content DataFrame for ``text_input``.

    The frame comes from ``make_data``; rows containing missing values are
    removed in place before it is returned.
    """
    frame = make_data(text_input)
    frame.dropna(inplace=True)
    return frame

In [None]:
# Build the FactCC text-classification pipeline once; factCC reuses it.
pipe = pipeline(
    task="text-classification",
    model="manueldeprada/FactCC",
    max_length=512,
)

In [None]:
def factCC(input_headline: str):
    """Check a headline against scraped article text using the FactCC pipeline.

    Articles for the headline are scraped via ``dataframegen``; the one with
    the longest content is used as the source text and the headline as the
    claim.

    Args:
        input_headline: The headline (claim) to verify.

    Returns:
        A ``(decision, scraped_content)`` tuple. ``decision`` is ``True``
        when the pipeline labels the pair ``'CORRECT'`` and ``False``
        otherwise. When no usable article was retrieved (nothing scraped,
        empty content, or a '403 Forbidden' page) the result is
        ``(False, "")`` so callers can always unpack two values.
    """
    scraped_df = dataframegen(input_headline)

    if scraped_df.empty:
        # Could not retrieve articles related to the headline: possibly a
        # false claim, or the scraper got blocked.
        return False, ""

    # Convert the 'Content' column to strings
    scraped_df['Content'] = scraped_df['Content'].astype(str)

    # Sort so the article with the longest content comes first
    scraped_df = scraped_df.sort_values(by='Content', key=lambda x: x.str.len(), ascending=False)

    # Positional access: sort_values keeps the original index labels, so a
    # label lookup of 0 would ignore the ordering just established.
    longest = scraped_df.iloc[0]
    title = longest['Title']
    content = longest['Content']

    if not content or '403 Forbidden' in content or '403 Forbidden' in title:
        # Empty page or the scraper was refused
        return False, ""

    scraped_content = f"{title} \n{content}"

    # Perform text classification [source,claim]
    ans = pipe([[[scraped_content, input_headline]]], truncation=True, padding='max_length')

    return ans[0]['label'] == 'CORRECT', scraped_content

In [None]:
# Sanity-check factCC on the first row of the dataset.
outcome = factCC(df['Headline'][0])
# Only the boolean decision is comparable with the label; accept either a
# (decision, content) tuple or a bare decision.
model_decision = outcome[0] if isinstance(outcome, tuple) else outcome
actual_decision = df['Label'][0]
print(model_decision,actual_decision)
if model_decision == actual_decision:
    print('Correct Decision')
else:
    print('Incorrect Decision')

In [None]:
# Summary statistics for df's columns.
df.describe()

In [None]:

# Run factCC over every headline in df, stream the per-row results to a CSV
# in batches, then append accuracy / precision / recall / F1 to the same file.
# Assuming df and factCC are already defined

predictions = []
actuals = []
results = []
batch_size = 10

csv_filename = 'article_pipeline.csv'


def _flush_rows(writer, handle, rows):
    """Write the buffered rows and force them out to disk."""
    writer.writerows(rows)
    handle.flush()  # Push Python's buffer to the OS
    os.fsync(handle.fileno())  # Ask the OS to commit it to disk


# Delete the file if it already exists
if os.path.exists(csv_filename):
    os.remove(csv_filename)
    print(f"Existing {csv_filename} has been deleted.")

try:
    # UTF-8 is requested explicitly: scraped article text may contain
    # characters the platform's default encoding cannot represent.
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Index', 'Headline','Scraped Content',  'Model Decision', 'Actual Decision'])

        # Iterate over each row in the dataframe with tqdm for progress
        for index, row in tqdm(df.iterrows(), total=len(df)):
            try:
                outcome = factCC(row['Headline'])
                # Accept a bare decision as well as a (decision, content)
                # tuple, so rows without scraped content still count
                # towards the metrics instead of failing to unpack.
                if isinstance(outcome, tuple):
                    model_decision, scraped_content = outcome
                else:
                    model_decision, scraped_content = outcome, ''

                actual_decision = row['Label']
                predictions.append(model_decision)
                actuals.append(actual_decision)

                results.append([index, row['Headline'],scraped_content,model_decision, actual_decision])
                time.sleep(2)  # Pause between rows to throttle search/scrape requests
                # Write to CSV in batches of batch_size
                if len(results) % batch_size == 0:
                    _flush_rows(csvwriter, csvfile, results)
                    results = []

            except Exception as e:
                print(f"Error processing row {index}: {str(e)}")

        # Write any remaining results
        if results:
            _flush_rows(csvwriter, csvfile, results)

    # Calculate the metrics
    accuracy = accuracy_score(actuals, predictions)
    precision = precision_score(actuals, predictions, pos_label=True)
    recall = recall_score(actuals, predictions, pos_label=True)
    f1 = f1_score(actuals, predictions, pos_label=True)

    # Append the metrics to the CSV file
    with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow([])  # Add an empty row for separation
        csvwriter.writerow(['Metric', 'Value'])
        csvwriter.writerow(['Accuracy', accuracy])
        csvwriter.writerow(['Precision', precision])
        csvwriter.writerow(['Recall', recall])
        csvwriter.writerow(['F1 Score', f1])

    print(f'Results written to {csv_filename}')
    print(f'Final file size: {os.path.getsize(csv_filename)} bytes')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')

except Exception as e:
    print(f"An error occurred: {str(e)}")

finally:
    if os.path.exists(csv_filename):
        print(f"CSV file exists. Size: {os.path.getsize(csv_filename)} bytes")
    else:
        print("CSV file does not exist.")