In [7]:
import csv 
import pandas as pd
import numpy as np
import json
import re
from googletrans import Translator
from tqdm import tqdm
import requests


def libre_translate(text, source_lang='auto', target_lang='en',
                    url="https://libretranslate.de/translate", timeout=30):
    """
    Translate text using the LibreTranslate API.

    :param text: Text to be translated
    :param source_lang: Source language (default is auto-detect)
    :param target_lang: Target language (default is English 'en')
    :param url: Translate endpoint that receives the POST request
        (default is the libretranslate.de endpoint)
    :param timeout: Seconds to wait for the server; passed straight to
        requests.post so a stalled connection cannot block forever
    :return: Translated text, or the literal string "Translation error" when
        the server answers with a status code other than 200
    """
    payload = {
        "q": text,
        "source": source_lang,
        "target": target_lang,
        "format": "text"
    }
    headers = {"Content-Type": "application/json"}

    # Exceptions raised by requests (connection failure, timeout) are not
    # caught here and propagate to the caller.
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)

    if response.status_code != 200:
        return "Translation error"

    result = response.json()
    return result['translatedText']


In [3]:
def parse_csv(file_path):
    """
    Parse the fact-check CSV into a list of record dicts.

    The first row is treated as a header and skipped. For every other row,
    column 0 is converted to an int id, while columns 1 and 2 are picked apart
    with plain string splitting (they are not evaluated as Python literals).
    Blank rows are skipped and an empty file yields an empty list.

    :param file_path: Path to a UTF-8 encoded CSV file
    :return: List of dicts with the keys "id", "original_sentence",
        "translation_sentence", "language" and "fact_check_link"
    :raises ValueError: if column 0 of a row cannot be converted to int
    :raises IndexError: if a non-blank row lacks a column that is read
    """
    data = []

    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(reader, None)

        for row in reader:
            # csv.reader yields [] for blank lines; there is nothing to parse in them.
            if not row:
                continue

            id_ = int(row[0])
            claim = row[1]
            # Split once and reuse the pieces for every derived field.
            parts = claim.split(', ')
            has_second_part = len(parts) > 1

            # NOTE(review): splitting on ',' keeps only the text before the
            # first comma of the cell - confirm this truncation is intended.
            original_sentence = claim.split(',')[0].replace("(", "").replace("'", "")
            translation_sentence = parts[1].replace("'", "") if has_second_part else None
            # parts[-2] only exists when the cell contains at least one ", ";
            # fall back to None instead of raising IndexError.
            language_tag = (
                parts[-2].replace("[('", "").replace("'", "") if has_second_part else None
            )

            # NOTE(review): the guard counts the pieces of column 1 while the
            # value is taken from column 2 - kept as-is, confirm it is intended.
            fact_check_link = (
                row[2].split(', ')[-1].replace(")]", "").replace("[(", "").replace("'", "")
                if len(parts) > 2 else None
            )

            data.append({
                "id": id_,
                "original_sentence": original_sentence,
                "translation_sentence": translation_sentence,
                "language": language_tag,
                "fact_check_link": fact_check_link
            })

    return data


In [3]:
# Parse the fact-check CSV into a list of record dicts.
file_path = 'fact_checks.csv'
data = parse_csv(file_path)

# Persist the parsed records as indented JSON; ensure_ascii=False writes
# non-ASCII characters as-is instead of \u escapes.
with open('fact_checks.json', mode='w', encoding='utf-8') as jsonfile:
    json.dump(data, jsonfile, ensure_ascii=False, indent=4)


Columns of `posts.csv`: post_id, instances, ocr, verdicts, text

In [35]:
def clean_string(input_string):
    """
    Strip unwanted punctuation from a string and lower-case it.

    Characters matched by \\w (word characters, underscore included) and \\s
    (whitespace) are kept, together with the literal characters ( ) [ ] ' and
    the period. Everything else is removed. Whitespace is not collapsed.

    :param input_string: Text to clean
    :return: The filtered text in lower case
    """
    filtered = re.sub(r'[^\w\s\(\[\]\'\.)]', '', input_string)
    return filtered.lower()

def parse_ocr_response(ocr_response):
    """
    Pull a text fragment and the language/probability pairs out of an OCR string.

    :param ocr_response: Raw OCR string to search
    :return: Tuple (text, languages_with_probabilities); text is "" when the
        first pattern does not match, and the second item is a list of
        (language, float probability) tuples in order of appearance
    :raises ValueError: if a matched probability is not a valid float
        (the pattern accepts any run of digits and dots)
    """
    # NOTE(review): as written, the character class [^[('] ends at the first
    # ']', so this matches "[(" + one character that is not '[', '(' or "'" +
    # one or more ']'. That may not be the intended pattern - confirm before
    # relying on `text`. Behaviour is left unchanged here.
    main_text_hit = re.search(r"\[\(([^[(']]+)", ocr_response)
    if main_text_hit is None:
        text = ""
    else:
        text = main_text_hit.group(1)

    # Every "('<word>', <number>)" occurrence becomes a (language, probability) pair.
    languages_with_probabilities = []
    for lang, prob in re.findall(r"\('(\w+)', ([\d\.]+)\)", ocr_response):
        languages_with_probabilities.append((lang, float(prob)))

    return text, languages_with_probabilities



def translate_text(text, target_language='en'):
    """
    Translate text with googletrans.

    A new Translator object is created on every call.

    :param text: Text to be translated
    :param target_language: Destination language code (default is English 'en')
    :return: The `text` attribute of the translation result
    """
    return Translator().translate(text, dest=target_language).text

def libre_translate(text, source_lang='auto', target_lang='en',
                    url="https://libretranslate.de/translate", timeout=30):
    """
    Translate text using the LibreTranslate API.

    NOTE: a function with this name is also defined in the notebook's first
    cell; whichever definition is executed last is the one in effect.

    :param text: Text to be translated
    :param source_lang: Source language (default is auto-detect)
    :param target_lang: Target language (default is English 'en')
    :param url: Translate endpoint that receives the POST request
        (default is the libretranslate.de endpoint)
    :param timeout: Seconds to wait for the server; passed straight to
        requests.post so a stalled connection cannot block forever
    :return: Translated text, or the literal string "Translation error" when
        the server answers with a status code other than 200
    """
    payload = {
        "q": text,
        "source": source_lang,
        "target": target_lang,
        "format": "text"
    }
    headers = {"Content-Type": "application/json"}

    # Exceptions raised by requests (connection failure, timeout) are not
    # caught here and propagate to the caller.
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)

    if response.status_code != 200:
        return "Translation error"

    result = response.json()
    return result['translatedText']



def _extract_translated_ocr(ocr):
    """Pick a translated-text fragment out of a raw ocr cell, best effort."""
    # (delimiter, piece index) pairs tried in order; the first attempt whose
    # piece exists and is non-empty wins.
    attempts = ((", \"", 1), (", \'\"", 1), ("\'", 3))
    for delimiter, index in attempts:
        pieces = ocr.split(delimiter)
        if len(pieces) > index and pieces[index] != "":
            return pieces[index]
    # Nothing matched: use everything before the first , '" marker, which is
    # the whole string when that marker is absent.
    return ocr.split(", \'\"")[0]


def parse_facts_csv(file_path, max_rows=None):
    """
    Parse a posts CSV into a list of record dicts.

    The first row is treated as a header and skipped. Each remaining row is
    read positionally as id, instance, ocr, verdicts, text. A translated
    fragment is extracted from the raw ocr cell by string splitting, then both
    the ocr cell and the fragment are passed through clean_string.

    The earlier version reused one variable as both row counter and
    "fragment found" flag, so its ten-row cut-off could never trigger and all
    rows were always processed. That remains the default; pass max_rows to
    limit the number of parsed rows explicitly.

    :param file_path: Path to a UTF-8 encoded CSV file
    :param max_rows: Optional maximum number of rows to parse
        (default None parses the whole file)
    :return: List of dicts with the keys "id", "instance", "ocr",
        "translated_ocr", "verdicts" and "text"
    :raises ValueError: if column 0 of a row cannot be converted to int
    :raises IndexError: if a non-blank row has fewer than five columns
    """
    data = []

    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(reader, None)

        for row in tqdm(reader):
            if max_rows is not None and len(data) >= max_rows:
                break
            # csv.reader yields [] for blank lines; there is nothing to parse in them.
            if not row:
                continue

            id_ = int(row[0])
            instance = row[1]
            raw_ocr = row[2]
            verdicts = row[3]
            text = row[4]

            # Extract from the raw cell first: cleaning strips the quote and
            # comma characters the extraction splits on.
            translated_ocr = clean_string(_extract_translated_ocr(raw_ocr))
            ocr = clean_string(raw_ocr)

            data.append({
                "id": id_,
                "instance": instance,
                "ocr": ocr,
                "translated_ocr": translated_ocr,
                "verdicts": verdicts,
                "text": text
            })

    return data


# Parse the posts CSV into a list of record dicts.
data_post = parse_facts_csv("posts.csv")

# Persist the parsed records as indented JSON; ensure_ascii=False writes
# non-ASCII characters as-is instead of \u escapes.
with open('posts.json', mode='w', encoding='utf-8') as jsonfile:
    json.dump(data_post, jsonfile, ensure_ascii=False, indent=4)



24431it [00:00, 67820.88it/s]
