In [None]:
import textstat
from goose3 import Goose
from openpyxl import Workbook
from openpyxl import load_workbook
import pandas as pd
import glob
import os
import re
import numpy as np

In [3]:
def is_readable(sentence, threshold=30):
    """
    Decide whether a sentence is readable using its Flesch Reading Ease score.
    A higher score indicates easier readability.
    Args:
        sentence (str): The sentence to check.
        threshold (float): Score the sentence must strictly exceed to be
            considered readable. Defaults to 30, a value set by trial and error.
    Returns:
        bool: True if the score is strictly greater than ``threshold``, False otherwise.
    """
    score = textstat.flesch_reading_ease(sentence)
    return score > threshold

def scrape_website_with_goose3(url=None, input_text=None, output_file='scraped_content.xlsx'):
    """
    Extract text from a website (or take raw text) and save its readable sentences to Excel.

    The text is split on '.', each piece is stripped, and pieces that are
    non-empty and pass ``is_readable`` are written one per row into column 1
    of a sheet titled "Scraped Content".
    Args:
        url (str): The URL of the website to scrape. Takes precedence over ``input_text``.
        input_text (str): The input text to process if no URL is provided.
        output_file (str): The name of the output Excel file.
    Returns:
        None
    """
    if url:
        # Goose is only needed for the URL path; close it once the article has
        # been extracted so its resources are released even if extraction fails.
        g = Goose()
        try:
            content = g.extract(url=url).cleaned_text
        finally:
            g.close()
    elif input_text:
        content = input_text  # If input text is provided, use it directly
    else:
        print("No URL or input text provided.")
        return

    # Create a new Excel workbook and sheet
    wb = Workbook()
    ws = wb.active
    ws.title = "Scraped Content"

    # Write each readable sentence to the next free row of column 1
    row = 1
    for sentence in content.split('.'):
        sentence = sentence.strip()
        if sentence and is_readable(sentence):  # Ensure the sentence is not empty and is readable
            ws.cell(row=row, column=1, value=sentence)
            row += 1

    # Save the workbook to a file
    wb.save(output_file)
    print(f"Content has been scraped and saved to {output_file}")

# Example usage:
# scrape_website_with_goose3(url='https://en.wikipedia.org/wiki/Garri#Variations')


In [4]:
def merge_files(country_name, folder_path):
    """
    Merge all Excel files for a specific country into one file.

    Files matching ``<country_name>_*.xlsx`` inside ``folder_path`` are read
    without a header row, concatenated in sorted file-name order and written to
    ``<folder_path>/Positive_Labels/<country_name>.xlsx``. The source files are
    deleted after the merged file has been written.
    Args:
        country_name (str): The name of the country to filter files.
        folder_path (str): The path to the folder containing the files.
    Returns:
        None
    """
    # Define the pattern to match files for the specific country
    pattern = os.path.join(folder_path, f'{country_name}_*.xlsx')
    # glob returns files in arbitrary order; sort so the merged row order is deterministic
    files = sorted(glob.glob(pattern))

    if not files:
        print(f"No files found for {country_name}")
        return

    # Read each file (no header row) and concatenate everything into one frame
    dataframes = [pd.read_excel(file, header=None) for file in files]
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Make sure the destination folder exists before writing into it
    output_dir = os.path.join(folder_path, "Positive_Labels")
    os.makedirs(output_dir, exist_ok=True)
    merged_file_path = os.path.join(output_dir, f'{country_name}.xlsx')
    # header=False: the inputs were read with header=None, so writing the default
    # header would inject the integer column label 0 as a bogus first data row.
    merged_df.to_excel(merged_file_path, index=False, header=False)

    # Delete the per-URL source files now that the merged file exists
    for file in files:
        os.remove(file)
        print(f"Deleted {file}")
    


In [5]:
def clean_and_sort_sentences_in_directory(directory):
    """
    Run ``clean_and_sort_sentences`` on every '.xlsx' file directly inside a directory.
    Args:
        directory (str): The path to the directory containing Excel files.
    Returns:
        None
    """
    for entry in os.listdir(directory):
        # Anything that is not an Excel workbook is ignored
        if not entry.endswith('.xlsx'):
            continue
        workbook_path = os.path.join(directory, entry)
        print(f"Processing {workbook_path}...")
        # The sheet is assumed to be called 'Sheet1'
        clean_and_sort_sentences(workbook_path, 'Sheet1')

def _clean_sentence(text):
    """Strip leading bullets/punctuation and a leading citation marker, and flatten newlines."""
    sentence = text.strip()
    # Leading bullets, dashes, angle brackets, dots, quotes and whitespace
    sentence = re.sub(r'^[\u2022•*\-><–.`\'\s]+', '', sentence)
    # Leading numeric marker such as "[12]" or "(3)"
    sentence = re.sub(r'^[\[\(]\d+[\]\)]\s*', '', sentence)
    sentence = sentence.replace('\n', ' ')
    return sentence.strip()


def clean_and_sort_sentences(workbook_path, sheet_name,to = None,threshold = 0.8):
    """
    Clean, de-duplicate and length-sort the sentences in column 1 of an Excel sheet.

    Every string cell in the first column (starting at row 1) is cleaned;
    results of 20 characters or fewer are dropped, exact duplicates are removed
    (first occurrence wins) and the remainder is written back to column 1
    sorted by ascending length.
    Args:
        workbook_path (str): The path to the Excel file.
        sheet_name (str): The name of the sheet to process.
        to (str): Directory to save the cleaned file in (under the original
            file name). If None, the original file is overwritten.
        threshold (float): Currently unused; accepted only so existing callers
            that pass it keep working.
    Returns:
        int: The number of sentences kept.
    Raises:
        KeyError: If the workbook has no sheet named ``sheet_name``.
    """
    # Load the workbook and the specific sheet
    wb = load_workbook(workbook_path)
    sheet = wb[sheet_name]

    if to is None:
        to = workbook_path
    else:
        # Create the target directory if needed and build the path portably
        if to:
            os.makedirs(to, exist_ok=True)
        to = os.path.join(to, os.path.basename(workbook_path))

    # Collect cleaned, unique sentences from the 1st column in original order
    sentences = []
    seen = set()
    for row in sheet.iter_rows(min_row=1, min_col=1, max_col=1):
        cell_value = row[0].value
        if not (cell_value and isinstance(cell_value, str)):
            continue
        sentence = _clean_sentence(cell_value)
        if len(sentence) > 20 and sentence not in seen:
            seen.add(sentence)
            sentences.append(sentence)

    # Stable sort: sentences of equal length keep their original relative order
    sentences.sort(key=len)
    print(len(sentences))

    # Clear the existing data in the 1st column
    for row in sheet.iter_rows(min_row=1, min_col=1, max_col=1):
        row[0].value = None

    # Write sorted sentences back to the 1st column
    for index, sentence in enumerate(sentences, start=1):
        sheet.cell(row=index, column=1).value = sentence

    # Save the workbook (in place of the old one unless `to` was given)
    wb.save(to)
    print(f"Workbook '{workbook_path}' has been cleaned and sorted.")
    return len(sentences)

def remove_common_pair(read_from,check_from):
    """
    Remove from one Excel file every first-column value that also appears in another.

    Both files are read without a header row. Values of ``check_from`` that
    equal any first-column value of ``read_from`` are dropped, and
    ``check_from`` is overwritten with the remaining values (no index, no
    header). ``read_from`` is not modified.
    Args:
        read_from (str): Path to the Excel file whose values are to be removed elsewhere.
        check_from (str): Path to the Excel file that is filtered and rewritten.
    Returns:
        None
    """
    rd_frame = pd.read_excel(read_from,header=None)
    ch_frame = pd.read_excel(check_from,header=None)
    # A completely empty sheet yields a frame without column 0
    rd_values = rd_frame[0].tolist() if 0 in rd_frame.columns else []
    ch_values = ch_frame[0].tolist() if 0 in ch_frame.columns else []

    # Set lookup replaces the pairwise comparison. NaN-like values never compare
    # equal to anything, so they are kept out of the removal set.
    to_remove = {value for value in rd_values if value == value}
    remaining = [
        value for value in ch_values
        if value is not None and value not in to_remove
    ]

    df = pd.DataFrame(remaining)
    df.to_excel(check_from,index=False,header=False)

def give_from_to(read_from,check_from):
    """
    Append sentences containing 'india' from one Excel file to another.
    This special pre processing function is required since data for India was scattered in other files (especially japan)
    due to incorrect pre-processing in the earlier version of eticor.

    Both files are read without a header row. Every string in the first column
    of ``read_from`` that contains 'india' (case-insensitive) is appended to the
    first-column values of ``check_from``, which is then overwritten (no index,
    no header). ``read_from`` is not modified, so calling this twice appends the
    matches twice.
    Args:
        read_from (str): Path to the Excel file to take matching sentences from.
        check_from (str): Path to the Excel file the matches are appended to.
    Returns:
        None
    """
    rd_frame = pd.read_excel(read_from,header=None)
    ch_frame = pd.read_excel(check_from,header=None)
    # A completely empty sheet yields a frame without column 0
    rd_values = rd_frame[0].tolist() if 0 in rd_frame.columns else []
    ch_values = ch_frame[0].tolist() if 0 in ch_frame.columns else []
    print(len(rd_values),len(ch_values))

    # Blank (NaN) or numeric cells have no .lower(); only strings can match
    matches = [
        value for value in rd_values
        if isinstance(value, str) and 'india' in value.lower()
    ]

    df = pd.DataFrame(ch_values + matches)
    df.to_excel(check_from,index=False,header=False)
    

In [6]:
def scrape_urls_from_file(txt_file,name="scraped_content"):
    """
    Scrape every URL listed (one per line) in a text file into its own Excel file.

    Output files are named ``<name>_<line number>.xlsx``, where the line number
    is the 1-based position of the URL in the text file (blank lines count
    towards the numbering but are not scraped). A failure on one URL is
    reported and does not stop the remaining URLs from being processed.
    Args:
        txt_file (str): Path to the .txt file containing URLs.
        name (str): Base name for the output Excel files.
    Returns:
        None
    """
    with open(txt_file, 'r') as handle:
        raw_lines = handle.readlines()

    for position, raw_line in enumerate(raw_lines, start=1):
        address = raw_line.strip()  # drop surrounding whitespace and the newline
        if not address:
            continue
        try:
            output_file = f"{name}_{position}.xlsx"
            scrape_website_with_goose3(url=address, output_file=output_file)
            print(f"Content from {address} has been scraped and saved to {output_file}")
        except Exception as error:
            print(f"Failed to scrape {address}: {str(error)}")

In [7]:
def process_country_files(directory):
    """
    Scrape the URLs listed in every .txt file of a directory.

    Each ``<country>.txt`` file is handed to ``scrape_urls_from_file`` with the
    file name (minus extension) as the base name for its output files.
    Args:
        directory (str): The path to the directory containing .txt files.
    Returns:
        list: A list of country names extracted from the file names.
    """
    # Keep only the '.txt' entries, in directory-listing order
    txt_files = [entry for entry in os.listdir(directory) if entry.endswith('.txt')]

    # The country name is the file name without its extension
    country_names = []
    for txt_file in txt_files:
        stem, _extension = os.path.splitext(txt_file)
        country_names.append(stem)

    # Scrape the URL list of each country
    for txt_file, country_name in zip(txt_files, country_names):
        scrape_urls_from_file(os.path.join(directory, txt_file), country_name)

    return country_names

In [9]:
# One-off repair of the India/Japan label files:
#   1. drop from JAP.xlsx every row that already exists in INDIA.xlsx,
#   2. append to INDIA.xlsx the rows of JAP.xlsx that mention 'india'.
positive_labels_dir = "/home/siddhant_singh/lab_gpu/Positive_Labels"
india_labels_path = f"{positive_labels_dir}/INDIA.xlsx"
japan_labels_path = f"{positive_labels_dir}/JAP.xlsx"

remove_common_pair(india_labels_path, japan_labels_path)
give_from_to(japan_labels_path, india_labels_path)

In [10]:
# Clean every workbook in the final labels folder and report the total number
# of sentences kept across all of them.
final_labels_dir = "/home/siddhant_singh/lab_gpu/Final_Positive_Labels"
total_size = 0
# Only Excel workbooks can be processed; skip anything else in the folder
file_paths = [entry for entry in os.listdir(final_labels_dir) if entry.endswith(".xlsx")]
print(file_paths)
for file_path in file_paths:
    workbook_path = os.path.join(final_labels_dir, file_path)
    try:
        total_size += clean_and_sort_sentences(workbook_path, "Sheet1",threshold = 0.5)
    except KeyError:
        # A missing sheet raises KeyError; fall back to the sheet named "in".
        # Any other failure (unreadable file, permission error, ...) now
        # surfaces instead of being silently retried.
        total_size += clean_and_sort_sentences(workbook_path, "in",threshold = 0.5)
print(total_size)

['NE.xlsx', 'INDIA.xlsx', 'LA.xlsx', 'MEA.xlsx', 'EA.xlsx']
6607
Workbook '/home/siddhant_singh/lab_gpu/Final_Positive_Labels/NE.xlsx' has been cleaned and sorted.
2189
Workbook '/home/siddhant_singh/lab_gpu/Final_Positive_Labels/INDIA.xlsx' has been cleaned and sorted.
3487
Workbook '/home/siddhant_singh/lab_gpu/Final_Positive_Labels/LA.xlsx' has been cleaned and sorted.
6968
Workbook '/home/siddhant_singh/lab_gpu/Final_Positive_Labels/MEA.xlsx' has been cleaned and sorted.
5800
Workbook '/home/siddhant_singh/lab_gpu/Final_Positive_Labels/EA.xlsx' has been cleaned and sorted.
25051
