In [None]:
#! pip install t5
#import requests
#
#subject = input("What is your topic?")

#url = 'https://en.wikipedia.org/w/api.php'
#params={
#    'action':'query',
#    'format':'json',
#    'titles': subject,
#    'prop':'extracts',
#    'exintro':True,
#    'explaintext': True,
#}
#response = requests.get(url, params=params)
#data = response.json()
#
#page = next(iter(data['query']['pages'].values()))
#
#print(page['extract'][:])





In [None]:
import csv
import os
from html.parser import HTMLParser

import requests

def get_first_column_set_to_txt(csv_file_path, output_txt_path):
    """Collect the unique first-column values of a CSV file and save them to a text file.

    Every non-empty row contributes its first cell (including the first row,
    so a header row is treated like any other row). The output file is
    overwritten and receives one value per line, in sorted order so that
    repeated runs over the same data produce identical files.

    Args:
        csv_file_path (str): Path of the UTF-8 CSV file to read.
        output_txt_path (str): Path of the text file to write.

    Returns:
        set: The unique first-column values, or an empty set if either file
        could not be read or written (the error is printed, not raised).
    """
    try:
        with open(csv_file_path, 'r', newline='', encoding='utf-8') as file:
            # csv.reader yields [] for blank lines; those have no first column.
            first_column_set = {row[0] for row in csv.reader(file) if row}

        with open(output_txt_path, 'w', encoding='utf-8') as txt_file:
            # Sets have no stable order; sort so the file is reproducible.
            txt_file.writelines(f"{item}\n" for item in sorted(first_column_set))

        print(f"Set successfully written to '{output_txt_path}'")
        return first_column_set

    except FileNotFoundError as e:
        # e.filename is whichever path was actually missing (input CSV or the
        # output file's directory), so the message never blames the wrong file.
        print(f"Error: File '{e.filename}' not found.")
        return set()
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return set()

class _ParagraphTextExtractor(HTMLParser):
    """Collect the text of every <p> element, ignoring <script>/<style> content."""

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.paragraphs = []      # finished paragraph strings, in document order
        self._chunks = None       # text pieces of the open <p>; None when outside one
        self._ignored_depth = 0   # > 0 while inside <script> or <style>

    def handle_starttag(self, tag, attrs):
        if tag == 'p':
            # A new <p> implicitly terminates one that was left open.
            self._finish_paragraph()
            self._chunks = []
        elif tag in ('script', 'style'):
            self._ignored_depth += 1

    def handle_endtag(self, tag):
        if tag == 'p':
            self._finish_paragraph()
        elif tag in ('script', 'style') and self._ignored_depth:
            self._ignored_depth -= 1

    def handle_data(self, data):
        if self._chunks is not None and not self._ignored_depth:
            self._chunks.append(data)

    def close(self):
        super().close()
        # Flush a paragraph that was still open at the end of the document.
        self._finish_paragraph()

    def _finish_paragraph(self):
        if self._chunks is not None:
            self.paragraphs.append(''.join(self._chunks))
            self._chunks = None


def search_and_fetch_summary(topic):
    """Search Wikipedia for a topic and return the paragraph text of the best match.

    The first search hit (index 0) is taken as the best match; its rendered
    HTML is fetched through the ``parse`` API action and the text of all
    ``<p>`` elements is joined with newlines.

    Args:
        topic (str): Free-text search query.

    Returns:
        str: The extracted article text. On failure no exception is raised;
        instead a human-readable message is returned, starting with one of
        "No Wikipedia article found for topic: ", "No extractable summary
        found for ", "Request error while fetching summary for " or
        "Unexpected error for ".
    """
    search_url = 'https://en.wikipedia.org/w/api.php'

    search_params = {
        'action': 'query',
        'format': 'json',
        'list': 'search',
        'utf8': 1,
        'srsearch': topic
    }

    try:
        search_response = requests.get(search_url, params=search_params, timeout=5)
        # Surface HTTP-level failures as request errors rather than parse errors.
        search_response.raise_for_status()
        search_results = search_response.json().get('query', {}).get('search', [])
        if not search_results:
            return f"No Wikipedia article found for topic: {topic}"

        best_match_title = search_results[0]['title']

        parse_params = {
            'action': 'parse',
            'format': 'json',
            'page': best_match_title,
            'prop': 'text',
            'redirects': ''
        }

        parse_response = requests.get(search_url, params=parse_params, timeout=5)
        parse_response.raise_for_status()
        raw_html = parse_response.json()['parse']['text']['*']

        # Standard-library HTML parsing; the previous code called an `html`
        # module that was never imported, so every lookup ended in NameError.
        extractor = _ParagraphTextExtractor()
        extractor.feed(raw_html)
        extractor.close()

        text = '\n'.join(extractor.paragraphs).strip()
        return text if text else f"No extractable summary found for {best_match_title}."

    except requests.RequestException as e:
        return f"Request error while fetching summary for {topic}: {e}"
    except Exception as e:
        return f"Unexpected error for {topic}: {e}"

def load_processed_topics(filename):
    """Return the set of topics recorded in ``filename``, one per non-blank line.

    Surrounding whitespace is stripped from each line. A missing file yields
    an empty set; so does a file that cannot be read, in which case the error
    is printed.
    """
    if not os.path.exists(filename):
        return set()

    try:
        with open(filename, "r", encoding="utf-8") as handle:
            stripped_lines = (raw.strip() for raw in handle)
            return {entry for entry in stripped_lines if entry}
    except Exception as e:
        print(f"Error loading processed topics from '{filename}': {e}")
        return set()

def save_processed_topic(filename, topic):
    """Append ``topic`` on its own line to the processed-topics file.

    The file is created if it does not exist. Failures are printed rather
    than raised.
    """
    try:
        with open(filename, "a", encoding="utf-8") as handle:
            entry = topic + "\n"
            handle.write(entry)
    except Exception as err:
        print(f"Error saving topic '{topic}': {err}")

def write_to_sasv(filename, data_list):
    """Append the strings in ``data_list`` to ``<filename>.sasv``.

    Entries are separated by ' /* ' and the same separator is written after
    the last entry, so successive calls keep the file consistently delimited.
    Failures are printed rather than raised.
    """
    separator = " /* "
    full_filename = "{}.sasv".format(filename)
    try:
        with open(full_filename, "a", encoding="utf-8") as out:
            payload = separator.join(data_list) + separator
            out.write(payload)
    except Exception as err:
        print(f"Error writing to '{full_filename}': {err}")

# Prefixes of the messages search_and_fetch_summary() returns in place of
# article text when a lookup fails. They must not be stored as articles.
_SUMMARY_FAILURE_PREFIXES = (
    "No Wikipedia article found for topic: ",
    "No extractable summary found for ",
    "Request error while fetching summary for ",
    "Unexpected error for ",
)


def process_csv(csv_filename, sasv_filename, processed_filename, txt_filename):
    """Fetch and store Wikipedia text for each new topic in a CSV's first column.

    Each unique, non-empty first-column value that is not already listed in
    the processed-topics file is looked up on Wikipedia. A successful result
    is appended to ``<sasv_filename>.sasv`` immediately and only then is the
    topic recorded as processed, so an interrupted run never marks a topic
    as done without its text having been saved. Failed lookups are reported
    and left unrecorded so that a later run retries them.

    Args:
        csv_filename (str): CSV file whose first column holds the topics.
        sasv_filename (str): Output path without the ``.sasv`` extension.
        processed_filename (str): File logging topics that are already stored.
        txt_filename (str): File that receives the full list of unique topics.
            It is overwritten on every run, so it must be a different file
            from ``processed_filename``.

    Returns:
        None
    """
    if os.path.abspath(processed_filename) == os.path.abspath(txt_filename):
        # The topic list is rewritten below before the log is read, so a
        # shared file makes every topic look already processed.
        print("Warning: processed_filename and txt_filename are the same file; "
              "every topic will appear already processed.")

    # Get the set of unique topics from the CSV file
    unique_topics = get_first_column_set_to_txt(csv_filename, txt_filename)
    if not unique_topics:
        print("No topics found in the CSV file.")
        return

    processed_topics = load_processed_topics(processed_filename)
    stored_count = 0

    # Sorted so that the order of entries in the output file is reproducible.
    for topic in sorted(unique_topics):
        if not topic or topic in processed_topics:
            continue

        summary = search_and_fetch_summary(topic)
        if not summary:
            continue
        if summary.startswith(_SUMMARY_FAILURE_PREFIXES):
            print(f"Skipping '{topic}': {summary}")
            continue

        # Persist the text first, then the bookkeeping entry.
        write_to_sasv(sasv_filename, [summary])
        save_processed_topic(processed_filename, topic)
        stored_count += 1

    if stored_count:
        print(f"Stored {stored_count} new articles in {sasv_filename}.sasv")
    else:
        print("No new unique topics found.")

# Call the main function.
# The processed-topics log must be a different file from the topic list:
# the topic list is overwritten with every CSV topic at the start of each run,
# so sharing one file would make every topic look already processed.
process_csv("csv/output.csv", "wikiArticles", "processed_topics.txt", "unique_topics.txt")


Stored 9 new articles in wikiArticles.sasv


In [None]:
import csv
#untested
def create_t5_triplets(csv_filename):
    """
    Creates T5-style training examples for summarization from the input CSV.

    Despite the name, each example is a pair (Input, Target), where:
    - Input is the task prefix "Summarize: " followed by the first column.
    - Target is the second column.

    The first row is treated as a header and skipped. Rows with fewer than
    two columns (including blank lines) are ignored.

    Args:
    - csv_filename (str): The path to the CSV file that contains the article and its summary.

    Returns:
    - list: A list of (Input, Target) tuples; empty if the file is missing,
      unreadable, or contains no usable rows.
    """
    try:
        with open(csv_filename, newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            # The None default keeps an empty file from raising StopIteration,
            # which used to surface as a blank "Error occurred: " message.
            next(reader, None)  # Skip the header row

            triplets = [
                (f"Summarize: {row[0]}", row[1])
                for row in reader
                if len(row) >= 2
            ]

    except FileNotFoundError:
        print(f"Error: File '{csv_filename}' not found.")
        return []
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return []

    print(f"Created {len(triplets)} triplets successfully.")
    return triplets

# Example usage: build the training pairs from the sample CSV
csv_file = "csv/output.csv"
triplets = create_t5_triplets(csv_file)

# Show at most five pairs as a quick sanity check
for number, pair in enumerate(triplets[:5], start=1):
    input_t, target_t = pair
    print(f"Triplet {number} - Input: {input_t}\nTarget: {target_t}\n")


In [None]:
import csv
#untested
def sort_csv_by_category(csv_filename, sorted_csv_filename):
    """
    Sorts the CSV file by the value in the first column and saves the result to a new file.

    The first non-blank row is kept in place as the header; the remaining rows
    are sorted by their first column using plain string comparison. Blank
    lines in the input are dropped, since they have no first column to sort on.

    Args:
    - csv_filename (str): The path to the original CSV file.
    - sorted_csv_filename (str): The path where the sorted CSV file will be saved.

    Returns:
    - bool: True if sorting and saving was successful, False if there was an
      error (missing file, empty input, or any other failure; the reason is printed).
    """
    try:
        # Read the CSV file; csv.reader yields [] for blank lines, which are skipped
        with open(csv_filename, newline='', encoding='utf-8') as infile:
            rows = [row for row in csv.reader(infile) if row]

        if not rows:
            print(f"Error: File '{csv_filename}' is empty; nothing to sort.")
            return False

        # Separate the header from the data rows
        header, *data_rows = rows

        # Sort the rows based on the first column (category)
        data_rows.sort(key=lambda row: row[0])

        # Write the sorted rows into a new CSV file
        with open(sorted_csv_filename, 'w', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)
            writer.writerow(header)
            writer.writerows(data_rows)

        print(f"CSV file sorted and saved to '{sorted_csv_filename}' successfully.")
        return True

    except FileNotFoundError as e:
        # e.filename names the path that was actually missing (input file or
        # the output file's directory).
        print(f"Error: File '{e.filename}' not found.")
        return False
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return False

# Example usage: write a sorted copy of the sample CSV next to the original
csv_file, sorted_csv_file = "csv/output.csv", "csv/sorted_output.csv"
sort_csv_by_category(csv_file, sorted_csv_file)
