<a href="https://colab.research.google.com/github/AmirishettyAkhila/2303A51L04/blob/main/Untitled34.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import requests
import nltk
from nltk.tokenize import sent_tokenize
import re
import os

# Ensure required NLTK data is downloaded.
# This runs at module import time; 'punkt' is presumably the resource that
# sent_tokenize (imported above) relies on.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead —
# confirm against the installed NLTK version.
nltk.download('punkt')

# Try to import datasets package; handle import errors.
# datasets_installed records whether load_dataset could be imported, so code
# that needs it can check the flag instead of failing when the package is
# missing.
try:
    from datasets import load_dataset
    datasets_installed = True
except ImportError:
    datasets_installed = False

def load_text_from_file(file_path, encoding='utf-8'):
    """Return the full contents of a text file.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file contents as a string, or the message "File not found." when
        nothing exists at file_path.
    """
    if not os.path.exists(file_path):
        return "File not found."

    # Pass the encoding explicitly: without it open() falls back to the
    # locale's preferred encoding, which varies between machines.
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

def load_text_from_csv(file_path, column_name):
    """Read one column of a CSV file and return its values as a list.

    Args:
        file_path: Path of the CSV file to read with pandas.
        column_name: Name of the column whose values are returned.

    Returns:
        The column's values as a list, or the message "CSV file not found."
        when nothing exists at file_path.
    """
    # Bail out early when there is nothing to read.
    if not os.path.exists(file_path):
        return "CSV file not found."

    frame = pd.read_csv(file_path)
    selected = frame[column_name]
    return selected.tolist()

def load_text_from_url(url, timeout=10):
    """Fetch a URL with HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on an unresponsive host.

    Returns:
        The response text on success, or a message starting with
        "Error loading URL: " describing the request failure (including
        HTTP error statuses and timeouts).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx statuses into exceptions
        return response.text
    except requests.RequestException as e:
        return f"Error loading URL: {e}"

def load_text_from_nltk():
    """Return the raw text of 'austen-emma.txt' from NLTK's gutenberg corpus.

    Returns:
        The corpus text, or a message starting with
        "Error loading NLTK data: " if importing or reading the corpus fails
        for any reason.
    """
    try:
        # Imported here so that an import failure is reported as a message too.
        from nltk.corpus import gutenberg as corpus
        emma_text = corpus.raw('austen-emma.txt')
    except Exception as err:
        return "Error loading NLTK data: " + str(err)
    return emma_text

def load_text_from_huggingface(dataset_name, split='train', text_column='text', num_entries=5):
    """Load the first few text entries of a Hugging Face dataset.

    Args:
        dataset_name: Name passed to load_dataset (for example 'imdb').
        split: Dataset split to load. Defaults to 'train'.
        text_column: Column holding the text. Defaults to 'text'.
        num_entries: How many leading entries to return. Defaults to 5.

    Returns:
        The first num_entries values of text_column, or a message string when
        the datasets library is not installed or loading fails.
    """
    if not datasets_installed:
        return "Hugging Face datasets library is not installed."

    try:
        dataset = load_dataset(dataset_name, split=split)
        # Slice the rows first, then pick the column: this reads only the
        # requested entries instead of materializing the whole text column
        # before slicing it.
        return dataset[:num_entries][text_column]
    except Exception as e:
        return f"Error loading Hugging Face dataset: {e}"

def remove_ambiguity(text):
    """Replace selected ambiguous terms in text with explicit wording.

    The substitutions are whole-word, case-sensitive, and applied in the
    order listed below.

    Args:
        text: The string to rewrite.

    Returns:
        The text with every listed term replaced.
    """
    # Example replacements for ambiguity: (pattern, replacement) pairs.
    substitutions = (
        (r'\bbank\b', 'financial institution'),
        (r'\badjustable-rate mortgage securities\b', 'financial products'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def segment_sentences(text):
    """Split text into sentences by delegating to NLTK's sent_tokenize.

    Args:
        text: The string to segment.

    Returns:
        Whatever sent_tokenize returns for text.
    """
    sentence_list = sent_tokenize(text)
    return sentence_list

def main():
    """Demonstrate each loader, then clean and segment a sample sentence.

    Every source is loaded first, then the results are printed in the same
    order. Afterwards a fixed example sentence is passed through
    remove_ambiguity and segment_sentences and the output is printed.
    """
    # Placeholder inputs for demonstration; update these with real values.
    text_file = 'path_to_file.txt'  # Update this with your file path
    csv_file = 'path_to_file.csv'   # Update this with your file path
    column = 'column_name'          # Update this with your column name
    remote_url = 'https://example.com/textfile.txt'  # Update this with your URL
    hf_dataset = 'imdb'             # Update this with your dataset name

    # Load everything up front; list elements are evaluated in order, so the
    # loaders run in the same sequence as they are reported below.
    loaded = [
        ("Text from file:\n", load_text_from_file(text_file)),
        ("\nText from CSV:\n", load_text_from_csv(csv_file, column)),
        ("\nText from URL:\n", load_text_from_url(remote_url)),
        ("\nText from NLTK:\n", load_text_from_nltk()),
        ("\nText from Hugging Face:\n", load_text_from_huggingface(hf_dataset)),
    ]

    # Report each loaded result under its heading.
    for heading, content in loaded:
        print(heading, content)

    # Sample sentence used to show ambiguity removal and sentence segmentation.
    sample = "The bank can guarantee deposits will eventually cover future tuition costs because it invests in adjustable-rate mortgage securities."

    disambiguated = remove_ambiguity(sample)
    print("\nCleaned Text:\n", disambiguated)

    # Segment before printing the heading so a tokenizer failure surfaces
    # before any partial output for this section.
    pieces = segment_sentences(disambiguated)
    print("\nSegmented Sentences:")
    for piece in pieces:
        print(piece)

# Run the demo only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
