In [2]:
!pip install transformers

!pip install python-docx

!pip install pdfminer.six
!pip install PyPDF2


Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2
Collecting pdfminer.six
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pdfminer.six
Successfully installed pdfminer.six-20231228
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [3]:
import transformers
from transformers import pipeline
from pdfminer.high_level import extract_text
import docx
from PyPDF2 import PdfReader

# ---------------------------------------------------------------------------
# Model setup
#
# Every pipeline below is constructed once, when this cell runs, and is then
# used as a module-level global by analyze_news() further down.
# ---------------------------------------------------------------------------

# Binary sentiment classifier (menu option 1).
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Abstractive summarizer (menu option 3).
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)

# Menu label -> translation checkpoint (menu option 4). The keys are listed to
# the user in insertion order, so the order here is the order of the menu.
# Both Chinese entries point at the same checkpoint.
# NOTE(review): telling Simplified from Traditional apart presumably needs a
# target-language token on the input — confirm against the model card.
translation_models = {
    "Spanish": "Helsinki-NLP/opus-mt-en-es",
    "French": "Helsinki-NLP/opus-mt-en-fr",
    "German": "Helsinki-NLP/opus-mt-en-de",
    "Chinese (Simplified)": "Helsinki-NLP/opus-mt-en-zh",
    "Chinese (Traditional)": "Helsinki-NLP/opus-mt-en-zh",
    "Japanese": "Helsinki-NLP/opus-mt-en-jap",
    "Russian": "Helsinki-NLP/opus-mt-en-ru",
    "Arabic": "Helsinki-NLP/opus-mt-en-ar",
    "Italian": "Helsinki-NLP/opus-mt-en-it",
}

# Zero-shot topic classifier (menu option 2).
zero_shot_classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

# Extractive question answering over the loaded text (menu option 5).
qa_pipeline = pipeline(
    task="question-answering",
    model="distilbert-base-cased-distilled-squad",
)

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with newlines. Pages that yield no text
        are skipped; an empty string is returned when nothing was extracted.
    """
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        # `or ""` keeps a page whose extraction yields None from breaking the
        # join; extraction must happen while the file is still open.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # A newline between pages stops the last word of one page from fusing
    # with the first word of the next.
    return "\n".join(chunk for chunk in page_texts if chunk)

def extract_text_from_docx(docx_path):
    """Extract the paragraph text of a Word (.docx) document.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The text of all non-empty paragraphs joined with newlines, so that
        the last sentence of one paragraph does not run into the first word
        of the next. Returns an empty string when there is no paragraph text.
    """
    doc = docx.Document(docx_path)
    return "\n".join(para.text for para in doc.paragraphs if para.text)

def _read_menu_number(prompt):
    """Prompt for a menu number; return it as an int, or None if it is not a whole number."""
    try:
        return int(input(prompt).strip())
    except ValueError:
        return None


def _load_news_text(input_data):
    """Return file contents for a .txt/.pdf/.docx path, otherwise the string itself."""
    if not isinstance(input_data, str):
        raise ValueError("Input data must be a file path or a string")

    path = input_data.strip()
    # Compare extensions case-insensitively so "REPORT.PDF" is read as a file.
    lowered = path.lower()
    if lowered.endswith(".txt"):
        # Explicit encoding keeps the result independent of the platform
        # locale; undecodable bytes are replaced rather than aborting.
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    if lowered.endswith(".pdf"):
        return extract_text_from_pdf(path)
    if lowered.endswith(".docx"):
        return extract_text_from_docx(path)
    return input_data


def _report_sentiment(text):
    """Print the sentiment label and score for the text."""
    # truncation=True keeps inputs longer than the model's limit from raising;
    # only the leading part of a long document is classified.
    sentiment_results = sentiment_pipeline(text, truncation=True)
    print("News is Positive or Negative?:")
    for result in sentiment_results:
        print(f"News: {result['label']}, Score: {result['score']}")


def _report_topic(text):
    """Print the best-scoring topic label from the zero-shot classifier."""
    labels = ["Sports", "Entertainment", "Crime", "Politics", "Weather",
              "Business/Economics", "Technology", "Health", "Environment",
              "Science", "Education", "Human Interest", "Travel",
              "Arts & Culture", "Opinion/Editorial"]
    zero_shot_results = zero_shot_classifier(text, candidate_labels=labels)
    print("Topic of News:")
    print(f"News: {zero_shot_results['labels'][0]}, Score: {zero_shot_results['scores'][0]}")


def _report_summary(text):
    """Print a summary of the text."""
    # The length cap is derived from the character count; clamp it so very
    # short inputs never request a zero-length summary.
    max_len = max(2, min(400, len(text) // 2))
    # Keep the minimum below the cap so the two constraints stay feasible.
    # NOTE(review): 56 is assumed to match the checkpoint's default
    # min_length — confirm against the model config.
    min_len = min(56, max_len - 1)
    summary = summarizer(
        text,
        max_length=max_len,
        min_length=min_len,
        truncation=True,  # long documents exceed the model's input limit
        clean_up_tokenization_spaces=True,
    )
    print("Summary of News:")
    print(summary[0]["summary_text"])


def _report_translation(text, translators):
    """Ask for a target language and print the translation; pipelines are cached in `translators`."""
    languages = list(translation_models.keys())
    print("\nPlease select the target language for translation:")
    for i, lang in enumerate(languages, 1):
        print(f"{i}. {lang}")

    lang_choice = _read_menu_number("Enter the number corresponding to your choice: ")
    # Reject 0 and negatives explicitly: indexing with them would silently
    # pick a language from the end of the list.
    if lang_choice is None or not 1 <= lang_choice <= len(languages):
        print("Invalid language choice.")
        return

    target_language = languages[lang_choice - 1]
    if target_language not in translators:
        # Build each translation pipeline at most once per session.
        translators[target_language] = pipeline(
            "translation_en_to_" + target_language.split()[0].lower(),
            model=translation_models[target_language],
        )

    translated = translators[target_language](text, truncation=True)
    print(f"\nTranslated News (to {target_language}):")
    # The pipeline returns a list of dicts; show the text, not the container.
    print(translated[0]["translation_text"])


def _run_question_answering(text):
    """Answer user questions about the text until the user declines to ask another."""
    while True:
        ask_question = input("\nDo you want to ask a question about the input text? (yes/no): ")
        if ask_question.strip().lower() != 'yes':
            break
        question = input("Please enter your question: ")
        if not question.strip():
            print("Please enter a non-empty question.")
            continue
        qa_result = qa_pipeline(question=question, context=text)
        print(f"Answer: {qa_result['answer']}")


def analyze_news(input_data):
    """Interactively analyse a piece of news text.

    Args:
        input_data: Either the news text itself, or a path ending in
            ``.txt``, ``.pdf`` or ``.docx`` whose contents are loaded.

    Raises:
        ValueError: If ``input_data`` is not a string.

    The function loops over a console menu (sentiment, topic, summary,
    translation, question answering) until the user enters ``0``. Entries
    that are not a valid menu number are reported and the menu is shown
    again. If no text could be loaded the function reports it and returns.
    """
    text = _load_news_text(input_data)
    if not text or not text.strip():
        print("No text to analyze.")
        return

    # Translation pipelines created during this session, keyed by language.
    translators = {}

    # Display menu for user choice
    while True:
        print("\nPlease select an option:")
        print("1. News is Positive or Negative?")
        print("2. Topic of News?")
        print("3. Summarization of News")
        print("4. Translation of News")
        print("5. Question Answering ")
        print("0. Exit")

        choice = _read_menu_number("Enter the number corresponding to your choice: ")

        if choice == 0:
            print("Exiting...")
            break
        elif choice == 1:
            _report_sentiment(text)
        elif choice == 2:
            _report_topic(text)
        elif choice == 3:
            _report_summary(text)
        elif choice == 4:
            _report_translation(text, translators)
        elif choice == 5:
            _run_question_answering(text)
        else:
            # Covers both out-of-range numbers and non-numeric entries (None).
            print("Invalid choice.")

# Main program to interact with the user
def main():
    """Ask the user for the news source and start the interactive analysis.

    The user chooses between typing the text directly (1) and giving the
    path of a txt/pdf/docx file (2). Any other entry, including one that is
    not a number, prints a message and returns without analysing anything.
    """
    print("Welcome to the News Analysis Tool!")
    print("Please choose the input method:")
    print("1. Enter text directly")
    print("2. Provide a file (txt, pdf, docx)")

    try:
        input_choice = int(input("Enter the number corresponding to your choice: ").strip())
    except ValueError:
        # Non-numeric entry: handled by the "invalid choice" branch below.
        input_choice = None

    if input_choice == 1:
        input_data = input("Please enter the text: ")
    elif input_choice == 2:
        file_path = input("Please provide the file path: ")
        # Pasted paths often carry stray whitespace or surrounding quotes,
        # which would otherwise defeat the file-extension detection.
        input_data = file_path.strip().strip("'\"")
    else:
        print("Invalid choice. Exiting.")
        return

    analyze_news(input_data)

if __name__ == "__main__":
    main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Welcome to the News Analysis Tool!
Please choose the input method:
1. Enter text directly
2. Provide a file (txt, pdf, docx)
Enter the number corresponding to your choice: 2
Please provide the file path: /content/Prospectus Updated 2022.pdf

Please select an option:
1. News is Positive or Negative?
2. Topic of News?
3. Summarization of News
4. Translation of News
5. Question Answering 
0. Exit
Enter the number corresponding to your choice: 5

Do you want to ask a question about the input text? (yes/no): yes
Please enter your question: who is the founder of UMT?
Answer: Dr Hasan Sohaib 
Murad

Do you want to ask a question about the input text? (yes/no): yes
Please enter your question: where is UMT main branch?
Answer: Annual System

Do you want to ask a question about the input text? (yes/no): YES
Please enter your question: When UMT founded?
Answer: 1990

Do you want to ask a question about the input text? (yes/no): yes
Please enter your question: what is the message of UMT president 