<a href="https://colab.research.google.com/github/Dalicebo1990/D/blob/main/AIWorkflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
import gensim
from gensim import corpora
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import LdaModel
from urllib.parse import urlparse

# Ensure NLTK resources are downloaded
nltk.download('punkt')

# Function to validate and format the URL
def validate_url(url):
    parsed_url = urlparse(url)
    if not parsed_url.scheme:
        url = "https://" + url
    return url

# Web scraping example
def extract_data(url):
    try:
        # Validate the URL to ensure it includes a scheme (e.g., https://)
        url = validate_url(url)

        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract meaningful content (e.g., title, description, etc.)
        title = soup.title.string if soup.title else 'Title Not Found'
        meta_description = soup.find('meta', attrs={'name': 'description'})
        description = meta_description['content'] if meta_description else 'Description Not Found'

        return {"title": title, "description": description}
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from {url}: {e}")
        return None

# NLP example: Keyword extraction, sentiment analysis, and topic modeling
def analyze_text(data):
    text = f"{data['title']}. {data['description']}"

    # Sentiment analysis using TextBlob
    blob = TextBlob(text)
    sentiment = blob.sentiment

    # Tokenize the text for topic modeling
    tokens = word_tokenize(text.lower())
    dictionary = corpora.Dictionary([tokens])
    corpus = [dictionary.doc2bow(tokens)]

    # Perform topic modeling using LDA (Latent Dirichlet Allocation)
    lda_model = LdaModel(corpus, num_topics=1, id2word=dictionary, passes=15)
    topics = lda_model.print_topics(num_words=4)

    return {
        "sentiment": sentiment,
        "topics": topics
    }

# Article generation example
def generate_article(data, analysis):
    article = f"""
    Company Overview:
    ------------------
    Title: {data['title']}
    Description: {data['description']}

    Sentiment Analysis:
    -------------------
    Polarity: {analysis['sentiment'].polarity}
    Subjectivity: {analysis['sentiment'].subjectivity}

    Topic Modeling:
    -------------------
    Main topics identified: {analysis['topics']}
    """

    return article

# Workflow integration example
def process_company(url):
    data = extract_data(url)
    if data:  # Proceed only if data was successfully scraped
        analysis = analyze_text(data)
        article = generate_article(data, analysis)
        return article
    else:
        return "Failed to extract data from the website."

# Main function (Hardcoded URL)
if __name__ == '__main__':
    url = "https://www.ibm.com"  # Hardcoded example URL
    article = process_company(url)
    print(article)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.



    Company Overview:
    ------------------
    Title: IBM - United States
    Description: For more than a century, IBM has been a global technology innovator, leading advances in AI, automation and hybrid cloud solutions that help businesses grow.
    
    Sentiment Analysis:
    -------------------
    Polarity: 0.25
    Subjectivity: 0.25

    Topic Modeling:
    -------------------
    Main topics identified: [(0, '0.063*"," + 0.048*"ibm" + 0.048*"a" + 0.048*"."')]
    
