In [5]:
import pandas as pd
import spacy
from spacy.tokens import DocBin
from spacy.training import Example

# Load CSV data
df = pd.read_csv(r'C:\Users\Avdh1215\Desktop\Intellewings_assignment\Training_ner_model\trainingdata.csv')

# Initialize spaCy
nlp = spacy.blank("en")

# Create DocBin to store examples
db = DocBin()

for index, row in df.iterrows():
    text = row['Sentence']
    entities = eval(row['Entities'])  # Convert string representation of list to actual list
    doc = nlp.make_doc(text)
    ents = []

    for start, end, label in entities:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            ents.append(span)

    doc.ents = ents
    db.add(doc)

# Save the training data
db.to_disk("train.spacy")


In [6]:
import spacy
from spacy.training import Example
from spacy.util import minibatch

# Load the training data
db = DocBin().from_disk("train.spacy")
docs = list(db.get_docs(nlp.vocab))

# Create an empty NER model
ner = nlp.create_pipe("ner")
nlp.add_pipe("ner")

# Add the labels
for doc in docs:
    for ent in doc.ents:
        ner.add_label(ent.label_)

# Initialize the model
nlp.begin_training()

# Training the model
for epoch in range(10):
    losses = {}
    batches = minibatch(docs, size=8)
    for batch in batches:
        examples = [Example.from_dict(doc, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]}) for doc in batch]
        nlp.update(examples, losses=losses)
    print(f"Epoch {epoch} Losses: {losses}")

# Save the trained model
nlp.to_disk("ner_model")


Epoch 0 Losses: {'ner': np.float32(434.0519)}
Epoch 1 Losses: {'ner': np.float32(143.93134)}
Epoch 2 Losses: {'ner': np.float32(89.87609)}
Epoch 3 Losses: {'ner': np.float32(56.620415)}
Epoch 4 Losses: {'ner': np.float32(33.927834)}
Epoch 5 Losses: {'ner': np.float32(18.491497)}
Epoch 6 Losses: {'ner': np.float32(20.732412)}
Epoch 7 Losses: {'ner': np.float32(10.343407)}
Epoch 8 Losses: {'ner': np.float32(9.910982)}
Epoch 9 Losses: {'ner': np.float32(3.90321)}


In [7]:
import spacy

# Load the trained NER model
nlp = spacy.load("ner_model")


In [8]:
# Sample text to test
sample_text = "Jeff Bezos founded Amazon and also owns The Washington Post. He has also worked with NASA on various projects."

# Process the text with the NER model
doc = nlp(sample_text)

# Print the entities recognized by the model
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")


Entity: Jeff Bezos, Label: PERSON
Entity: The Washington Post., Label: ORG
Entity: He, Label: ORG
Entity: NASA, Label: PRODUCT


In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import spacy

# Load the trained NER model
nlp = spacy.load("ner_model")


In [13]:
def fetch_html(url):
    """
    Takes a URL as input and retrieves the HTML content of the article.
    
    Args:
    url (str): The URL of the news article.

    Returns:
    str: The HTML content of the article.
    """
    try:
        # Send an HTTP GET request to the URL
        response = requests.get(url)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Return the HTML content
            return soup
        else:
            print(f"Failed to retrieve the article. HTTP Status Code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


def extract_entities(text):
    """
    Extracts named entities from the given text using the NER model.
    
    Args:
    text (str): The text to extract entities from.

    Returns:
    dict: A dictionary containing lists of 'PERSON' and 'ORG' entities.
    """
    doc = nlp(text)
    entities = {"PERSON": [], "ORG": []}
    
    for ent in doc.ents:
        if ent.label_ in entities:
            entities[ent.label_].append(ent.text)
            
    return entities


In [14]:
# Example usage
if __name__ == "__main__":
    # Take a URL input
    article_url = input("Enter the news article URL: ")
    
    # Fetch the HTML content
    html_content = fetch_html(article_url)
    
    if html_content:
        print("Successfully retrieved the article's HTML content.")
        
        # Extract text content from HTML
        article_text = html_content.get_text(separator=' ', strip=True)
        
        # Extract entities from the article text
        entities = extract_entities(article_text)
        
        # Prepare the data to be saved (including the URL, HTML content, and extracted entities)
        article_data = {
            "URL": article_url,
            "HTML_Content": str(html_content.prettify()),
            "Extracted_PERSON": str(entities["PERSON"]),
            "Extracted_ORG": str(entities["ORG"]),
            "Date_Fetched": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }

        # Create a DataFrame
        df = pd.DataFrame([article_data])
        print(df)
    else:
        print("Failed to retrieve article content.")


Successfully retrieved the article's HTML content.
                                                 URL  \
0  https://economictimes.indiatimes.com/sme/enabl...   

                                        HTML_Content  \
0  <!DOCTYPE html>\n<html class="no-js" lang="en"...   

                                    Extracted_PERSON  \
0  ['Benchmarks Nifty', 'Fund Direct', 'News Indu...   

                                       Extracted_ORG         Date_Fetched  
0  ['Enabling', 'Payoneer', 'FUNDS', 'Growt', 'FU...  2024-12-16 01:59:53  


In [15]:
df

Unnamed: 0,URL,HTML_Content,Extracted_PERSON,Extracted_ORG,Date_Fetched
0,https://economictimes.indiatimes.com/sme/enabl...,"<!DOCTYPE html>\n<html class=""no-js"" lang=""en""...","['Benchmarks Nifty', 'Fund Direct', 'News Indu...","['Enabling', 'Payoneer', 'FUNDS', 'Growt', 'FU...",2024-12-16 01:59:53


In [22]:
df.Extracted_PERSON.to_string()

"0    ['Benchmarks Nifty', 'Fund Direct', 'News Indu..."