## Task 8

In [None]:
import pandas as pd
import spacy
from collections import defaultdict

# Load the spaCy "en_core_web_sm" pipeline once at module level; the
# extract_entities_with_scores function below reads this global `nlp` object
# on every call instead of reloading the model per text.
nlp = spacy.load("en_core_web_sm")

# Extract named entities from a text and score each one by relative frequency
def extract_entities_with_scores(text):
    """Return the distinct named entities found in *text* with a frequency score.

    The text is coerced with ``str()`` and run through the module-level spaCy
    pipeline ``nlp``. Each distinct ``(entity text, entity label)`` pair is
    counted, and its score is that count divided by the total number of
    entity mentions in the document. The score is therefore a relative
    frequency within this one text, not a confidence value from the model.

    Args:
        text: The value to analyse; it is converted to a string first.

    Returns:
        A list of dicts with keys ``'text'``, ``'label'`` and ``'score'``,
        ordered by first occurrence of each pair. The list is empty when no
        entities are found, or when any exception occurs (in which case the
        offending text and the exception are printed).
    """
    try:
        parsed = nlp(str(text))

        # Tally how often each (surface form, label) pair is mentioned.
        tally = defaultdict(int)
        for span in parsed.ents:
            tally[(span.text, span.label_)] += 1

        # Every mention increments exactly one tally, so the sum of the
        # tallies is the total number of entity mentions.
        mention_total = sum(tally.values())

        scored = []
        for (surface, label), hits in tally.items():
            scored.append({'text': surface, 'label': label, 'score': hits / mention_total})
        return scored
    except Exception as e:
        # Best-effort: report the failing input and carry on with no entities.
        print(f"Error processing text: {text}")
        print(e)
        return []

# Load the dataset written by the previous task
bfro_data = pd.read_csv("../Data/task7_dataset.csv")

# Free-text columns that are scanned for named entities
text_columns = [
    "Headline",
    "Location Details",
    "Also Noticed",
    "Other Stories",
    "Time And Conditions",
    "Environment",
    "Follow-Up Report",
    "Detected Object(s)",
    "Image Caption(s)",
]

# Missing cells become empty strings so the extractor always receives text
bfro_data[text_columns] = bfro_data[text_columns].fillna('')

# For every text column, add a sibling "<column>_entities_with_scores" column
# holding the list of scored entities extracted from each row
for column in text_columns:
    print(f"Processing column: {column}")
    scored_column = f'{column}_entities_with_scores'
    bfro_data[scored_column] = bfro_data[column].apply(extract_entities_with_scores)

# Write the enriched dataset twice: comma-separated (CSV) and tab-separated (TSV)
for out_path, delimiter in (("task8_dataset.csv", ","), ("task8_dataset.tsv", "\t")):
    bfro_data.to_csv(out_path, sep=delimiter, index=False)


Processing column: Headline
Processing column: Location Details
Processing column: Also Noticed
Processing column: Other Stories
Processing column: Time And Conditions
Processing column: Environment
Processing column: Follow-Up Report
Processing column: Detected Object(s)
Processing column: Image Caption(s)
