In [2]:
import spacy
import pandas as pd
from textblob import TextBlob

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Load spaCy's English language model
nlp = spacy.load("en_core_web_sm")

# Load dataset
data = pd.read_csv('data/sample_5000rows.csv')

# Replace missing ledes with a placeholder string so every row holds text.
# The filled column is assigned back instead of calling
# data['lede'].fillna(..., inplace=True): that chained in-place call only
# updates `data` (and any earlier alias of the column) when pandas hands out a
# shared view, and it stops having any effect under Copy-on-Write.
data['lede'] = data['lede'].fillna("No lede")

# Bind the working alias only after the fill so it never holds NaN.
ledes = data['lede']
print(ledes)

0       calgary, alberta — some bruins rely on plant-b...
1       washington — a senior treasury department empl...
2       edwidge danticat spoke at the luncheon to bene...
3       istanbul — his killers were waiting when jamal...
4       the new england journal of medicine on wednesd...
                              ...                        
4996     the three previous undefeated ephmen teams we...
4997     mr. stinnett's stories appeared in atlantic m...
4998     atlanta is on a roll, tied at 7-2 with san fr...
4999                                              No lede
5000                                              No lede
Name: lede, Length: 5001, dtype: object


In [4]:
# Named-entity extraction for a single text
def extract_entities(text):
    """Return the named entities found in ``text``.

    Runs the notebook-level ``nlp`` pipeline on ``text`` and returns a list
    of ``(entity_text, entity_label)`` tuples, one per item of ``doc.ents``,
    in the order ``doc.ents`` yields them.
    """
    parsed = nlp(text)
    found = []
    for span in parsed.ents:
        found.append((span.text, span.label_))
    return found


In [5]:
# Named-entity extraction together with the text surrounding each entity
def extract_entities_with_context(text, window=5):
    """Return each entity in ``text`` with a slice of the text around it.

    Args:
        text: The string to run through the notebook-level ``nlp`` pipeline.
        window: How many positions of the parsed ``doc`` to include on each
            side of the entity (tokens, per spaCy's Doc/Span indexing).

    Returns:
        A list of ``(entity_text, entity_label, context_text)`` tuples, one
        per item of ``doc.ents``. The context slice is clamped to the bounds
        of ``doc``, so entities near either end get a shorter context.
    """
    doc = nlp(text)
    doc_length = len(doc)

    def _with_context(ent):
        # Clamp the window so the slice never runs off either end of the doc.
        lo = max(0, ent.start - window)
        hi = min(doc_length, ent.end + window)
        return (ent.text, ent.label_, doc[lo:hi].text)

    return [_with_context(ent) for ent in doc.ents]


In [18]:
# For every lede, collect its (entity text, label, surrounding context) tuples.
# Entity-only extraction (no context) is available through extract_entities:
# data['entities'] = ledes.apply(extract_entities)
entity_context_per_lede = ledes.apply(extract_entities_with_context)
data['entity_context'] = entity_context_per_lede

In [7]:
def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    return polarity

In [8]:
# Score the sentiment of the context around each entity
def analyze_entity_sentiments(entity_contexts):
    """Attach a sentiment score to every entity.

    Args:
        entity_contexts: Iterable of ``(entity_text, label, context)``
            triples.

    Returns:
        A list of ``(entity_text, label, sentiment)`` tuples in input order,
        where ``sentiment`` is ``get_sentiment(context)``.
    """
    return [
        (entity, kind, get_sentiment(snippet))
        for entity, kind, snippet in entity_contexts
    ]

In [9]:
# Same scoring as analyze_entity_sentiments, but keeps only the numbers
def analyze_entity_sentiments_score(entity_contexts):
    """Return just the sentiment score of each entity's context.

    Args:
        entity_contexts: Iterable of ``(entity_text, label, context)``
            triples; only ``context`` is used.

    Returns:
        A list with ``get_sentiment(context)`` for each triple, in input
        order.
    """
    return [
        get_sentiment(snippet)
        for _entity, _kind, snippet in entity_contexts
    ]

In [14]:
# Calculate sentiment score
data['entity_sentiments_scoreonly'] = data['entity_context'].apply(analyze_entity_sentiments_score)

# A lede with no detected entities has an empty score list, so there is nothing
# to take the min/max/mean of. Record that as missing (NaN) rather than +inf:
# +inf is an ordinary float that passes any `score >= threshold` check, so it
# would make "no entities" look like the most positive result possible.
NO_ENTITY_SCORE = float('nan')

entity_scores = data['entity_sentiments_scoreonly']
data['most_negative_score'] = entity_scores.apply(lambda x: min(x) if x else NO_ENTITY_SCORE)
data['most_positive_score'] = entity_scores.apply(lambda x: max(x) if x else NO_ENTITY_SCORE)
data['average_score'] = entity_scores.apply(lambda x: sum(x) / len(x) if x else NO_ENTITY_SCORE)

In [15]:
def categorize_sentiment(score):
    """Map a numeric sentiment score to a coarse label.

    Args:
        score: A float sentiment score.

    Returns:
        ``'Negative'`` if ``score <= -0.1``, ``'Positive'`` if
        ``score >= 0.1``, otherwise ``'Neutral'``. Non-finite scores (NaN or
        +/-inf) are also reported as ``'Neutral'``.
    """
    import math  # local import: keeps this cell independent of the import cell

    # NaN / +/-inf are placeholders for "no score available", not real
    # polarities; without this guard +inf would be classified as 'Positive'.
    if not math.isfinite(score):
        return 'Neutral'

    if score <= -0.1:
        return 'Negative'
    if score >= 0.1:
        return 'Positive'
    return 'Neutral'

In [16]:
# Turn each numeric score column into a Negative/Neutral/Positive label column.
score_to_label_column = {
    'most_negative_score': 'most_negative_sentiment',
    'most_positive_score': 'most_positive_sentiment',
    'average_score': 'average_sentiment',
}
for score_column, label_column in score_to_label_column.items():
    data[label_column] = data[score_column].apply(categorize_sentiment)

In [19]:
# Step 4: Write the enriched frame (entity contexts, scores, labels) to CSV
# data.head() can be displayed here for a quick look before exporting.
output_csv = 'data/sample_5000rows_spacy_textblob.csv'
data.to_csv(output_csv)
