In [1]:
import spacy
import pandas as pd
import NewsSentiment
from NewsSentiment import TargetSentimentClassifier

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [28]:
# Load spaCy's English language model
nlp = spacy.load("en_core_web_sm")  

# Load dataset
data = pd.read_csv('data/sample_1000rows.csv') 
data['lede'].fillna("No lede", inplace=True)
ledes = data['lede'].apply(lambda x: ' '.join(x.split()[:100]))
print(ledes)

0       calgary, alberta — some bruins rely on plant-b...
1       washington — a senior treasury department empl...
2       edwidge danticat spoke at the luncheon to bene...
3       istanbul — his killers were waiting when jamal...
4       the new england journal of medicine on wednesd...
                              ...                        
996     the contrasting treatment captured the opposit...
997     though the american airlines pilot soon assure...
998     after concerns were voiced at a recent neighbo...
999     tucked inside the giant federal spending bill ...
1000    as sheila mcgovern, chief judge for the county...
Name: lede, Length: 1001, dtype: object


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['lede'].fillna("No lede", inplace=True)


In [29]:
# my_ledes = [tuple(string.split(' ', 2)) + ('NA',) * (3 - len(string.split(' ', 2))) for string in ledes]
# add logic to make sure no splitted part is empty
my_ledes = [
    tuple(filtered_parts) + ('NA',) * (3 - len(filtered_parts))
    for filtered_parts in ([part for part in s.strip().split(' ', 2) if part] for s in ledes)
]
# print(my_ledes)

In [34]:
tsc = TargetSentimentClassifier()
sentiments = tsc.infer(targets=my_ledes)

res = []
for i, result in enumerate(sentiments):
    res.append(result[0]['class_label'])
    # print("Sentiment: ", result[0]['class_label'])


Processing batches:   4%|▍         | 43/1001 [00:10<03:44,  4.27batch/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Processing batches:   6%|▌         | 56/1001 [00:13<03:38,  4.33batch/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Processing batches:   8%|▊         | 76/1001 [00:18<03:28,  4.45batch/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Processing batches:   9%|▉         | 92/1001 [00:21<03:36,  4.20batch/s]Be aware, overflowing tokens are n

In [40]:
data['roberta_sentiment_basedOnLedes'] = res
data.to_csv('data/sample_1000rows_Roberta.csv', index=False)

In [42]:
# Function to extract entities
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

In [43]:
# Function to extract entities context
def extract_entities_with_context(text, window=5):
    doc = nlp(text)
    entity_context = []
    for ent in doc.ents:
        start = max(0, ent.start - window)
        end = min(len(doc), ent.end + window)
        context = doc[start:end].text
        entity_context.append((ent.text, ent.label_, context))
    return entity_context


In [44]:
# data['entities'] = ledes.apply(extract_entities)
data['entity_context'] = ledes.apply(extract_entities_with_context)

In [7]:
def get_sentiment(text):
    tsc = TargetSentimentClassifier()
    sentiments = tsc.infer(targets=text)

    res = []
    for i, result in enumerate(sentiments):
        res.append(result[0]['class_label'])
        # print("Sentiment: ", result[0]['class_label'])
    return res

In [8]:
# Function to analyze sentiment around each entity
def analyze_entity_sentiments(entity_contexts):
    sentiments = []
    for text, label, context in entity_contexts:
        sentiment = get_sentiment(context)
        sentiments.append((text, label, sentiment))
    return sentiments

In [9]:
# just store sentiment score
def analyze_entity_sentiments_score(entity_contexts):
    sentiments = []
    for text, label, context in entity_contexts:
        sentiment = get_sentiment(context)
        sentiments.append((sentiment))
    return sentiments

In [14]:
# Calculate sentiment score
data['entity_sentiments_scoreonly'] = data['entity_context'].apply(analyze_entity_sentiments_score)
data['most_negative_score'] = data['entity_sentiments_scoreonly'].apply(lambda x: min(x) if x else float('inf'))
data['most_positive_score'] = data['entity_sentiments_scoreonly'].apply(lambda x: max(x) if x else float('inf'))
data['average_score'] = data['entity_sentiments_scoreonly'].apply(lambda x: sum(x) / len(x) if x else float('inf'))

In [15]:
def categorize_sentiment(score):
    if score <= -0.1:
        return 'Negative'
    elif score >= 0.1:
        return 'Positive'
    else:
        return 'Neutral'

In [16]:
data['most_negative_sentiment'] = data['most_negative_score'].apply(categorize_sentiment)
data['most_positive_sentiment'] = data['most_positive_score'].apply(categorize_sentiment)
data['average_sentiment'] = data['average_score'].apply(categorize_sentiment)

In [19]:
# Step 4: Apply the function to each lede
# print(data.head())
data.to_csv('data/sample_1000rows_spacy_roberta.csv') 
