In [None]:
# Mount Google Drive into the Colab filesystem so the CSV files under
# /content/drive/MyDrive/... used by the following cells can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
###climate_entities
import pandas as pd

# Vocabulary of known entities, read from the 'entities' column.
# Non-string cells (e.g. NaN from blank rows) are skipped.
climate_entities_df = pd.read_csv('/content/drive/MyDrive/Labeled Dataset/extracted_unique_entities.csv')
climate_entities = {
    candidate
    for candidate in climate_entities_df['entities'].tolist()
    if isinstance(candidate, str)
}

# Parent/child message pairs; the cells below read the 'body_parent' and
# 'body_child' text columns from this frame.
data = pd.read_csv('/content/drive/MyDrive/Labeled Dataset/climate_agreement.csv')

# check and extract entities mentioned in the text
def extract_entities(text, entities=None):
    """Return the known entities that occur in ``text`` as substrings.

    Args:
        text: The message body. Anything that is not a ``str`` (NaN, None,
            numbers read from a malformed CSV cell, ...) is treated as
            containing no entities.
        entities: Optional iterable of entity strings to look for. Defaults
            to the module-level ``climate_entities`` collection.

    Returns:
        A sorted list of matching entities. Sorting keeps the output
        deterministic; iterating a set of strings directly can yield a
        different order on every interpreter run.

    Note:
        Matching is plain, case-sensitive substring containment, so very
        short entities (e.g. a single letter) match almost any text.
    """
    if not isinstance(text, str):  # covers NaN/None as well as stray numeric cells
        return []
    vocabulary = climate_entities if entities is None else entities
    return sorted(entity for entity in vocabulary if entity in text)

# Extract entities for each row's 'body_parent' and 'body_child'
data['entity_parent'] = data['body_parent'].apply(extract_entities)
data['entity_child'] = data['body_child'].apply(extract_entities)

# Keep only pairs in which BOTH texts mention at least one entity.
# NOTE(review): the noise entities removed further down (single characters,
# ', ') still count as a "mention" here, so this filter is looser than it
# looks — confirm whether noise should be dropped before this check.
has_parent_entity = data['entity_parent'].apply(len) > 0
has_child_entity = data['entity_child'].apply(len) > 0
filtered_data = data[has_parent_entity & has_child_entity].copy()  # copy: we add a column below

# Combine the entities of both texts and remove duplicates. Building the
# column from zipped lists (instead of a row-wise DataFrame.apply) is faster
# and also works when no rows survive the filter above.
filtered_data['entity'] = pd.Series(
    [
        sorted(set(parent_entities + child_entities))
        for parent_entities, child_entities in zip(
            filtered_data['entity_parent'], filtered_data['entity_child']
        )
    ],
    index=filtered_data.index,
    dtype=object,
)

# Long format: one output row per (message pair, entity).
exploded_data = filtered_data.explode('entity')

result = exploded_data[['msg_id_parent', 'msg_id_child', 'body_parent', 'body_child', 'label', 'entity', 'exact_time', 'datetime']]

# Drop extraction noise: a hand-picked list plus every single-character entity.
unwanted_entities = [', ', 'n', 'N', 'P', 'F', 's']
is_noise = result['entity'].isin(unwanted_entities) | (result['entity'].str.len() == 1)
result = result[~is_noise].reset_index(drop=True)


result.to_csv('/content/drive/MyDrive/Labeled Dataset/long_filtered_climate.csv', index=False)




In [None]:
pip install transformers


Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m33.8 MB/s[0m eta [36m0:00:0

In [None]:
# Compute entity-based sentiment scores. Within one pair an entity may appear in only the parent body or only the child body, but we
# want to score the sentiment of both texts towards that entity separately. So we stick with the sentiment-analysis pipeline from HuggingFace's Transformers. The function is similar to the earlier one, but it returns a neutral score for texts that do not contain the entity.
import pandas as pd
from transformers import pipeline

# Entity vocabulary: every non-null value of the 'entities' column, as strings.
entities_df = pd.read_csv('/content/drive/MyDrive/Labeled Dataset/extracted_unique_entities.csv')
climate_entities = entities_df['entities'].dropna().astype(str).tolist()


# Long-format (one row per pair and entity) table of message pairs.
data = pd.read_csv('/content/drive/MyDrive/Labeled Dataset/long_filtered_climate.csv')

# Initialize the sentiment-analysis pipeline from HuggingFace's Transformers.
# The checkpoint is named explicitly (it is the library's default for this
# task) so the scores do not silently change if the default ever does, and
# the "no model was supplied" warning goes away.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

def compute_overall_sentiment(text, classifier=None):
    """Return a signed sentiment score for a whole text.

    Args:
        text: The text to score. NaN/None yields ``None``.
        classifier: Optional callable with the same call signature as a
            Transformers text-classification pipeline. Defaults to the
            module-level ``sentiment_pipeline``.

    Returns:
        ``+score`` when the predicted label is ``"POSITIVE"``, ``-score`` when
        it is ``"NEGATIVE"``, ``0.0`` for any other label, and ``None`` for
        missing input.
    """
    # Check for NaN or None values
    if pd.isna(text):
        return None

    if classifier is None:
        classifier = sentiment_pipeline

    # truncation=True: inputs longer than the model's maximum sequence length
    # would otherwise make the pipeline raise instead of returning a score.
    sentiment = classifier(text, truncation=True)[0]

    if sentiment['label'] == "POSITIVE":
        return sentiment['score']
    elif sentiment['label'] == "NEGATIVE":
        return -sentiment['score']
    else:
        return 0.0

def compute_entity_sentiment(text, entity, window=30, classifier=None):
    """Classify the sentiment of the text surrounding the first mention of ``entity``.

    Args:
        text: The text to inspect. Non-string values (NaN, None, ...) are
            treated as not mentioning the entity.
        entity: Substring to look for (case-sensitive).
        window: Number of characters of context kept on each side of the
            first occurrence of ``entity``. Defaults to 30.
        classifier: Optional callable with the same call signature as a
            Transformers text-classification pipeline. Defaults to the
            module-level ``sentiment_pipeline``.

    Returns:
        A ``(label, score)`` tuple. When the entity is present these are the
        classifier's raw label and confidence (the score is NOT sign-adjusted
        here); otherwise ``("NEUTRAL", 0.0)``.
    """
    if not isinstance(text, str):
        # `entity in text` would raise TypeError for NaN read from the CSV.
        return "NEUTRAL", 0.0

    position = text.find(entity)  # first occurrence only; -1 when absent
    if position == -1:
        return "NEUTRAL", 0.0

    start_idx = max(position - window, 0)
    end_idx = min(position + len(entity) + window, len(text))
    context = text[start_idx:end_idx]

    if classifier is None:
        classifier = sentiment_pipeline
    sentiment = classifier(context)[0]
    return sentiment['label'], sentiment['score']

# Calculate overall sentiment scores for 'body_parent' and 'body_child'
data['sentiment_parent'] = data['body_parent'].apply(compute_overall_sentiment)
data['sentiment_child'] = data['body_child'].apply(compute_overall_sentiment)


# Per-entity sentiment of both texts. For every entity four columns are
# produced: sentiment_parent_<e>, score_parent_<e>, sentiment_child_<e>,
# score_child_<e>. The columns are collected in a dict and attached with a
# single concat: inserting them one at a time fragments the DataFrame, and
# unpacking zip(*...) fails when the frame has no rows.
entity_columns = {}
for entity in climate_entities:
    for side in ('parent', 'child'):
        scored = [compute_entity_sentiment(text, entity) for text in data[f'body_{side}']]
        entity_columns[f'sentiment_{side}_{entity}'] = [label for label, _ in scored]
        entity_columns[f'score_{side}_{entity}'] = [score for _, score in scored]

# Drop same-named columns first so re-running the cell replaces them instead
# of creating duplicates.
data = pd.concat(
    [
        data.drop(columns=list(entity_columns), errors='ignore'),
        pd.DataFrame(entity_columns, index=data.index),
    ],
    axis=1,
)


data.to_csv('/content/drive/MyDrive/Labeled Dataset/long_sentiments.csv', index=False)


In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence_transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB