In [1]:
# Notebook Name: Generate_CoT_Sentiment_Analysis.ipynb
# Import necessary libraries
import os
import time
from tqdm import tqdm
from openai import OpenAI
from datasets import load_dataset, Value
from huggingface_hub import login

In [2]:
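# # Alternative for runs outside Kaggle: load API keys from environment variables
# # (uncomment this cell and skip the Kaggle Secrets cell below)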
# api_key = os.getenv("DeepSeek_API_KEY")
# hf_token = os.getenv("HF_TOKEN")
#
# # Ensure API keys are available
# if not api_key or not hf_token:
#     raise ValueError("Missing API keys! Set 'DeepSeek_API_KEY' and 'HF_TOKEN' as environment variables.")

In [None]:
# Load API keys from Kaggle Secrets
from kaggle_secrets import UserSecretsClient


def load_kaggle_secrets():
    """
    Read the DeepSeek API key and the Hugging Face token from Kaggle Secrets.

    Returns:
        tuple: ``(api_key, hf_token)`` in that order, as returned by the
        secrets named "DeepSeek_API_KEY" and "HF_TOKEN".
    """
    secrets_client = UserSecretsClient()
    # Looked up in the same order as the returned tuple.
    return (
        secrets_client.get_secret("DeepSeek_API_KEY"),
        secrets_client.get_secret("HF_TOKEN"),
    )


# Fetch both credentials from Kaggle Secrets
api_key, hf_token = load_kaggle_secrets()

# Fail fast when either credential is missing or empty
if not (api_key and hf_token):
    raise ValueError("Missing API keys! Add 'DeepSeek_API_KEY' and 'HF_TOKEN' to your Kaggle Secrets.")

In [3]:
# Function to initialize the API client
def initialize_api_client(api_key):
    """
    Build an OpenAI client that targets the DeepSeek endpoint.

    Args:
        api_key: API key passed to the client.

    Returns:
        An ``OpenAI`` instance configured with ``base_url`` set to
        "https://api.deepseek.com".
    """
    deepseek_base_url = "https://api.deepseek.com"
    client = OpenAI(api_key=api_key, base_url=deepseek_base_url)
    return client

In [4]:
# Function to generate Chain of Thought reasoning
def generate_cot(client, text, label, max_retries=3, retry_delay=2):
    """
    Generate a Chain of Thought (CoT) explanation for sentiment analysis.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (the
            OpenAI-style chat completion interface).
        text: The sentence whose sentiment should be explained.
        label: Gold sentiment label; 0 means negative, 1 means positive.
        max_retries: Total number of attempts made against the API. Values
            below 1 are treated as 1 so the function always performs at
            least one request and always returns a string.
        retry_delay: Seconds to wait between a failed attempt and the next.

    Returns:
        str: The stripped model response, or the sentinel string
        "CoT generation failed" if every attempt raised an exception.

    Raises:
        KeyError: If ``label`` is neither 0 nor 1.
    """
    sentiment_map = {0: "negative", 1: "positive"}

    prompt = f"""As a sentiment analysis expert, generate a step-by-step Chain of Thought (CoT) in English to explain why the following text is {sentiment_map[label]}.
The CoT should follow this structure:
1. Identify key sentiment-bearing words/phrases
2. Analyze contextual clues
3. Consider linguistic patterns
4. Synthesize overall sentiment
5. Conclude with the final sentiment label (0 for negative, 1 for positive)

Text: {text}
CoT:"""

    # Without this clamp, max_retries <= 0 would skip the loop entirely and
    # the function would fall off the end, returning None.
    attempts = max(1, max_retries)

    for attempt in range(attempts):
        try:
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are an expert in sentiment analysis and logical reasoning."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7,
                max_tokens=300,
                stream=False
            )
            # A missing (None) content raises AttributeError here and is
            # handled by the retry logic below like any other failure.
            return response.choices[0].message.content.strip()
        except Exception as e:
            if attempt < attempts - 1:
                print(f"Attempt {attempt+1} failed, retrying...")
                time.sleep(retry_delay)
            else:
                print(f"Failed after {attempts} attempts: {str(e)}")
                return "CoT generation failed"

In [5]:
# Function to add CoT to the dataset
def add_cot_to_dataset(client, dataset, sample_size=None, sleep_time=1):
    """
    Add Chain of Thought explanations to a dataset.

    Args:
        client: API client forwarded to ``generate_cot``.
        dataset: Dataset providing "sentence" and "label" columns and the
            ``select`` / ``add_column`` methods.
        sample_size: Maximum number of leading rows to process. ``None``
            processes every row; 0 selects no rows.
        sleep_time: Seconds to pause between consecutive API requests.

    Returns:
        The (possibly truncated) dataset with an added "Complex_CoT" column
        holding one explanation per row.
    """
    # Compare against None explicitly: a plain truthiness test would treat
    # sample_size=0 as "no limit" and process the whole dataset.
    if sample_size is not None:
        dataset = dataset.select(range(min(sample_size, len(dataset))))

    texts = dataset["sentence"]
    labels = dataset["label"]
    total = len(texts)

    cots = []
    for index, (text, label) in enumerate(tqdm(zip(texts, labels), total=total)):
        cots.append(generate_cot(client, text, label))
        # Throttle between requests only; nothing follows the last one.
        if index < total - 1:
            time.sleep(sleep_time)

    return dataset.add_column("Complex_CoT", cots)


In [6]:
# Function to upload the dataset to Hugging Face
def upload_to_huggingface(dataset, dataset_name, hf_token, is_private=False):
    """
    Push the CoT-enhanced dataset to the Hugging Face Hub.

    Args:
        dataset: Dataset containing a "Complex_CoT" column.
        dataset_name: Target repository id on the Hub.
        hf_token: Hugging Face access token used for login and for the push.
        is_private: Passed to ``push_to_hub`` as ``private``.
    """
    # Authenticate with the Hub
    login(token=hf_token)

    # Cast the CoT column to a plain string feature
    string_feature = Value("string")
    dataset = dataset.cast_column("Complex_CoT", string_feature)

    # Set the dataset's description metadata
    description = "SST-2 dataset enhanced with Chain-of-Thought reasoning for sentiment analysis."
    dataset.info.description = description

    print(f"Uploading dataset to {dataset_name}...")
    dataset.push_to_hub(repo_id=dataset_name, private=is_private, token=hf_token)
    print(f"Dataset successfully uploaded to {dataset_name}")


In [7]:
# Main function to generate and upload the CoT dataset
def create_cot_dataset(api_key, hf_token, dataset_name, sample_size=None, split="train[:1]"):
    """
    Create and upload a Chain of Thought dataset for sentiment analysis.

    Args:
        api_key: DeepSeek API key used to build the chat client.
        hf_token: Hugging Face access token used for the upload.
        dataset_name: Target repository id on the Hugging Face Hub.
        sample_size: Optional cap on the number of rows to process,
            forwarded to ``add_cot_to_dataset``. It cannot exceed the
            number of rows selected by ``split``.
        split: Split expression passed to ``load_dataset`` for GLUE SST-2.
            The default, "train[:1]", loads a single row; pass e.g.
            "train[:500]" for a larger run. Each row costs one API request.

    Returns:
        The dataset with the added "Complex_CoT" column, after it has been
        pushed to the Hub.
    """
    client = initialize_api_client(api_key)

    print("Loading SST-2 dataset...")
    dataset = load_dataset("glue", "sst2", split=split)

    print("Generating Chain of Thought explanations...")
    enhanced_dataset = add_cot_to_dataset(client, dataset, sample_size=sample_size)

    upload_to_huggingface(enhanced_dataset, dataset_name, hf_token)

    return enhanced_dataset


In [None]:
# Target repository on the Hugging Face Hub
dataset_name = "MelodyOfTears/sst2-with-cot"

# Run the pipeline: generate the CoT explanations, then push the result to the Hub
enhanced_dataset = create_cot_dataset(
    api_key=api_key,
    hf_token=hf_token,
    dataset_name=dataset_name,
)

Loading SST-2 dataset...
Generating Chain of Thought explanations...


  2%|▏         | 8/500 [02:16<2:18:49, 16.93s/it]

In [None]:
# Show the generated explanation for the first row
print(enhanced_dataset[0]["Complex_CoT"])