In [17]:
# Import libraries
import pandas as pd
from typing import Dict
import sys
import os
# Put the parent of the current working directory on the import path.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
# Load secret credentials: load_dotenv() reads variables from a .env file into the process environment.
from dotenv import load_dotenv  # pip install python-dotenv
load_dotenv()
# Add ../src to the import path (presumably where preprocess.py lives -- confirm); this has to run
# before the `preprocess` import below. The path is relative, so it resolves against the kernel's
# working directory.
sys.path.append('../src')
# Import the text-cleaning helper from the project's preprocess module
from preprocess import clean_text

In [18]:
# Load the dataset from CSV (path is relative to the kernel's working directory).
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests the file was
# written together with its index -- consider reading it with index_col=0; confirm first.
df = pd.read_csv('../data/reddit_raw.csv')
df.head()

Unnamed: 0,subreddit,title,text,url,created,keyword,score
0,ukraine,Volunteering in civilian roles,"Hi,\n\nI’m an American. When the war broke out...",https://www.reddit.com/r/ukraine/comments/1m3v...,2025-07-19 14:54:55,refugee,62
1,ukraine,<3,As a Polish person I just came here to tell yo...,https://www.reddit.com/r/ukraine/comments/1lxf...,2025-07-11 21:03:10,refugee,544
2,ukraine,The Angry Ukrainian Syndrome: Injustice and St...,I found this useful for understanding my own b...,https://www.reddit.com/r/ukraine/comments/1lio...,2025-06-23 20:38:06,refugee,230
3,ukraine,I wrote this letter to my representatives in c...,**Find your representatives here:** [**https:/...,https://www.reddit.com/r/ukraine/comments/1lfq...,2025-06-20 02:45:54,refugee,125
4,ukraine,Looking forward - would love to hear ppl's tho...,"Sorry for the vague question, but I only have ...",https://www.reddit.com/r/ukraine/comments/1kv8...,2025-05-25 19:56:51,refugee,13


In [19]:
# Preprocess the text data.
# This overwrites the 'text' column in place with clean_text applied to every row, so re-running
# the cell feeds already-cleaned text through clean_text again; reload df for a fresh run.
df['text'] = df['text'].apply(clean_text)
df.head()

Unnamed: 0,subreddit,title,text,url,created,keyword,score
0,ukraine,Volunteering in civilian roles,"Hi, I’m an American. When the war broke out, I...",https://www.reddit.com/r/ukraine/comments/1m3v...,2025-07-19 14:54:55,refugee,62
1,ukraine,<3,As a Polish person I just came here to tell yo...,https://www.reddit.com/r/ukraine/comments/1lxf...,2025-07-11 21:03:10,refugee,544
2,ukraine,The Angry Ukrainian Syndrome: Injustice and St...,I found this useful for understanding my own b...,https://www.reddit.com/r/ukraine/comments/1lio...,2025-06-23 20:38:06,refugee,230
3,ukraine,I wrote this letter to my representatives in c...,Find your representatives here: Subject: Urgen...,https://www.reddit.com/r/ukraine/comments/1lfq...,2025-06-20 02:45:54,refugee,125
4,ukraine,Looking forward - would love to hear ppl's tho...,"Sorry for the vague question, but I only have ...",https://www.reddit.com/r/ukraine/comments/1kv8...,2025-05-25 19:56:51,refugee,13


In [20]:
from google import genai
from google.genai import types # pip install google-genai==1.7.0

from IPython.display import HTML, Markdown, display

In [None]:
from google.api_core import retry # pip install google-api-core


def is_retriable(e):
    """Return True for Gemini API errors that are worth retrying.

    Only `genai.errors.APIError` instances whose status code is 429 or 503
    qualify; every other exception is treated as final.
    """
    return isinstance(e, genai.errors.APIError) and e.code in {429, 503}


# Patch generate_content with automatic retries exactly once. Retry's wrapper
# carries a `__wrapped__` attribute, so checking for it keeps a re-run of this
# cell from stacking one retry layer on top of another.
if not hasattr(genai.models.Models.generate_content, '__wrapped__'):
    genai.models.Models.generate_content = retry.Retry(
        predicate=is_retriable)(genai.models.Models.generate_content)

In [None]:
# Set up your API key: read it from the environment (os.getenv returns None when
# GOOGLE_API_KEY is not set, so make sure it is defined before running this cell).
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Initialize Gemini client
client = genai.Client(api_key=GOOGLE_API_KEY)

In [23]:

def _neutral_fallback() -> Dict[str, object]:
    """Build a fresh 'no usable answer' result (Neutral with zero confidence)."""
    return {'sentiment': 'Neutral', 'confidence': 0.0}


def _parse_sentiment_reply(raw_reply) -> dict:
    """Extract the JSON object from a raw model reply.

    The model sometimes wraps its JSON in Markdown code fences or adds stray
    text around it, so only the span from the first '{' to the last '}' is
    parsed.

    Raises:
        ValueError: If the reply is empty/None or contains no JSON object
            (json.JSONDecodeError, a ValueError subclass, for malformed JSON).
    """
    import json

    cleaned = (raw_reply or "").strip()
    start = cleaned.find("{")
    end = cleaned.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"No JSON object found in model response: {cleaned!r}")
    return json.loads(cleaned[start:end + 1])


def analyze_sentiment_gemini(text: str, max_output_tokens=50) -> Dict[str, object]:
    """
    Performs sentiment analysis on a single text using Gemini 2.0 Flash.

    Args:
        text (str): The input text to analyze. Empty, whitespace-only or
            non-string values (e.g. NaN coming from a DataFrame) are not sent
            to the API and yield the neutral fallback.
        max_output_tokens (int): Max tokens for Gemini output.

    Returns:
        Dict[str, object]: Dictionary with keys 'sentiment' (str) and
        'confidence' (float). On any failure (API error, unparsable reply) the
        error is printed and {'sentiment': 'Neutral', 'confidence': 0.0} is
        returned, so callers never see an exception.
    """
    # Nothing to classify: skip the API call instead of sending "" or "nan".
    if not isinstance(text, str) or not text.strip():
        return _neutral_fallback()

    try:
        prompt = (
            "Classify the sentiment of the following text as Positive, Negative, or Neutral. "
            "Respond ONLY with the sentiment label and confidence score as a decimal between 0 and 1, "
            "in JSON format like this: {\"sentiment\": \"Positive\", \"confidence\": 0.87}\n\n"
            f"Text:\n{text}"
        )

        # temperature=0.0 asks for the least random output the model offers.
        config = types.GenerateContentConfig(temperature=0.0, max_output_tokens=max_output_tokens)

        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=prompt,
            config=config
        )

        # Tolerates code fences and a missing (None) response text.
        result = _parse_sentiment_reply(response.text)

        # Normalize the label's casing; missing or null fields fall back to defaults.
        sentiment = str(result.get('sentiment') or 'Neutral').capitalize()
        confidence = float(result.get('confidence') or 0.0)

        return {
            'sentiment': sentiment,
            'confidence': confidence
        }

    except Exception as e:
        # Deliberate best-effort: one bad row must not abort a whole batch run.
        print(f"Error during sentiment analysis: {e}")
        return _neutral_fallback()


def analyze_sentiments_in_df(df: pd.DataFrame, text_col='text') -> pd.DataFrame:
    """
    Applies Gemini sentiment analysis to all texts in a DataFrame column and adds results.

    One call to analyze_sentiment_gemini is made per row, in row order. The
    input frame is modified in place (the two result columns are added to it)
    and the same object is returned.

    Args:
        df (pd.DataFrame): Input DataFrame. May be empty, in which case the two
            result columns are still added (with no rows).
        text_col (str): Name of the column with text to analyze.

    Returns:
        pd.DataFrame: Original DataFrame with two new columns:
            - 'gemini_sentiment' (str)
            - 'gemini_confidence' (float)

    Raises:
        KeyError: If text_col is not a column of df.
    """
    # Collect plain dicts first; building the columns once avoids creating a
    # pd.Series per row and also works when the frame has no rows.
    results = [analyze_sentiment_gemini(text) for text in df[text_col]]

    # Reuse df.index so the assignment is positional rather than label-aligned.
    df['gemini_sentiment'] = pd.Series(
        [result['sentiment'] for result in results], index=df.index, dtype=object
    )
    df['gemini_confidence'] = pd.Series(
        [result['confidence'] for result in results], index=df.index, dtype=float
    )
    return df


In [24]:
# Perform a quick test of the sentiment analysis function.
# This calls analyze_sentiment_gemini once on a single hand-written sentence and prints the
# returned label and confidence (formatted to two decimals).
test_text = "The Reddit post is a fantastic shit!"

result = analyze_sentiment_gemini(test_text)
print(f"Text: {test_text}")
print(f"Sentiment: {result['sentiment']}")
print(f"Confidence: {result['confidence']:.2f}")

Text: The Reddit post is a fantastic shit!
Sentiment: Negative
Confidence: 0.95


In [25]:
# Perform sentiment analysis on the dataset
#df = analyze_sentiments_in_df(df, text_col='text')
#df[['text', 'gemini_sentiment', 'gemini_confidence']].head()

# Save the results to a new CSV file
#df[['gemini_sentiment', 'gemini_confidence']].to_csv('../data/reddit_sentiment_predicted.csv', index=False)  