In [None]:
import os
import re
import pandas as pd
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Sanity check: show the installed PyTorch version, then a confirmation line.
print(torch.__version__, "‚úÖ Torch is working and detected!", sep="\n")

In [None]:
# ---------- CONFIG ----------
# Root folder that contains the "Step1" inputs and receives the "Step2" outputs.
# NOTE(review): the value is a single space, which looks like a placeholder —
# confirm / set the real path before running.
base_dir = r" "
category = "Businessmen"

# Both roots share the same layout: <base_dir>/<step>/<category>
input_root, output_root = (
    os.path.join(base_dir, step, category) for step in ("Step1", "Step2")
)

# Make sure the output folder exists (no error if it is already there).
os.makedirs(output_root, exist_ok=True)

In [None]:
# ---------- LOAD SENTIMENT MODEL ----------
print("üîÑ Loading multilingual sentiment model (this may take a minute)...")
model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# The tokenizer and the classifier are loaded (in that order) from the same
# checkpoint name so that they stay in sync.
tokenizer, model = (
    loader.from_pretrained(model_name)
    for loader in (AutoTokenizer, AutoModelForSequenceClassification)
)
print("‚úÖ Sentiment model loaded successfully.")

In [4]:
# ---------- FUNCTIONS ----------
def clean_text(text):
    """Remove links and unsupported symbols from one tweet text.

    Parameters
    ----------
    text : object
        Raw cell value. Values that ``pd.isna`` reports as missing yield ``""``.
        Non-string values (e.g. a numeric cell) are converted with ``str()``
        instead of raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The text without ``http...`` links, restricted to word characters,
        whitespace and ``, . ! ? @ #``, with leading/trailing whitespace removed.
        Inner whitespace is left untouched, so removing a link can leave a
        double space behind.
    """
    if pd.isna(text):
        return ""
    # Spreadsheet cells are not guaranteed to be strings; the regex calls below need one.
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r"http\S+", "", text)       # remove links
    text = re.sub(r"[^\w\s,.!?@#]", "", text) # remove emojis/symbols
    return text.strip()


def get_sentiment_score(text, max_length=512):
    """Score one text as P(positive) - P(negative) using the loaded model.

    Uses the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    text : str
        Text to score (already cleaned). Empty / whitespace-only text is not
        sent to the model.
    max_length : int, optional
        Maximum number of tokens passed to the model. Given explicitly so that
        truncation does not depend on a length limit being present in the
        tokenizer's own configuration.

    Returns
    -------
    float
        Softmax probability at index 2 minus the one at index 0, i.e. a value
        in [-1, 1]. ``0.0`` for empty text or when scoring fails (the failure
        is printed, not raised).
    """
    if not text or text.strip() == "":
        return 0.0
    try:
        inputs = tokenizer(
            text, return_tensors="pt", truncation=True, max_length=max_length
        )
        # Inference only: skip autograd bookkeeping for every scored tweet.
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
        # positive - negative
        # NOTE(review): assumes label order negative / neutral / positive — confirm against model.config.id2label
        return float(probs[2] - probs[0])
    except Exception as e:
        # Best effort: one bad text must not abort the whole batch.
        print(f"‚ö†Ô∏è Sentiment error for text: {text[:50]}... ‚Üí {e}")
        return 0.0

In [None]:
# ---------- MAIN LOOP ----------
# For every person folder under input_root: read that person's workbook, clean
# the tweet text, score each tweet's sentiment, aggregate per week and write
# one summary workbook per person into output_root.

# Optional allow-list of person folders (currently disabled, so every folder
# is processed). Re-enable together with the check at the top of the loop.
# target_persons = [
#     "Ricardo Salinas",
#     "Samuel Bankman",
#     "Tilman Fertitta",
#     "Tim Sweeney",
#     "Tobi Brown",
#     "Vinod Khosla"
# ]

# Columns this loop reads from each input workbook.
required_columns = {"creation_datetime", "Tweet Type", "text", "tweet_id"}

# sorted(): os.listdir returns entries in arbitrary order; sorting makes runs reproducible.
for person_folder in sorted(os.listdir(input_root)):
    # # Skip if not in our target list
    # if person_folder not in target_persons:
    #     continue

    person_path = os.path.join(input_root, person_folder)
    if not os.path.isdir(person_path):
        continue  # skip files

    print(f"\nüß† Processing: {person_folder} ...")

    # File names use the folder name with spaces removed.
    compact_name = person_folder.replace(" ", "")
    input_file = os.path.join(person_path, "Output", f"{compact_name}.xlsx")
    output_file = os.path.join(output_root, f"{compact_name}_Weekly_Behavioral_Summary.xlsx")

    if not os.path.exists(input_file):
        print(f"‚ö†Ô∏è Skipping {person_folder} ‚Äî input file not found at {input_file}")
        continue

    # ---------- LOAD DATA ----------
    df = pd.read_excel(input_file)

    # A workbook with an unexpected layout is skipped (like a missing file)
    # instead of aborting the whole batch with a KeyError.
    missing_columns = sorted(required_columns - set(df.columns))
    if missing_columns:
        print(f"Skipping {person_folder} - missing columns in {input_file}: {missing_columns}")
        continue

    # ---------- PREPROCESS ----------
    # Replace the literal "***" found inside the timestamp text with a space before parsing.
    df["creation_datetime"] = (
        df["creation_datetime"]
        .astype(str)
        .str.replace("***", " ", regex=False)
    )
    # Unparseable timestamps become NaT; such rows get no week below and are
    # therefore left out of the weekly groupby.
    df["creation_datetime"] = pd.to_datetime(df["creation_datetime"], errors="coerce")

    # Normalise tweet-type labels; a missing/empty type counts as a plain tweet.
    df["Tweet Type"] = df["Tweet Type"].fillna("").replace({
        "": "tweet",
        "retweeted": "retweet",
        "replied_to": "reply",
        "quoted": "quote"
    })

    df["clean_text"] = df["text"].apply(clean_text)
    df["length"] = df["clean_text"].str.len()

    print("üí¨ Computing sentiment scores (this may take a few minutes)...")
    df["sentiment"] = df["clean_text"].apply(get_sentiment_score)

    # ---------- WEEKLY AGGREGATION ----------
    # Weekly periods end on Sunday ("W-SUN"); the stored label is the period's
    # start timestamp, i.e. the Monday that opens the week.
    df["week"] = df["creation_datetime"].dt.to_period("W-SUN").dt.start_time

    weekly = df.groupby("week").agg(
        tweet_count=("tweet_id", "count"),
        avg_length=("length", "mean"),
        prop_tweet=("Tweet Type", lambda x: (x == "tweet").mean()),
        prop_retweet=("Tweet Type", lambda x: (x == "retweet").mean()),
        prop_reply=("Tweet Type", lambda x: (x == "reply").mean()),
        prop_quote=("Tweet Type", lambda x: (x == "quote").mean()),
        avg_sentiment=("sentiment", "mean")
    ).reset_index()

    # ---------- SAVE ----------
    print(f"üíæ Saving summary to: {output_file}")
    weekly.to_excel(output_file, index=False)
    print(f"‚úÖ Finished {person_folder} successfully!\n")

# Report the configured category rather than a hard-coded name.
print(f"üéâ All {category} folders processed and saved to Step2 successfully!")