In [None]:
%pip install -r "../requirements.txt"

In [None]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

In [None]:
# Ask the driver to reset GPU index 0 (runs under sudo), then print the current GPU status table.
!sudo nvidia-smi --gpu-reset -i 0
!nvidia-smi

In [None]:
import torch

# Reset PyTorch's peak-memory counters so later readings start from a clean baseline.
# reset_max_memory_allocated() is deprecated and simply forwards to
# reset_peak_memory_stats(); call the replacement directly. The availability guard
# keeps this cell runnable on CPU-only machines, where touching the CUDA device raises.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

In [None]:
import torch

# Report whether a CUDA device is usable and, if so, which one.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    # get_device_name() raises when no CUDA device exists, so report that instead of crashing.
    print("No CUDA device available")

In [None]:
!pip install transformers==4.40.1 peft==0.4.0
!pip install sentencepiece
!pip install accelerate
!pip install torch
!pip install peft
!pip install datasets
!pip install bitsandbytes
!pip install numpy pandas scipy


In [None]:
# NOTE(review): the package that provides `import dotenv` is normally published as
# `python-dotenv`; confirm that `dotenv` is the intended distribution.
%pip install dotenv
# NOTE(review): accelerate is also installed in the previous install cell; this repeat looks redundant.
%pip install accelerate

# Twitter RoBERTa Sentiment Analysis

In [None]:
import torch
# Ask PyTorch to release its unused cached CUDA memory.
torch.cuda.empty_cache()


In [None]:
import torch, gc
# Run Python's garbage collector first, then ask PyTorch to release its unused
# cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
import torch
import gc

def print_gpu_memory():
    """Print PyTorch's currently allocated and reserved CUDA memory, in MB.

    Reads ``torch.cuda.memory_allocated()`` and ``torch.cuda.memory_reserved()``
    (both in bytes) and prints each divided by 1024**2 with two decimals.
    The reserved figure is printed under the label "Cached".
    """
    bytes_per_mb = 1 << 20  # 1024 ** 2
    print(f"Allocated: {torch.cuda.memory_allocated() / bytes_per_mb:.2f} MB")
    print(f"Cached: {torch.cuda.memory_reserved() / bytes_per_mb:.2f} MB")

# Snapshot memory usage before releasing anything, for comparison.
print("Before clearing cache:")
print_gpu_memory()

# Collect unreachable Python objects first, then ask PyTorch to release its
# unused cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

# Snapshot again to see how much was released.
print("\nAfter clearing cache:")
print_gpu_memory()

In [None]:
import os
# Request the "expandable_segments" mode of PyTorch's CUDA caching allocator.
# NOTE(review): earlier cells already import torch and call torch.cuda.*; the
# allocator presumably reads this variable when it first initialises, so confirm
# the setting still takes effect this late (or set it before any CUDA use).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Mask user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character becomes ``@user``; a token that starts with
    ``http`` becomes ``http``; every other token is kept unchanged. Tokens are
    re-joined with single spaces, so the original spacing is preserved.
    """
    def _mask(token):
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))
# Checkpoint used throughout the notebook, plus its tokenizer and config
# (config.id2label maps class indices to label names).
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# PyTorch model. (For TensorFlow, TFAutoModelForSequenceClassification with
# return_tensors='tf' is the equivalent path.)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Smoke test: score a single example sentence.
text = "Covid cases are increasing fast!"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    output = model(**encoded_input)
# output[0] holds the logits; [0] selects the single sequence in the batch.
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# Print labels and scores, most probable class first.
ranking = np.argsort(scores)[::-1]
for rank, label_id in enumerate(ranking, start=1):
    label = config.id2label[label_id]
    score = scores[label_id]
    print(f"{rank}) {label} {np.round(float(score), 4)}")


In [None]:
!pip install colorama

In [None]:
from concurrent.futures import ThreadPoolExecutor
from colorama import Fore, Style, init
import pandas as pd
import os
import time
import warnings
from transformers import pipeline

In [None]:
# Initialise colorama with autoreset enabled, so colour codes are reset after
# each print instead of carrying over to later output.
init(autoreset=True)

In [None]:
def load_model(model):
    """Load a pretrained sequence-classification model.

    Args:
        model: Model identifier or path accepted by
            ``AutoModelForSequenceClassification.from_pretrained``.

    Returns:
        The model instance returned by ``from_pretrained``.
    """
    # Use the argument rather than the global MODEL so the function loads what
    # the caller asked for.
    return AutoModelForSequenceClassification.from_pretrained(model)

In [None]:
# NOTE(review): this notebook defines preprocess() twice with the same body;
# this later definition shadows the earlier one.
def preprocess(text):
    """Replace user mentions and links in ``text`` with placeholders.

    Splitting and joining both use a single space. Tokens starting with ``@``
    (and longer than one character) become ``@user``; tokens starting with
    ``http`` become ``http``; all other tokens pass through unchanged.
    """
    masked_tokens = [
        '@user' if (token.startswith('@') and len(token) > 1)
        else 'http' if token.startswith('http')
        else token
        for token in text.split(" ")
    ]
    return " ".join(masked_tokens)

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

# Checkpoint identifier shared by the tokenizer, config and model loaders.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Tokenizer turns text into model inputs; config provides id2label for decoding predictions.
config = AutoConfig.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# Global Constants
# CSV read by sentiment_analysis_of_text_data(); it must contain a "News" column.
INPUT_FILE = "../Data Collection/data.csv"
# CSV that sentiment_analysis_of_text_data() writes its results to.
OUTPUT_FILE = "tweeter_roberta_sentiment_analysis_data.csv"
# Worker count handed to ThreadPoolExecutor (threads, despite the name).
NUM_PROCESSES = 1

# Load the classification model once, when this cell runs.
sentiment_model = load_model(MODEL)

def find_sentiment(text):
    """Classify the sentiment of one text with the loaded ``sentiment_model``.

    The text is masked with ``preprocess``, tokenized, run through the model,
    and the logits are converted to probabilities with ``softmax``.

    Args:
        text: Raw input string.

    Returns:
        list: ``[text, label, score]`` where ``text`` is the original,
        unmodified input (so callers can match rows back to their source),
        ``label`` is the ``config.id2label`` entry of the most probable class
        and ``score`` is that class's probability. If anything fails, the
        error is printed and ``[text, np.nan, np.nan]`` is returned so one bad
        row does not abort a whole batch.
    """
    try:
        cleaned_text = preprocess(text)
        # Truncate explicitly: RoBERTa-base checkpoints accept at most 512
        # tokens, and over-long inputs would otherwise fail inside the model
        # and be recorded as NaN.
        encoded_input = tokenizer(
            cleaned_text, return_tensors='pt', truncation=True, max_length=512
        )
        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            output = sentiment_model(**encoded_input)
        # output[0] holds the logits; [0] selects the single sequence in the batch.
        scores = softmax(output[0][0].detach().numpy())

        # Index of the most probable class.
        top_id = int(np.argmax(scores))
        label = config.id2label[top_id]
        sentiment_score = scores[top_id]

        print(f"{Fore.GREEN} Input:- {cleaned_text[:20]}{Style.RESET_ALL}")

        print(f"{Fore.GREEN} Output:- label = {label}, sentiment_score = {sentiment_score} {Style.RESET_ALL}")

        return [text, label, sentiment_score]
    except Exception as e:
        # Deliberate best-effort: log and mark the row as failed.
        print(f"{Fore.RED} Error: {e}{Style.RESET_ALL}")
        return [text, np.nan, np.nan]



def process_texts_in_parallel(texts):
    """Run ``find_sentiment`` over ``texts`` using a thread pool.

    Args:
        texts: Iterable of input strings.

    Returns:
        list: One ``find_sentiment`` result per input, in input order
        (``executor.map`` preserves ordering).
    """
    print(f"{Fore.CYAN}🔹 Using {NUM_PROCESSES} parallel threads...{Style.RESET_ALL}")

    with ThreadPoolExecutor(max_workers=NUM_PROCESSES) as executor:
        return list(executor.map(find_sentiment, texts))


def sentiment_analysis_of_text_data():
    """Score every text in the ``News`` column of ``INPUT_FILE`` and save the results.

    Results are written to ``OUTPUT_FILE`` with the columns ``News``,
    ``sentiment_label`` and ``sentiment_score``. The run is resumable: rows
    already present in ``OUTPUT_FILE`` are kept, and only texts whose ``News``
    value is not yet in that file are processed. The file is rewritten after
    every batch so an interrupted run loses at most one batch.

    Returns:
        None. Prints an error and returns early if ``INPUT_FILE`` is missing
        or has no ``News`` column.
    """
    if not os.path.exists(INPUT_FILE):
        print(f"{Fore.RED}Error: '{INPUT_FILE}' not found!{Style.RESET_ALL}")
        return

    df = pd.read_csv(INPUT_FILE)

    if "News" not in df.columns:
        print(f"{Fore.RED}Error: 'News' column missing in CSV!{Style.RESET_ALL}")
        return

    texts = df["News"].dropna().tolist()
    result_columns = ["News", "sentiment_label", "sentiment_score"]

    # Resume support for runs that stopped unexpectedly: carry over the rows
    # saved so far, otherwise the first save below would overwrite them.
    all_results = []
    already_processed_texts = set()
    if os.path.exists(OUTPUT_FILE):
        previous_df = pd.read_csv(OUTPUT_FILE)
        already_processed_texts = set(previous_df["News"].dropna().tolist())
        # reindex() tolerates an older file that lacks some result columns.
        all_results = previous_df.reindex(columns=result_columns).values.tolist()

    # Only texts that have not been scored yet are processed.
    pending_texts = [text for text in texts if text not in already_processed_texts]

    # Split into batches to avoid excessive memory usage. Ceiling division
    # avoids an empty trailing batch when the count is a multiple of batch_size.
    batch_size = 500
    total_batches = (len(pending_texts) + batch_size - 1) // batch_size

    def _save_results():
        """Write everything collected so far to OUTPUT_FILE."""
        pd.DataFrame(all_results, columns=result_columns).to_csv(
            OUTPUT_FILE, index=False, encoding="utf-8"
        )

    for i in range(total_batches):
        batch_texts = pending_texts[i * batch_size: (i + 1) * batch_size]
        print(f"{Fore.YELLOW} Processing batch {i + 1}/{total_batches} ({len(batch_texts)} links){Style.RESET_ALL}")

        results = process_texts_in_parallel(batch_texts)
        # Store the original input text in "News" so the resume filter above
        # can match it on the next run; results come back in input order.
        all_results.extend(
            [original_text, label, score]
            for original_text, (_, label, score) in zip(batch_texts, results)
        )

        # Save intermediate results
        _save_results()

    if total_batches == 0:
        # Nothing left to process: still make sure the output file exists.
        _save_results()

    print(f"{Fore.GREEN}✔️ Sentiment Analysis complete! Data saved to '{OUTPUT_FILE}'{Style.RESET_ALL}")

In [None]:
# Entry point: run the full CSV sentiment-analysis pipeline when this cell executes.
if __name__ == "__main__":
    sentiment_analysis_of_text_data()