Install Modules

In [6]:
!pip install -U datasets
!pip install wandb
!pip install torch
!pip install Cython

!pip install torch torchvision torchaudio
!pip install -U transformers
!pip install peft
!pip install -U bitsandbytes
!pip install tensorboard
!pip install accelerate -U
!pip install pandas
!pip install tqdm
!pip install paramiko scp
!pip install llama-cpp-python

Load a Model to Memory

In [3]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

from peft import (
    PeftModel,
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)

# Module-level placeholders. load_to_gpu() does NOT fill these in (it only
# assigns locals of the same name); the caller must rebind them from the
# returned tuple, e.g. `model, tokenizer = load_to_gpu()`.
model = None
tokenizer = None
def load_to_gpu():
    """Load the 8-bit base model with the FinGPT LoRA adapter applied.

    Returns:
        (model, tokenizer): the PEFT-wrapped causal LM switched to eval mode,
        and the tokenizer of the base checkpoint.
    """
    base_model = "mistralai/Mistral-7B-v0.1"
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        # Pass the quantization settings through BitsAndBytesConfig; giving
        # `load_in_8bit=True` directly to from_pretrained is the deprecated form.
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        torch_dtype=torch.float16,
        device_map="auto",
    )
    # Same checkpoint id as the model, so the two cannot drift apart.
    tokenizer = AutoTokenizer.from_pretrained(base_model)

    model = PeftModel.from_pretrained(model, "dohonba/mistral_7b_fingpt")
    model.eval()

    return model, tokenizer

def load_llamacpp():
    """Open the local GGUF model file with llama-cpp-python and return the Llama object."""
    # Imported inside the function, so llama_cpp is only needed when this
    # loader is actually called.
    from llama_cpp import Llama

    llama_options = dict(
        model_path=r"C:\Users\bow33\models\mistral_7b_v0.2_fingpt_Q8_0.gguf",
        n_gpu_layers=500,
        n_ctx=3584,
        n_batch=521,
        verbose=True,
    )
    return Llama(**llama_options)

# Choose a model to use
llm = load_llamacpp()
# model, tokenizer = load_to_gpu()

llama_model_loader: loaded meta data with 18 key-value pairs and 291 tensors from C:\Users\bow33\models\mistral_7b_v0.2_fingpt_Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = .
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.

In [None]:
# eval_prompt = """
# Is this sentence self-promotional? Answer with {no/yes}? "Building brick by brick, our analysts motto! Pay a visit to our Community".
# """

# model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# with torch.no_grad():
#     output = model.generate(**model_input, max_new_tokens=150)[0]
#     decoded_output = tokenizer.decode(output, skip_special_tokens=True)

# print(decoded_output)

In [4]:
# Backend selector read by generate(): "gpu" uses the transformers
# `model`/`tokenizer` pair, "llamacpp" uses the llama.cpp handle `llm`.
mode = "llamacpp"

def _extract_answer(text, prompt):
    """Return only the model's answer from `text`, stripped of surrounding whitespace.

    Both backends hand back the prompt followed by the completion. When `text`
    starts with `prompt`, that prefix is removed; otherwise everything after
    the first "Answer:" marker is used, and if there is no marker at all the
    whole text is returned.
    """
    if text.startswith(prompt):
        return text[len(prompt):].strip()
    _, marker, tail = text.partition("Answer:")
    return (tail if marker else text).strip()

def generate(eval_prompt):
    """Run `eval_prompt` through the backend selected by `mode` and return the answer text.

    Args:
        eval_prompt: full prompt string, expected to end with an "Answer:" line.

    Returns:
        The answer as a whitespace-stripped string, or None when `mode` is
        neither "gpu" nor "llamacpp".
    """
    answer = None
    if mode == "gpu":
        model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

        with torch.no_grad():
            output = model.generate(**model_input, max_new_tokens=150)[0]
            decoded_output = tokenizer.decode(output, skip_special_tokens=True)

        answer = _extract_answer(decoded_output, eval_prompt)
    elif mode == "llamacpp":
        # echo=True makes the returned text contain the prompt as well as the
        # completion, so the prompt has to be cut off again.
        output = llm(eval_prompt, max_tokens=12, echo=True)

        answer = _extract_answer(output['choices'][0]['text'], eval_prompt)
    return answer

def classify_sentiment(sentence):
    """Ask the model to place `sentence` on the seven-option sentiment scale named in the prompt.

    Returns whatever generate() returns for that prompt; the raw answer may
    still need post-processing to isolate the label.
    """
    prompt = f"""Context: {sentence}

Question: 'What is the sentiment of this sentence? Please choose an answer from {{strong negative/moderately negative/mildly negative/neutral/mildly positive/moderately positive/strong positive}}.'

Answer: 
"""

    return generate(prompt)

def classify_emotion(sentence):
    """Ask the model which emotion from the list in the prompt `sentence` expresses.

    Returns whatever generate() returns for that prompt; the raw answer may
    still need post-processing to isolate the label.
    """
    # The prompt ends with the same "Answer: " marker (trailing space included)
    # that classify_sentiment uses, so answer extraction in generate() finds
    # it. The lines are written with explicit "\n" escapes so that an editor
    # stripping trailing whitespace cannot silently drop that space.
    eval_prompt = (
        f"Context: {sentence}\n"
        "\n"
        "Question: 'What is the emotion shown in this text? Please choose an answer from {anger/fear/joy/love/sadness/surprise/neutral}'.\n"
        "\n"
        "Answer: \n"
    )

    answer = generate(eval_prompt)
    return answer  # You might need to further process this to extract the emotion

In [None]:
# classify_emotion("I love it. Thanks.")

In [None]:
import paramiko
from scp import SCPClient

def create_ssh_client(server, port, user, password):
    """Open a compressed SSH connection with password authentication and return the client.

    Args:
        server: host name or address to connect to.
        port: TCP port of the SSH server.
        user: login name.
        password: login password.

    Returns:
        The connected paramiko.SSHClient. The caller is responsible for closing it.

    Raises:
        Whatever SSHClient.connect raises; the half-initialised client is
        closed before the exception propagates.
    """
    client = paramiko.SSHClient()
    client.load_system_host_keys()
    # NOTE(security): AutoAddPolicy accepts host keys that are not in the
    # known-hosts file, so the server's identity is not verified on first
    # contact. Kept for compatibility; consider RejectPolicy plus a pinned key.
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        client.connect(server, port, user, password, compress=True)
    except Exception:
        # Do not leak the client when the connection attempt fails.
        client.close()
        raise
    return client

def upload_files(ssh_client, local_path, remote_path):
    """Send `local_path` to `remote_path` over SCP, using the transport of `ssh_client`."""
    transport = ssh_client.get_transport()
    with SCPClient(transport) as scp_session:
        # put() is the upload direction (local -> remote).
        scp_session.put(local_path, remote_path)

def append_file(ssh_client, local_path, remote_path, temp_path):
    """Append the contents of a local file to a file on the remote host.

    The local file is first uploaded to `temp_path`; a remote shell command
    then appends that temporary file to `remote_path` and deletes it. The
    command uses `type`/`del` and backslash paths, so the remote shell is
    presumably Windows cmd.exe.

    Args:
        ssh_client: connected SSH client used for the upload and the command.
        local_path: file on this machine whose contents are appended.
        remote_path: remote target file (forward slashes are converted).
        temp_path: remote scratch location for the uploaded copy.

    Returns:
        True when the remote command exits with status 0, otherwise False.
    """
    # Step 1: Transfer the file to a temporary location
    upload_files(ssh_client, local_path, temp_path)

    temp_path = temp_path.replace("/", "\\")
    remote_path = remote_path.replace("/", "\\")

    # Step 2: Append the content of the temporary file to the target file.
    # Paths are quoted so names containing spaces survive the shell, and `&&`
    # (instead of `&`) ensures the temp file is deleted only after a successful
    # append, so the exit status reflects the append and not just the `del`.
    command = f'type "{temp_path}" >> "{remote_path}" && del "{temp_path}"'
    stdin, stdout, stderr = ssh_client.exec_command(command)

    # Drain the output streams before waiting for the exit status; waiting
    # first can hang when the command produces more output than the channel
    # window holds. Undecodable bytes (e.g. console code pages) are replaced
    # rather than raising.
    output = stdout.read().decode('utf-8', errors='replace')
    error = stderr.read().decode('utf-8', errors='replace')
    exit_status = stdout.channel.recv_exit_status()  # Wait for the command to complete

    # Check if command was successful
    if exit_status == 0:
        print("Command executed successfully")
    else:
        print(f"Command failed with exit status {exit_status}")

    # Optional: Print the outputs for debugging or logging
    if output:
        print("Output:", output)

    if error:
        print("Error:", error)

    return exit_status == 0

def download_files(ssh_client, remote_path, local_path):
    """Fetch `remote_path` into `local_path` over SCP, using the transport of `ssh_client`."""
    transport = ssh_client.get_transport()
    with SCPClient(transport) as scp_session:
        # get() is the download direction (remote -> local).
        scp_session.get(remote_path, local_path)

# Optional: Execute a command or run a script on the remote machine
# stdin, stdout, stderr = ssh_client.exec_command('python /path/to/remote/script.py')
# print(stdout.read().decode())  # Assuming the script has output

def close_client(ssh_client):
    """Terminate the given SSH connection by calling its close() method."""
    ssh_client.close()

ModuleNotFoundError: No module named 'paramiko'

In [None]:
import json
import os
import time
from datetime import datetime, timedelta

import pandas as pd
import pytz
from dateutil import parser
from tqdm import tqdm

# SSH connection settings for the remote host that stores the tweet files.
# Each value can be overridden through an environment variable so credentials
# do not have to be edited into the notebook; the fallbacks keep the previous
# behaviour when nothing is set.
# NOTE(security): the fallback password is still embedded in the source —
# prefer setting TWEETS_SSH_PASSWORD and removing the default.
server = os.environ.get('TWEETS_SSH_SERVER', 'sshhop.hopto.org')
port = int(os.environ.get('TWEETS_SSH_PORT', '22'))
user = os.environ.get('TWEETS_SSH_USER', 'mum')
password = os.environ.get('TWEETS_SSH_PASSWORD', '1234')

# Ticker symbol used to build the tweet file names.
ticker = "TSLA"

def download_tweets(file_path):
    """Download `file_path` from the remote notebook directory to the same relative path locally.

    Opens a fresh SSH connection using the module-level `server`, `port`,
    `user` and `password`, and always closes it again — also when the
    download raises.
    """
    ssh_client = create_ssh_client(server, port, user, password)
    try:
        download_files(ssh_client, f'/C:/Users/Mum/Documents/news_aggregation_ipynb/{file_path}', file_path)
    finally:
        close_client(ssh_client)

def upload_processed_tweets(localprocessed_file_name, final_tweets_file_name):
    """Append a locally saved batch file to the remote final tweets file.

    Args:
        localprocessed_file_name: name of the local batch file (relative to
            the current directory).
        final_tweets_file_name: name of the remote file, inside the remote
            notebook directory, that the batch is appended to.

    Returns:
        The result of append_file (True when the remote append succeeded).
    """
    ssh_client = create_ssh_client(server, port, user, password)
    try:
        final_tweets_file_path = f"C:/Users/Mum/Documents/news_aggregation_ipynb/{final_tweets_file_name}"
        temp_final_tweets_file_path = f"C:/Users/Mum/Documents/news_aggregation_ipynb/temp_{final_tweets_file_name}"
        print(final_tweets_file_path)
        print(temp_final_tweets_file_path)
        result = append_file(ssh_client, f'./{localprocessed_file_name}', final_tweets_file_path, temp_final_tweets_file_path)
    finally:
        # Close the connection even when the upload or remote command raises.
        close_client(ssh_client)
    return result

def _iter_jsonl(path):
    """Yield one decoded JSON object per non-blank line of the JSONL file at `path`."""
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            # A blank line (e.g. a trailing newline) is not valid JSON; skip it.
            if line.strip():
                yield json.loads(line)

def compare_tweets_and_return_new(tweets_file_path, final_tweets_file_path):
    """Download both tweet files and return the tweets not yet present in the final file.

    Args:
        tweets_file_path: JSONL file of collected tweets (downloaded first).
        final_tweets_file_path: JSONL file of already processed tweets
            (downloaded first); only its 'id' values are used.

    Returns:
        List of tweet dicts whose 'id' is absent from the final file, each
        extended with a parsed 'post_time', sorted newest first. If an id
        occurs more than once in the collected file, only its first
        occurrence is returned.
    """
    # Download recent tweets
    download_tweets(tweets_file_path)
    download_tweets(final_tweets_file_path)

    # Ids already present in the final file; extended below so that duplicates
    # inside the collected file are not returned twice.
    known_ids = {tweet['id'] for tweet in _iter_jsonl(final_tweets_file_path)}

    new_tweets = []
    for tweet in _iter_jsonl(tweets_file_path):
        if tweet['id'] in known_ids:
            continue
        known_ids.add(tweet['id'])
        # Aggregate created_at and date, then convert to datetime
        post_time = tweet.get('created_at') or tweet.get('date')
        tweet['post_time'] = parser.parse(post_time)
        new_tweets.append(tweet)

    new_tweets.sort(key=lambda x: x['post_time'], reverse=True)
    return new_tweets

def save_processed_tweets(localprocessed_file_path, tweets):
    """Write `tweets` to `localprocessed_file_path` as JSON Lines, overwriting the file.

    Only a fixed set of fields is kept per tweet: 'id' (required), 'date'
    (falling back to 'created_at', then ''), and the optional text/label
    fields, each defaulting to ''.
    """
    optional_fields = ('cleanContent', 'rawContent', 'url', 'emotion', 'sentiment')

    with open(localprocessed_file_path, 'w', encoding='utf-8') as out:
        for tweet in tweets:
            # Same key order as before: id, date, then the optional fields.
            record = {
                'id': tweet['id'],
                'date': tweet.get('date', tweet.get('created_at', '')),
            }
            for field in optional_fields:
                record[field] = tweet.get(field, '')
            json.dump(record, out)
            out.write('\n')

# Calculate the cutoff time for the last 30 minutes
# (disabled: no cutoff is applied by the code below)
# cutoff_time = datetime.utcnow().replace(tzinfo=pytz.utc) - timedelta(minutes=30)

# File names derived from the ticker. The plain names are the collected tweet
# files; the `final_` names are the files whose tweet ids are treated as
# already processed and to which new results are appended.
twitter_file_path = f'{ticker}_tweets.jsonl'
final_twitter_file_path = f'final_{twitter_file_path}'
stocktweets_file_path = f'{ticker}_stocktweets.jsonl'
final_stocktweets_file_path = f'final_{stocktweets_file_path}'

def process_and_save_tweets(tweets, ticker, final_tweets_file_path, localprocessed_file_path, batch_size=20):
    """Classify every tweet and upload the results in batches.

    Each tweet dict is updated in place with 'sentiment' and 'emotion'. After
    every `batch_size` tweets, and after the last tweet, the pending tweets
    are written to `localprocessed_file_path` and appended to the remote
    final file. Pending tweets are only dropped once the upload reports
    success, so a failed batch is retried together with the next one.

    Args:
        tweets: list of tweet dicts; 'cleanContent' is the text classified.
        ticker: not used by this function; kept for the existing call sites.
        final_tweets_file_path: name of the remote final file to append to.
        localprocessed_file_path: local scratch file for the current batch.
        batch_size: number of tweets per upload (default 20).
    """
    sentiment_results = []
    emotion_results = []
    to_save = []
    last_index = len(tweets) - 1

    for i, tweet in enumerate(tqdm(tweets, desc="Analyzing Tweets")):
        clean_content = tweet.get('cleanContent', '')

        sentiment = classify_sentiment(clean_content)
        emotion = classify_emotion(clean_content)

        # f-strings instead of `+` so a non-string answer (e.g. None) cannot
        # abort the whole run with a TypeError.
        sentiment_results.append(f"{i}: {sentiment}")
        emotion_results.append(f"{i}: {emotion}")

        tweet['sentiment'] = sentiment
        tweet['emotion'] = emotion
        to_save.append(tweet)

        # Save every `batch_size` tweets or on the last tweet
        if (i + 1) % batch_size == 0 or i == last_index:
            save_processed_tweets(localprocessed_file_path, to_save)
            result = upload_processed_tweets(localprocessed_file_path, final_tweets_file_path)
            if result:
                print(f"Saved up to tweet {i+1}")
                to_save = []  # Reset the list for the next batch

    # Print results
    print(sentiment_results)
    print(emotion_results)
    # Only the tweets passed in are listed; the function no longer reaches for
    # the global `stocktweets`, which coupled it to the calling cell's state.
    for i, tweet in enumerate(tweets):
        print(f"{i}: {tweet.get('cleanContent', '')}")

# Endless polling loop: each cycle fetches the tweets that are not yet in the
# final files, classifies and uploads them, then waits about a minute.
# Interrupt the kernel to stop it; cells below this one are never reached by
# "Run All".
counter = 0
while True:
    print("downloading")
    tweets = compare_tweets_and_return_new(twitter_file_path, final_twitter_file_path)
    print("downloaded")
    print("downloading")
    stocktweets = compare_tweets_and_return_new(stocktweets_file_path, final_stocktweets_file_path)
    print("downloaded")

    print("Twitter:", ticker)
    process_and_save_tweets(tweets, ticker, final_twitter_file_path, f'processed_{twitter_file_path}')
    print("Stocktwits:", ticker)
    # Each source gets its own local scratch file; previously the Stocktwits
    # pass reused the Twitter one.
    process_and_save_tweets(stocktweets, ticker, final_stocktweets_file_path, f'processed_{stocktweets_file_path}')

    counter += 1
    print(f"+-+-+-+-+-+-+-+-+-+-Cycles Completed: {counter}+-+-+-+-+-+-+-+-+-+-")
    # Wait 60 seconds in one-second steps before the next cycle.
    for i in range(60):
        time.sleep(1)



Debug

In [None]:
# Debug cell: run sentence-level sentiment classification over one news article.
# NOTE(review): `NewsPlease` and `sent_tokenize` are not imported anywhere in
# the visible notebook — presumably from the news-please and nltk packages;
# confirm and import them before running this cell on a fresh kernel.
article = NewsPlease.from_url('https://finance.yahoo.com/news/alaska-airlines-begun-flying-boeing-150009733.html')

# Split the article text into sentences
sentences = sent_tokenize(article.maintext)
print("Sentences in the article: ", len(sentences))

# Classify each sentence with a progress bar.
# Despite the `emotion*` names, the values collected here are the results of
# classify_sentiment, i.e. sentiment answers.
emotion_results = []
for i, sentence in enumerate(tqdm(sentences, desc="Processing Sentences")):
    emotion = classify_sentiment(sentence)
    emotion_results.append(emotion)

# Do something with the results
print(emotion_results)

# print(article.maintext)