## Master Thesis Data Collection

In [None]:
#import all the necessary packages to scrape the data

import os
import time

import google.generativeai as genai
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

### Hugging Face Pipeline Transformer - German Sentiment BERT

In [None]:
from transformers import pipeline

# Hugging Face sentiment-analysis pipeline backed by the
# oliverguhr/german-sentiment-bert checkpoint.
sent_pipeline = pipeline(
    task="sentiment-analysis",
    model="oliverguhr/german-sentiment-bert",
)

In [None]:
# Load the starting dataset (Master_Dataset_v1.xlsx) into a DataFrame.
df = pd.read_excel("Master_Dataset_v1.xlsx")

In [None]:
# Create the label and score columns for German Sentiment BERT and mark every
# row as not yet processed with the "pending" sentinel.
df["GSentBERT_label"] = df["GSentBERT_score"] = "pending"

In [None]:
# Score every article that is still marked "pending" with German Sentiment
# BERT. Rows that fail keep the "pending" sentinel, so re-running this cell
# retries only those rows.
for row in tqdm(range(len(df.Text))):
    # Fixed: this loop previously read and wrote the GFinBERT_* columns, so the
    # GSentBERT_* columns initialised for this model were never filled.
    if df.GSentBERT_label[row] != "pending":
        continue
    try:
        # Only the first 512 characters of the article are passed to the model
        # (presumably to stay within the model's input limit).
        score = sent_pipeline(df.Text[row][:512])
        df.loc[row, "GSentBERT_label"] = score[0]["label"]
        df.loc[row, "GSentBERT_score"] = score[0]["score"]
    except Exception as exc:
        # Best effort: report the failure and move on. Catching Exception
        # (not a bare except) keeps Ctrl-C able to stop the loop.
        tqdm.write(f"Row {row}: GSentBERT scoring failed ({exc!r}); left as pending")

In [None]:
# Display the first rows of the DataFrame.
df.head()

In [None]:
# Save the dataset as version 2; the DataFrame index is not written to the file.
df.to_excel("Master_Dataset_v2.xlsx", index = False)

### Hugging Face Pipeline Transformer - German FinBERT

In [None]:
from transformers import pipeline

# Text-classification pipeline backed by the
# scherrmann/GermanFinBert_SC_Sentiment checkpoint. It rebinds sent_pipeline.
sent_pipeline = pipeline(
    task="text-classification",
    model="scherrmann/GermanFinBert_SC_Sentiment",
)

In [None]:
# Load version 2 of the dataset. A relative path is used so this cell reads the
# file from the same working directory that to_excel("Master_Dataset_v2.xlsx")
# wrote to, instead of a machine-specific absolute path.
df = pd.read_excel("Master_Dataset_v2.xlsx")

In [None]:
# Create the label and score columns for German FinBERT and mark every row as
# not yet processed with the "pending" sentinel.
df["GFinBERT_label"] = df["GFinBERT_score"] = "pending"

In [None]:
# Score every article that is still marked "pending" with German FinBERT.
# Rows that fail keep the "pending" sentinel, so re-running this cell retries
# only those rows.
for row in tqdm(range(len(df.Text))):
    if df.GFinBERT_label[row] != "pending":
        continue
    try:
        # Only the first 512 characters of the article are passed to the model
        # (presumably to stay within the model's input limit).
        score = sent_pipeline(df.Text[row][:512])
        df.loc[row, "GFinBERT_label"] = score[0]["label"]
        df.loc[row, "GFinBERT_score"] = score[0]["score"]
    except Exception as exc:
        # Best effort: report the failure instead of hiding it. Catching
        # Exception (not a bare except) keeps Ctrl-C able to stop the loop.
        tqdm.write(f"Row {row}: GFinBERT scoring failed ({exc!r}); left as pending")

In [None]:
# Save the dataset as version 3; the DataFrame index is not written to the file.
df.to_excel("Master_Dataset_v3.xlsx", index = False)

### Hugging Face Pipeline Transformer - FinBERT

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Text-classification pipeline for the ProsusAI/finbert checkpoint,
# placed on device 0.
pipe = pipeline(
    task="text-classification",
    model="ProsusAI/finbert",
    device=0,
)

In [None]:
# Load version 3 of the dataset. A relative path is used so this cell reads the
# file from the same working directory that to_excel("Master_Dataset_v3.xlsx")
# wrote to, instead of a machine-specific absolute path.
df = pd.read_excel("Master_Dataset_v3.xlsx")

In [None]:
# Create the label and score columns for FinBERT and mark every row as not yet
# processed with the "pending" sentinel.
df["FinBERT_label"] = df["FinBERT_score"] = "pending"

In [None]:
# Display the first rows of the DataFrame.
df.head()

In [None]:
# Score every article that is still marked "pending" with FinBERT. Rows that
# fail keep the "pending" sentinel, so re-running this cell retries only those
# rows.
for row in tqdm(range(len(df.Text))):
    if df.FinBERT_label[row] != "pending":
        continue
    try:
        # Only the first 512 characters of the article are passed to the model
        # (presumably to stay within the model's input limit).
        score = pipe(df.Text[row][:512])
        df.loc[row, "FinBERT_label"] = score[0]["label"]
        df.loc[row, "FinBERT_score"] = score[0]["score"]
    except Exception as exc:
        # Best effort: report the failure instead of hiding it. Catching
        # Exception (not a bare except) keeps Ctrl-C able to stop the loop.
        tqdm.write(f"Row {row}: FinBERT scoring failed ({exc!r}); left as pending")

In [None]:
# Save the dataset as version 4; the DataFrame index is not written to the file.
df.to_excel("Master_Dataset_v4.xlsx", index = False)

### OpenAI ChatGPT 3.5

In [None]:
# Load version 4 of the dataset.
df = pd.read_excel("Master_Dataset_v4.xlsx")

In [None]:
# Create the OpenAI client. The key is taken from the OPENAI_API_KEY
# environment variable when it is set, so a real key does not have to be pasted
# into the notebook; otherwise the original placeholder is used unchanged.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "insert your key here"))

In [None]:
# Create the GPT-3.5 label column and mark every row as not yet processed.
df["GPT3_5_label"] = "pending"

In [None]:
# The system prompt and the task instruction are the same for every article,
# so they are built once instead of on every iteration. Both texts are sent to
# the model verbatim and are therefore kept exactly as originally written.
role = """
                You are a helpful financial analyst. I will provide you with a financial news article in german and I want you to read it and give me your sentiment about it.
                    """

instruction= """
            I will provide you with a financial news article in german and I want you to read it and give me your sentiment about it. You shold
            express if the provided information is positive, negative or neutral for the swiss stock market. If you think that the information is positive
            and the stock market will likely go up, I want your answer be BUY. If you think that the information is negative and the stock market will likely go down,
            I want your answer be SELL. If you think that the information is neutral and the stock market will likely stay the same, I want your answer be HOLD.
            In the case the information is not really relevant for the stock market, I want your answer to be HOLD.

            I'am going to use this information to cluster the news articles and construct a weekly sentiment index for the swiss stock market.

            I want your answer be exaclly one of the following options, just one word, dont add anything else:

            BUY, HOLD, SELL
                        """

# Answers are normalised to upper case before being stored. The numeric label
# mapping later in the notebook only knows the upper-case spellings, so a raw
# "Buy" or "sell" would otherwise end up as a missing value.
valid_labels = {"BUY", "HOLD", "SELL"}

# Ask GPT-3.5 for a BUY/HOLD/SELL verdict on every article still marked
# "pending". Rows whose request fails or whose answer is not one of the three
# labels stay "pending", so re-running this cell retries only those rows.
for i in tqdm(range(len(df))):
    if df.GPT3_5_label[i] != "pending":
        continue

    # Wait one second before each request to throttle the API calls.
    time.sleep(1)

    try:
        article_text = df.Text[i]
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo-0125",
            max_tokens=2,
            temperature=0.0,
            messages=[
                {"role": "system", "content": role},
                {"role": "user", "content": instruction + " " + article_text},
            ],
        )
        # The content can be None; treat that as an empty (invalid) answer.
        answer = (completion.choices[0].message.content or "").strip().upper()
        if answer in valid_labels:
            df.loc[i, "GPT3_5_label"] = answer
    except Exception as exc:
        # Best effort: report the failure instead of hiding it. Catching
        # Exception (not a bare except) keeps Ctrl-C able to stop the loop.
        tqdm.write(f"Row {i}: GPT-3.5 request failed ({exc!r}); left as pending")

In [None]:
# Display the first rows of the DataFrame.
df.head()

In [None]:
# Save the dataset as version 5; the DataFrame index is not written to the file.
df.to_excel("Master_Dataset_v5.xlsx", index = False)

### Gemini API

In [None]:
# Load version 5 of the dataset.
df = pd.read_excel("Master_Dataset_v5.xlsx")

In [None]:
# Create the Gemini label column and mark every row as not yet processed.
df["Gemini_label"] = "pending"

In [None]:
# Configure the Gemini client. The key is taken from the GOOGLE_API_KEY
# environment variable when it is set, so a real key does not have to be pasted
# into the notebook; otherwise the original placeholder is used unchanged.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", 'insert your key here'))

# Generation settings passed to the model: temperature 0, top_p 1, top_k 1 and
# at most two output tokens.
generation_config = {
  "temperature": 0,
  "top_p": 1,
  "top_k": 1,
  "max_output_tokens": 2,
}

model = genai.GenerativeModel(model_name="gemini-1.0-pro", generation_config=generation_config,)


# Prompt texts are sent to the model verbatim and are kept exactly as written.
role = """
You are a helpful financial analyst. I will provide you with a financial news article in german and I want you to read it and give me your sentiment about it.
        """
instruction= """
I will provide you with a financial news article in german and I want you to read it and give me your sentiment about it. You shold
express if the provided information is positive, negative or neutral for the swiss stock market. If you think that the information is positive
and the stock market will likely go up, I want your answer be BUY. If you think that the information is negative and the stock market will likely go down,
I want your answer be SELL. If you think that the information is neutral and the stock market will likely stay the same, I want your answer be HOLD.
In the case the information is not really relevant for the stock market, I want your answer to be HOLD.

I'am going to use this information to cluster the news articles and construct a weekly sentiment index for the swiss stock market.

I want your answer be exaclly one of the following options, just one word, dont add anything else:

BUY, HOLD, SELL
                """

# NOTE(review): labelling starts at row 25000, so earlier rows keep the
# "pending" sentinel. This looks like the resume point of an interrupted run;
# set it to 0 to label the whole dataset. Confirm before a fresh run.
GEMINI_START_ROW = 25000

# Answers are normalised (surrounding whitespace removed, upper-cased) before
# being stored. The numeric label mapping later in the notebook only knows the
# upper-case spellings.
valid_labels = {"BUY", "HOLD", "SELL"}

# Ask Gemini for a BUY/HOLD/SELL verdict on every article still marked
# "pending". Rows whose request fails or whose answer is not one of the three
# labels stay "pending", so re-running this cell retries only those rows.
for i in tqdm(range(GEMINI_START_ROW, len(df.Gemini_label))):
    if df.Gemini_label[i] != "pending":
        continue
    try:
        article_text = df.Text[i]
        response = model.generate_content(f'{role} \n {instruction}\n{article_text}')
        answer = response.text.strip().upper()
        if answer in valid_labels:
            df.loc[i, "Gemini_label"] = answer
    except Exception as exc:
        # Best effort: report the failure instead of hiding it. Catching
        # Exception (not a bare except) keeps Ctrl-C able to stop the loop.
        tqdm.write(f"Row {i}: Gemini request failed ({exc!r}); left as pending")
    # Throttle after every attempt, including failed ones, so an erroring API
    # is not hit again immediately.
    time.sleep(0.8)

In [None]:
# Count the rows whose Gemini label is exactly "HOLD", "BUY" or "SELL"
# (upper-case spellings only).
sum(df.Gemini_label.isin(["HOLD", "BUY", "SELL"]))

In [None]:
# Save the dataset as version 6; the DataFrame index is not written to the file.
df.to_excel("Master_Dataset_v6.xlsx", index = False)

### SwissFinBERT

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Text-classification pipeline for the AlGatone21/SwissFinBERT checkpoint,
# placed on device 0. It rebinds pipe.
pipe = pipeline(
    task="text-classification",
    model="AlGatone21/SwissFinBERT",
    device=0,
)

In [None]:
# Try the classifier on one German sentence about a falling Swiss stock market
# and pessimistic investors.
pipe("Schweizer Aktienmarkt sinkt um 2%, die Anleger sind sehr sehr pessimistisch")

In [None]:
# Load version 6 of the dataset.
df = pd.read_excel("Master_Dataset_v6.xlsx")

In [None]:
# Create the label and score columns for SwissFinBERT and mark every row as
# not yet processed with the "pending" sentinel.
df["SwissFinBERT_label"] = df["SwissFinBERT_score"] = "pending"

In [None]:
# Score every article that is still marked "pending" with SwissFinBERT. Rows
# that fail keep the "pending" sentinel, so re-running this cell retries only
# those rows.
for row in tqdm(range(len(df.Text))):
    if df.SwissFinBERT_label[row] != "pending":
        continue
    try:
        # Only the first 512 characters of the article are passed to the model
        # (presumably to stay within the model's input limit).
        score = pipe(df.Text[row][:512])
        df.loc[row, "SwissFinBERT_label"] = score[0]["label"]
        df.loc[row, "SwissFinBERT_score"] = score[0]["score"]
    except Exception as exc:
        # Best effort: report the failure instead of hiding it. Catching
        # Exception (not a bare except) keeps Ctrl-C able to stop the loop.
        tqdm.write(f"Row {row}: SwissFinBERT scoring failed ({exc!r}); left as pending")

In [None]:
# Print how often each SwissFinBERT label occurs.
label_counts = df['SwissFinBERT_label'].value_counts()
print(label_counts)

In [None]:
# Show the column names of the DataFrame.
df.columns

In [None]:
# Print how often each German FinBERT label occurs.
label_counts = df['GFinBERT_label'].value_counts()
print(label_counts)

In [None]:
# Save the dataset as version 15; the DataFrame index is not written to the file.
# NOTE(review): the file version jumps from v6 (loaded for this section) to v15
# here — confirm this is intended.
df.to_excel("Master_Dataset_v15.xlsx", index = False)

### Unify Formats

In [None]:
# Load version 15 of the dataset.
df = pd.read_excel("Master_Dataset_v15.xlsx")

In [None]:
# Reduce the dataset to the date, the label column of every model and the
# smi_nlabel_t1 column.
# NOTE(review): "GSebtBERT_label" differs from the "GSentBERT_label" column
# created in the German Sentiment BERT section; the spelling is kept because it
# must match the header stored in Master_Dataset_v15.xlsx — confirm.
selected_columns = [
    "Date",
    "GSebtBERT_label",
    "GFinBERT_label",
    "FinBERT_label",
    "GPT3_5_label",
    "Gemini_label",
    "SwissFinBERT_label",
    "smi_nlabel_t1",
]
df = df[selected_columns]
df.head()

In [None]:
# Common numeric encoding for the labels of all models:
# 2 = positive / BUY, 1 = neutral / HOLD, 0 = negative / SELL.
# Rows still marked "pending" (never labelled) are encoded as 1.
label_matchings = { "BUY" : 2, "HOLD" : 1, "SELL" : 0, "positive" : 2, "neutral" : 1, "negative" : 0, "pending" : 1, "Negativ" : 0, "Neutral" : 1, "Positiv" : 2}

# Case-insensitive view of the mapping. The LLM cells also accept answers such
# as "Buy" or "sell", which a case-sensitive lookup would silently turn into
# missing values.
normalized_matchings = {key.lower(): value for key, value in label_matchings.items()}

label_columns = [
    "GSebtBERT_label",
    "GFinBERT_label",
    "FinBERT_label",
    "GPT3_5_label",
    "Gemini_label",
    "SwissFinBERT_label",
]

# Replace the text labels by their numeric code; labels that are not in the
# mapping become NaN, as before.
for column in label_columns:
    cleaned = df[column].astype(str).str.strip().str.lower()
    df[column] = cleaned.map(normalized_matchings)

df.head()

In [None]:
# Correct the misspelled column name so it reads "GSentBERT_label".
df = df.rename(columns={"GSebtBERT_label": "GSentBERT_label"})
df.head()