# Deepseek for Sentiment Analysis

In [1]:
import ollama
from ollama import chat
from ollama import ChatResponse
import time
import datetime
import tiktoken
import re
import kagglehub
import os
import pandas as pd
from tqdm import tqdm

In [2]:
# Long runs may outlast the PC's idle timer, so ask Windows to keep the
# system awake while this notebook is working.
import ctypes

# Execution-state flags passed to kernel32.SetThreadExecutionState.
ES_CONTINUOUS = 0x80000000  # keep the requested state until it is changed again
ES_SYSTEM_REQUIRED = 0x00000001  # the system must stay in the working state

# ctypes.windll exists only on Windows; skip the call elsewhere so the
# notebook still runs on Linux/macOS instead of raising AttributeError.
if hasattr(ctypes, "windll"):
    ctypes.windll.kernel32.SetThreadExecutionState(ES_CONTINUOUS | ES_SYSTEM_REQUIRED)

-2147483648

In [3]:
def ask_deepseek(input_content, system_prompt, llm_model = 'deepseek-r1:7b', deep_think = True, print_log = True):
    """Send one system/user message pair to an Ollama chat model and split out its <think> text.

    Args:
        input_content: Text sent as the ``user`` message.
        system_prompt: Text sent as the ``system`` message.
        llm_model: Model name passed to ``chat``.
        deep_think: When true, return ``(answer, think_text)``; otherwise
            return only the answer string.
        print_log: When true, print the raw, unmodified model reply.

    Returns:
        The reply with every ``<think>...</think>`` span removed and outer
        whitespace stripped. With ``deep_think`` it is paired with the
        contents of those spans, joined by blank lines and stripped.
    """
    conversation = [
        {'role' : 'system','content' : system_prompt},
        {'role' : 'user','content' : input_content},
    ]
    reply: ChatResponse = chat(model=llm_model, messages=conversation)
    raw_text = reply['message']['content']
    if print_log:
        print(raw_text)

    # DOTALL lets a single <think> section span several lines
    think_pattern = re.compile(r'<think>(.*?)</think>', flags=re.DOTALL)
    reasoning = "\n\n".join(think_pattern.findall(raw_text)).strip()
    answer = think_pattern.sub('', raw_text).strip()

    if deep_think:
        return answer, reasoning
    return answer

In [4]:
# Download the Kaggle dataset through kagglehub; the returned value is used
# below as the folder that holds all-data.csv.
path = kagglehub.dataset_download("ankurzing/sentiment-analysis-for-financial-news")

In [5]:
# Build the CSV path with os.path.join so the separator is correct on every OS
# (a hard-coded backslash only works on Windows).
# The file has no header row, so the column names are supplied explicitly.
data_labeled = pd.read_csv(os.path.join(path, 'all-data.csv'), encoding='latin1', header=None, names=['sentiment', 'headline'])

In [6]:
data_labeled

Unnamed: 0,sentiment,headline
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [7]:
# Keep only the headline text, as a one-column DataFrame, for the model input
headline_data = data_labeled['headline'].to_frame()

In [8]:
headline_data

Unnamed: 0,headline
0,"According to Gran , the company has no plans t..."
1,Technopolis plans to develop in stages an area...
2,The international electronic industry company ...
3,With the new production plant the company woul...
4,According to the company 's updated strategy f...
...,...
4841,LONDON MarketWatch -- Share prices ended lower...
4842,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,Operating profit fell to EUR 35.4 mn from EUR ...
4844,Net sales of the Paper segment decreased to EU...


system_prompt_sentiment = '''
You are a professional financial analyst in charge of guiding investment decisions based on news headlines.
You will be provided with several headlines. For each headline, provide the following:

<ID=n>
Label: [positive | negative | neutral]
<think>One-sentence explanation for your decision.</think>

Only use this format. Repeat for each ID.
'''

# Row-by-row approach: send each of the first 200 headlines to Deepseek on its own
per_headline_results = headline_data['headline'].head(200).apply(
    lambda headline: ask_deepseek(headline, system_prompt_sentiment, print_log=False)
)
# Each result is an (answer, think_text) tuple; expand it into two columns
headline_data[['LLM_sentiment', 'LLM_deep_think']] = per_headline_results.apply(pd.Series)

#### Using row-by-row Pandas apply is extremely slow and inefficient; <br> it is better to batch multiple headlines together and ask Deepseek to evaluate each batch, then parse the batched response

In [9]:
# Need to batch headlines, assign a Unique ID to each
def build_batched_prompt(headlines, start_index=0):
    """Format headlines as one ``<ID=n> text`` line each, joined by newlines.

    Args:
        headlines: Iterable of headline strings.
        start_index: ID given to the first headline; later ones count up from it.

    Returns:
        A single string with one ``<ID=n> headline`` line per headline, or an
        empty string when ``headlines`` is empty.
    """
    lines = []
    for headline_id, headline in enumerate(headlines, start=start_index):
        # Flatten embedded newlines so every headline stays on its own line.
        # This is done outside the f-string because a backslash inside an
        # f-string expression is a SyntaxError before Python 3.12.
        single_line = headline.strip().replace('\n', ' ')
        lines.append(f"<ID={headline_id}> {single_line}")
    return "\n".join(lines)

In [10]:
def parse_deepseek_response(response_text, batch_size, start_index=0):
    """Extract per-headline labels and explanations from a batched model reply.

    The reply is expected to contain entries shaped like::

        <ID=n>
        Label: positive
        Explanation: One sentence.

    Args:
        response_text: Model reply text to parse.
        batch_size: Number of headlines in the batch; both result lists have
            this length.
        start_index: ID of the first headline in the batch, used to map each
            ``<ID=n>`` back to a position inside the batch.

    Returns:
        ``(labels, reasons)``: two lists of length ``batch_size``. Labels are
        lower-cased and explanations are stripped. Positions whose entry is
        missing or unparsable stay ``None``, and entries whose ID falls
        outside the batch are ignored.
    """
    labels = [None] * batch_size
    reasons = [None] * batch_size

    # An explanation ends at a blank line, at the next <ID=n> tag, or at the
    # end of the text. Stopping at the next tag keeps entries separated by a
    # single newline from being swallowed into the previous explanation. The
    # terminator is a lookahead so the following tag is not consumed.
    pattern = (
        r"<ID=(\d+)>\s*Label:\s*(\w+)\s*Explanation:\s*"
        r"(.+?)(?=\n\s*\n|\s*<ID=\d+>|\s*\Z)"
    )
    for match in re.finditer(pattern, response_text, re.DOTALL | re.IGNORECASE):
        global_idx, label, reason = match.groups()
        idx = int(global_idx) - start_index  # global ID -> batch-local position
        if 0 <= idx < batch_size:  # drop IDs that do not belong to this batch
            labels[idx] = label.strip().lower()
            reasons[idx] = reason.strip()

    return labels, reasons

In [15]:
def batch_sentiment_analysis(df, text_col, system_prompt, batch_size=10, debug=False):
    """Classify the texts in ``df[text_col]`` batch by batch and store the results on ``df``.

    Args:
        df: DataFrame holding the texts; two result columns are added to it in place.
        text_col: Name of the column containing the headline text.
        system_prompt: System prompt passed to ``ask_deepseek`` for every batch.
        batch_size: Number of rows sent to the model per request.
        debug: When true, print the cleaned reply for the first two batches.

    Returns:
        The same ``df`` with ``LLM_sentiment`` (parsed labels) and
        ``LLM_deep_think`` (parsed explanations) added.
    """
    collected_labels, collected_reasons = [], []

    batch_starts = range(0, len(df), batch_size)
    for offset in tqdm(batch_starts):  # tqdm shows a progress bar over the batches
        headlines = df[text_col].iloc[offset:offset + batch_size].tolist()
        # Headline IDs are row positions, so they stay unique across batches
        prompt = build_batched_prompt(headlines, start_index=offset)

        # Only the cleaned answer is used here; the <think> text is dropped
        answer, _think_text = ask_deepseek(prompt, system_prompt, deep_think=True, print_log=False)

        show_raw = debug and offset < batch_size * 2
        if show_raw:
            print(f"\n--- Raw Response for Batch starting at index {offset} ---")
            print(answer)
            print("\n--- End of Raw Response ---\n")

        batch_labels, batch_reasons = parse_deepseek_response(answer, len(headlines), start_index=offset)
        collected_labels += batch_labels
        collected_reasons += batch_reasons

    df['LLM_sentiment'] = collected_labels
    df['LLM_deep_think'] = collected_reasons
    return df

In [12]:
# System prompt for the batched run. The reply format requested here
# (<ID=n>, then "Label:", then "Explanation:") is what the regex in
# parse_deepseek_response looks for, so keep the two in sync.
system_prompt_sentiment = """
You are a professional financial analyst classifying financial news headlines.

Each headline is preceded by an ID in the format <ID=n>.

For each headline, provide:
- A sentiment label from this list: [positive | negative | neutral]
- A one-sentence explanation in English, starting with 'Explanation:'.

VERY IMPORTANT:
- Do NOT summarize the headlines.
- Do NOT explain your output overall.
- Do NOT group headlines together.
- Respond ONLY using the format below:

<ID=0>
Label: positive
Explanation: This headline indicates improving revenue outlook.

<ID=1>
Label: neutral
Explanation: The announcement does not significantly impact financials.

<ID=2>
Label: negative
Explanation: The headline reports declining profits.

Repeat for each headline provided, using this format exactly.
Do not skip any headline.
"""

In [13]:
# Run the batched classification over every headline, printing the replies
# for the first two batches. The function adds its columns to headline_data
# in place and returns that same frame.
results_df = batch_sentiment_analysis(
    df=headline_data,
    text_col='headline',
    system_prompt=system_prompt_sentiment,
    batch_size=10,
    debug=True,
)

  0%|▏                                                                               | 1/485 [00:18<2:30:33, 18.66s/it]


--- Raw Response for Batch starting at index 0 ---
<ID=0>
Label: neutral
Explanation: The company has no plans to move production to Russia despite its growth there.

<ID=1>
Label: positive
Explanation: Technopolis is developing a new tech area, signaling expansion potential.

<ID=2>
Label: neutral
Explanation: Elcoteq laid off employees but scaled back office roles, without clear financial impact.

<ID=3>
Label: positive
Explanation: The company aims to boost capacity and profitability through new production.

<ID=4>
Label: positive
Explanation: Basware targets sales growth with improved margins, positioning for expansion.

<ID=5>
Label: neutral
Explanation: Aspocomp focuses on advanced PCBs, likely a strategic move without direct profit impact.

<ID=6>
Label: positive
Explanation: Componenta's sales doubled last quarter, showing strong performance.

<ID=7>
Label: positive
Explanation: Both net sales and operating profit rose in the third quarter, indicating growth.

<ID=8>
Label: po

  0%|▎                                                                               | 2/485 [00:34<2:16:56, 17.01s/it]


--- Raw Response for Batch starting at index 10 ---
<ID=10>
Label: positive
Explanation: TeliaSonera's offer aligns with its strategy to enhance ownership of core business holdings, suggesting a favorable outcome.

<ID=11>
Label: positive
Explanation: The valuation increase for STORA ENSO and other forestry groups reflects higher investor confidence in their performance.

<ID=12>
Label: neutral
Explanation: A purchase agreement for gasoline at the Platts index indicates routine business activity without significant market impact.

<ID=13>
Label: positive
Explanation: Finnish Talentum reports increased operating profit and net sales, indicating improved financial health.

<ID=14>
Label: neutral
Explanation: The clothing retail chain's sales growth shows moderate success but lacks dramatic implications for the broader market.

<ID=15>
Label: positive
Explanation: The net sales increase and move from a loss to an operating profit signal strong financial improvement.

<ID=16>
Label: posit

 55%|█████████████████████████████████████████▌                                  | 265/485 [1:29:32<1:14:20, 20.27s/it]


KeyboardInterrupt: 

In [None]:
# Restore default sleep behavior: passing ES_CONTINUOUS (0x80000000) on its
# own clears the earlier keep-awake request.
# ctypes.windll exists only on Windows, so skip the call on other platforms
# instead of raising AttributeError.
if hasattr(ctypes, "windll"):
    ctypes.windll.kernel32.SetThreadExecutionState(0x80000000)

In [None]:
results_df.head(20)