# Deepseek for Sentiment Analysis

In [1]:
import ollama
from ollama import chat
from ollama import ChatResponse
import time
import datetime
import tiktoken
import re
import kagglehub
import os
import pandas as pd
from tqdm import tqdm

In [2]:
# Might need to step away and keep PC awake if data volume is too large:
import ctypes

# Prevent the system from sleeping
ES_CONTINUOUS = 0x80000000
ES_SYSTEM_REQUIRED = 0x00000001
ctypes.windll.kernel32.SetThreadExecutionState(ES_CONTINUOUS | ES_SYSTEM_REQUIRED)

-2147483648

In [3]:
def ask_deepseek(input_content, system_prompt, deep_think = True, print_log = True):
    response: ChatResponse = chat(model='deepseek-r1:7b',
                                  messages=[
                                      {'role' : 'system','content' : system_prompt},
                                      {'role' : 'user','content' : input_content}
                                      ]
                                 )
    response_text = response['message']['content']
    if print_log: print(response_text)
    #Extract and separate Deepthink:
    think_texts = re.findall(r'<think>(.*?)</think>', response_text, flags=re.DOTALL)
    think_texts = "\n\n".join(think_texts).strip()
    #Exclude Deep Think and return response
    clean_response = re.sub(r'<think>.*?</think>','', response_text, flags=re.DOTALL).strip()
    #Return either response or tuple with response and deep think
    return clean_response if not deep_think else (clean_response, think_texts)

In [4]:
path = kagglehub.dataset_download("ankurzing/sentiment-analysis-for-financial-news")

In [5]:
data_labeled = pd.read_csv(path+'\\'+'all-data.csv', encoding='latin1', header=None, names=['sentiment', 'headline'])

In [6]:
data_labeled

Unnamed: 0,sentiment,headline
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [7]:
headline_data = pd.DataFrame(data_labeled['headline'])

In [8]:
headline_data

Unnamed: 0,headline
0,"According to Gran , the company has no plans t..."
1,Technopolis plans to develop in stages an area...
2,The international electronic industry company ...
3,With the new production plant the company woul...
4,According to the company 's updated strategy f...
...,...
4841,LONDON MarketWatch -- Share prices ended lower...
4842,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,Operating profit fell to EUR 35.4 mn from EUR ...
4844,Net sales of the Paper segment decreased to EU...


system_prompt_sentiment = '''
You are a professional financial analyst in charge of guiding investment decisions based on news headlines.
You will be provided with several headlines. For each headline, provide the following:

<ID=n>
Label: [positive | negative | neutral]
<think>One-sentence explanation for your decision.</think>

Only use this format. Repeat for each ID.
'''

#feed headlines to deepseek using apply
headline_data[['LLM_sentiment', 'LLM_deep_think']] = headline_data['headline'].head(200).apply(lambda x: ask_deepseek(x,
                                                                                                            system_prompt_sentiment,
                                                                                                            print_log=False)).apply(pd.Series)

#### Using row-by-row Pandas apply is extremely slow an inefficient, <br> best to batch multiple headlines together and ask Deepseek to evaluate parsed batches

In [9]:
# Need to batch headlines, assign a Unique ID to each
def build_batched_prompt(headlines, start_index=0):
    formatted = []
    for i, headline in enumerate(headlines, start=start_index):
        formatted.append(f"<ID={i}> {headline.strip()}")
    return "\n".join(formatted)

In [10]:
def parse_deepseek_response(response_text, batch_size):
    labels = [None] * batch_size
    reasons = [None] * batch_size

    pattern = r"<ID=(\d+)>\s*Label:\s*(\w+)\s*Explanation:\s*(.+?)(?:\n\n|$)"
    for match in re.finditer(pattern, response_text, re.DOTALL | re.IGNORECASE):
        idx, label, reason = match.groups()
        idx = int(idx)
        if 0 <= idx < batch_size:
            labels[idx] = label.strip().lower()
            reasons[idx] = reason.strip()

    return labels, reasons

In [11]:
def batch_sentiment_analysis(df, text_col, system_prompt, batch_size=10, debug=False):
    all_labels = []
    all_reasons = []

    for i in tqdm(range(0, len(df), batch_size)):
        batch = df[text_col].iloc[i:i+batch_size].tolist()
        input_text = build_batched_prompt(batch, start_index=i)

        response, _ = ask_deepseek(input_text, system_prompt, deep_think=True, print_log=False)

        if debug and i < batch_size * 2:
            print(f"\n--- Raw Response for Batch starting at index {i} ---")
            print(response)
            print("\n--- End of Raw Response ---\n")

        labels, reasons = parse_deepseek_response(response, len(batch))
        all_labels.extend(labels)
        all_reasons.extend(reasons)

    df['LLM_sentiment'] = all_labels
    df['LLM_deep_think'] = all_reasons
    return df

In [12]:
system_prompt_sentiment = """
You are a professional financial analyst classifying financial news headlines.

Each headline is preceded by an ID in the format <ID=n>.

For each headline, provide:
- A sentiment label from this list: [positive | negative | neutral]
- A one-sentence explanation starting with 'Explanation:'.

VERY IMPORTANT:
- Do NOT summarize the headlines.
- Do NOT explain your output overall.
- Do NOT group headlines together.
- Respond ONLY using the format below:

<ID=0>
Label: positive
Explanation: This headline indicates improving revenue outlook.

<ID=1>
Label: neutral
Explanation: The announcement does not significantly impact financials.

<ID=2>
Label: negative
Explanation: The headline reports declining profits.

Repeat for each headline provided, using this format exactly.
Do not skip any headline.
"""

In [13]:
results_df = batch_sentiment_analysis(headline_data.head(60),
                                      'headline', 
                                      system_prompt_sentiment, 
                                      batch_size=10, 
                                      debug=True)

 17%|██████████████                                                                      | 1/6 [00:14<01:12, 14.57s/it]


--- Raw Response for Batch starting at index 0 ---
<ID=0>
Label: neutral
Explanation: The company plans to retain production in Russia but is growing there, indicating cautiousness without clear direction.

<ID=1>
Label: positive
Explanation: Technopolis's expansion strategy suggests potential growth and development.

<ID=2>
Label: negative
Explanation: layoffs indicate workforce reduction which could affect productivity or morale.

<ID=3>
Label: positive
Explanation: Increased production capacity aligns with demand, enhancing efficiency and profitability.

<ID=4>
Label: positive
Explanation: A strategic plan with expected sales growth indicates business expansion.

<ID=5>
Label: positive
Explanation: Aggressive pursuit of high-tech products suggests growth potential in the electronics industry.

<ID=6>
Label: positive
Explanation: Doubling sales and achieving zero pre-tax profit from a loss shows improvement and profitability.

<ID=7>
Label: positive
Explanation: Third-quarter growth

 33%|████████████████████████████                                                        | 2/6 [00:36<01:15, 18.80s/it]


--- Raw Response for Batch starting at index 10 ---
<ID=10> Label: Positive  
Explanation: The offer aligns with TeliaSonera's strategy to strengthen Eesti Telekom's customer offerings.  

<ID=11> Label: Positive  
Explanation: Credit Suisse First Boston raised the fair value for shares in four Nordic forestry groups, indicating positive market expectations.  

<ID=12> Label: Neutral  
Explanation: A purchase agreement for gasoline with a specified price increase lacks significant sentiment indication.  

<ID=13> Label: Positive  
Explanation: Finnish Talentum reported an increase in operating profit and net sales, signaling financial growth.  

<ID=14> Label: Positive  
Explanation: Sepp+Æl+Æ's clothing retail chain saw increased sales and operating profit compared to 2004.  

<ID=15> Label: Positive  
Explanation: The company moved from a prior loss to an increase in net sales, indicating improved financial health.  

<ID=16> Label: Positive  
Explanation: Both divisions reported in

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [01:41<00:00, 16.89s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['LLM_sentiment'] = all_labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['LLM_deep_think'] = all_reasons


In [14]:
# Restore default sleep behavior
ctypes.windll.kernel32.SetThreadExecutionState(0x80000000)

-2147483647

In [16]:
results_df.head(10)

Unnamed: 0,headline,LLM_sentiment,LLM_deep_think
0,"According to Gran , the company has no plans t...",neutral,The company plans to retain production in Russ...
1,Technopolis plans to develop in stages an area...,positive,Technopolis's expansion strategy suggests pote...
2,The international electronic industry company ...,negative,layoffs indicate workforce reduction which cou...
3,With the new production plant the company woul...,positive,Increased production capacity aligns with dema...
4,According to the company 's updated strategy f...,positive,A strategic plan with expected sales growth in...
5,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,positive,Aggressive pursuit of high-tech products sugge...
6,"For the last quarter of 2010 , Componenta 's n...",positive,Doubling sales and achieving zero pre-tax prof...
7,"In the third quarter of 2010 , net sales incre...",positive,Third-quarter growth in net sales and operatin...
8,Operating profit rose to EUR 13.1 mn from EUR ...,positive,Operating profit rise over several quarters in...
9,"Operating profit totalled EUR 21.1 mn , up fro...",positive,Continued increase in operating profits shows ...
