In [9]:
import os
from dotenv import load_dotenv
import openai
import csv
import pandas as pd
from openai import OpenAI
import matplotlib.pyplot as plt
import fitz
import json

In [2]:
# Load KEY=value pairs from a local .env file so the key can be read from the
# process environment below.
load_dotenv()

# Fail fast when the key is absent or empty, before any request is attempted.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is None or openai_api_key == "":
    raise ValueError("No OpenAI API key found. Check your .env file.")

# Register the key on the openai module.
openai.api_key = openai_api_key

In [4]:
def read_pdf(pdf_path):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        str: The text of all pages concatenated in page order, with no
        separator inserted between pages.
    """
    # The context manager closes the document even if text extraction raises,
    # so file handles do not pile up when many PDFs are processed in a loop.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the growing string once per page.
        return "".join(page.get_text() for page in doc)

In [14]:
def get_pdf_files(folder_path):
    """List the PDF files directly inside a folder.

    Args:
        folder_path: Directory to scan (not searched recursively).

    Returns:
        list[str]: ``folder_path`` joined with each matching file name,
        sorted by file name.
    """
    pdf_paths = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order (and therefore the output row order) reproducible.
    for name in sorted(os.listdir(folder_path)):
        full_path = os.path.join(folder_path, name)
        # Match the extension case-insensitively (".PDF" is common) and skip
        # directories that merely have a name ending in ".pdf".
        if name.lower().endswith('.pdf') and os.path.isfile(full_path):
            pdf_paths.append(full_path)
    return pdf_paths

In [30]:
# Shared OpenAI client used by analyze_text.
# NOTE(review): no key is passed explicitly; presumably the client picks up
# OPENAI_API_KEY from the environment — confirm against the openai docs.
client = OpenAI()

# Running tally of successful chat-completion requests. A `global` statement
# has no effect at module level, so the name is simply assigned here;
# analyze_text declares it global before incrementing it.
api_call_count = 0

def analyze_text(text):
    """Ask the chat model for a per-feature sentiment analysis of article text.

    Sends ``text`` together with a fixed system prompt to the
    ``gpt-3.5-turbo`` chat-completions endpoint and returns the raw reply.
    The module-level ``api_call_count`` is incremented after each request
    that completes without raising.

    Args:
        text: Article text to analyse.

    Returns:
        str: The model's reply with surrounding whitespace stripped (the
        prompt asks for JSON, but the reply is not validated here). An empty
        string is returned when ``text`` is empty or whitespace-only, when
        the reply has no content, or when the request raises.
    """
    global api_call_count

    # A PDF without extractable text yields "" — do not spend an API call
    # on it; the caller treats "" as "nothing to parse".
    if not text or not text.strip():
        print("Warning: empty input text; skipping API call.")
        return ""

    system_prompt = """
    You are provided with a set of Newspaper Articles about geothermal energy. 
    Your task is to analyze the sentiment of the articles to geothermal energy.
    Your task is to find every feature that is mentioned in the articles about geothermal energy and to analyze the sentiment of every feature.
    Only include information about geothermal energy.
    The sentiment can be positive, negative, or neutral.
    If the sentiment is positive, mention the reasons for the positive sentiment.
    If the sentiment is negative, mention the reasons for the negative sentiment.
    If the sentiment is neutral, mention the reasons for the neutral sentiment.
    If you are not sure about the sentiment, mention that you are not sure.
    Also include the name of the newspaper article and the date of the article.
    Ouput the sentiment in a json format.
    the json format should be as follows:
    {
        "newspaper_article1": {
            "date": "date1",
            "sentiment": {
                "feature1": "sentiment1",
                "feature2": "sentiment2",
                "feature3": "sentiment3",
                ...
            }
        },
        "newspaper_article2": {
            "date": "date2",
            "sentiment": {
                "feature1": "sentiment1",
                "feature2": "sentiment2",
                "feature3": "sentiment3",
                ...
            }
        },
        ...
    }
    Output nothing else.
    If you don't find any information about geothermal energy, output a json with just the newspaper article name and date.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text}
    ]

    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0.5,
            max_tokens=500,
        )

        api_call_count += 1

        choice = response.choices[0]

        # A reply cut off at max_tokens is almost certainly invalid JSON;
        # say so here so the later parse failure is explainable.
        if getattr(choice, "finish_reason", None) == "length":
            print("Warning: response was truncated at max_tokens; JSON may be incomplete.")

        # The content can be None; treat that as an empty reply instead of
        # relying on the AttributeError being swallowed below.
        return (choice.message.content or "").strip()

    except Exception as e:
        # Deliberate best-effort: one failed request must not abort a batch
        # run, so report the error and return the "no result" value.
        print(f"An error occurred: {e}")
        return ""

In [17]:
def parse_json(response_text):
    """Parse a model reply into a Python object, tolerating common wrappers.

    Leading/trailing whitespace and a surrounding Markdown code fence
    (three backticks, optionally followed by a language tag such as
    ``json``) are removed before parsing, because chat models frequently
    wrap JSON that way.

    Args:
        response_text: Raw reply text; may be empty or ``None``.

    Returns:
        The decoded JSON value (a dict or a list) on success. An empty dict
        is returned, after printing a message, when the input is not a
        string, does not look like JSON, or fails to decode.
    """
    if not isinstance(response_text, str):
        print("Warning: No valid JSON returned.")
        return {}

    cleaned = response_text.strip()

    if cleaned.startswith("```"):
        # Drop the whole opening fence line, including any language tag.
        newline = cleaned.find("\n")
        cleaned = cleaned[newline + 1:] if newline != -1 else ""
        cleaned = cleaned.rstrip()
        # Drop the closing fence when present.
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        cleaned = cleaned.strip()

    # Cheap sanity check so plain-prose replies get a clear warning rather
    # than a decoder error message.
    if not cleaned.startswith(("{", "[")):
        print("Warning: No valid JSON returned.")
        return {}

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError as e:
        print(f"JSON decoding failed: {e}")
        return {}

In [11]:
def flatten_data(parsed_json):
    """Turn the nested per-article JSON into one flat record per article.

    Expected input shape:
    ``{article_name: {"date": ..., "sentiment": {feature: sentiment, ...}}}``.
    A list of such dicts is also accepted and flattened element by element.

    Args:
        parsed_json: Decoded model reply (dict, or list of dicts).

    Returns:
        list[dict]: One dict per article with the keys ``"newspaper_article"``
        and ``"date"`` plus one key per feature. ``"date"`` is ``None`` when
        the article entry has no date. Articles without a ``"sentiment"``
        entry produce a record with only the two fixed keys. A feature named
        ``"date"`` or ``"newspaper_article"`` overwrites the fixed key.
    """
    # The upstream parser may hand back a top-level JSON array.
    if isinstance(parsed_json, list):
        flat_data = []
        for item in parsed_json:
            flat_data.extend(flatten_data(item))
        return flat_data

    if not isinstance(parsed_json, dict):
        return []

    flat_data = []
    for article, details in parsed_json.items():
        if not isinstance(details, dict):
            details = {}

        # Initialize with article name and date.
        row = {"newspaper_article": article, "date": details.get("date")}

        # The prompt tells the model to return only name and date when an
        # article has nothing on geothermal energy, so "sentiment" is
        # legitimately optional and must not raise KeyError.
        sentiment = details.get("sentiment")
        if isinstance(sentiment, dict):
            # Add each feature sentiment as a new column.
            for feature, value in sentiment.items():
                row[feature] = value
        elif sentiment is not None:
            # Non-mapping sentiment (e.g. a bare string): keep it rather
            # than silently dropping the information.
            row["sentiment"] = sentiment

        flat_data.append(row)
    return flat_data

In [12]:
def create_dataframe(flat_data):
    """Build a pandas DataFrame from the flattened article records.

    Args:
        flat_data: Records to tabulate, passed straight to ``pd.DataFrame``.

    Returns:
        pandas.DataFrame: The table built from ``flat_data``.
    """
    frame = pd.DataFrame(data=flat_data)
    return frame

In [33]:
def get_sentiment_of_newspapers(folder_path):
    """Run the sentiment pipeline over every PDF in a folder.

    Each PDF is read, analysed and parsed; files whose reply parses to an
    empty value are reported and skipped.

    Args:
        folder_path: Directory containing the article PDFs.

    Returns:
        tuple: ``(df, json_data)`` where ``df`` is the DataFrame built from
        the flattened records of all files (an empty DataFrame when there
        are none) and ``json_data`` is the list of parsed replies, one per
        successfully parsed file.
    """
    rows = []        # flattened records collected across all files
    json_data = []   # parsed reply of each file that yielded data

    for path in get_pdf_files(folder_path):
        parsed = parse_json(analyze_text(read_pdf(path)))

        # An empty parse result means there is nothing usable for this file.
        if not parsed:
            print(f"Skipped file {path} due to parsing issues.")
            continue

        json_data.append(parsed)
        rows.extend(flatten_data(parsed))

    # Nothing collected: return an empty frame alongside the parsed replies.
    if not rows:
        print("No data to create DataFrame.")
        return pd.DataFrame(), json_data

    return create_dataframe(rows), json_data



In [37]:
# Run the whole pipeline over the article folder and show the resulting table.
# NOTE(review): this is an absolute, machine-specific path, so the cell will
# presumably only run on the author's machine — consider a configurable,
# relative data directory.
folder_path = '/Users/Zantana/repo/Dailys/24_02_20/Artikel'
# df: one row per article; json_data: the parsed model replies per file.
df, json_data = get_sentiment_of_newspapers(folder_path)
print(df)

                                   newspaper_article        date Geothermie  \
0           Badische Zeitung vom 21.12.2010, Seite 1  21.12.2010   positive   
1                    Badische Zeitung vom 18.12.2010  18.12.2010   positive   
2           Badische Zeitung vom 21.12.2010, Seite 6  21.12.2010        NaN   
3          Badische Zeitung vom 08.12.2010, Seite 15  08.12.2010   not sure   
4                    Badische Zeitung vom 09.12.2010  09.12.2010        NaN   
5          Badische Zeitung vom 29.11.2010, Seite 25  29.11.2010        NaN   
6  Badische Zeitung vom 22.12.2010, Seite 2 / Das...  22.12.2010   negative   
7  Badische Zeitung vom 31.12.2010, Seite 31 / Lo...  31.12.2010        NaN   
8          Badische Zeitung vom 22.12.2010, Seite 33  22.12.2010        NaN   
9          Badische Zeitung vom 08.12.2010, Seite 29  08.12.2010   positive   

  Geothermiebohrungen geologische Gutachten  \
0                 NaN                   NaN   
1                 NaN               

In [39]:
# Preview the first two rows of the sentiment table.
df.head(2)

Unnamed: 0,newspaper_article,date,Geothermie,Geothermiebohrungen,geologische Gutachten,Energiegewinnung fürs Rathaus Staufen,Geothermiekraftwerk,Erdbeben,Gutachten,Landau,...,Bohrloch,Basler Geothermie-Projekt,IWB (Industrielle Werke Basel),Geopower Basel AG,Rückbau des ehemaligen Bohrplatzes,Verschließen des 5000 Meter tiefen Bohrlochs,Natürlicher Druckaufbau im Bohrloch,Mikroseismische Untersuchung,Einstellung des Projekts durch die Basler Regierung,"Erdbeben bis zur Stärke 3,4 ausgelöst durch Bohrungen"
0,"Badische Zeitung vom 21.12.2010, Seite 1",21.12.2010,positive,,,,,,,,...,,,,,,,,,,
1,Badische Zeitung vom 18.12.2010,18.12.2010,positive,,,,,,,,...,,,,,,,,,,


In [43]:
# Write the sentiment table to a CSV file (relative path), omitting the
# DataFrame index column.
df.to_csv('geothermal_sentiment.csv', index=False)