In [1]:
# If required

# !pip install selenium

In [2]:
#STEP_1
# Webscraping lists of earnings call transcripts id data for later use
# Accesses SeekingAlpha API endpoint. Transcripts data presents as list of JSON objects.

from bs4 import BeautifulSoup
import csv
import time
from selenium import webdriver
import json

from random import randint

# Split an article title of the form "Company Name (TICKER) remaining text"
def _split_title(title):
    """Return ``(company_name, ticker, text)`` parsed from an article title.

    The title is split at the first "(" and the following ")". Titles that
    contain no parentheses no longer raise IndexError: the whole title becomes
    the company name and ticker/text are empty strings. Text after the ticker
    is kept in full even if it contains further parentheses.
    """
    company_part, _, remainder = title.partition('(')
    ticker_part, _, text_part = remainder.partition(')')
    return company_part.strip(), ticker_part, text_part.strip()


# Initialize Chrome WebDriver
driver = webdriver.Chrome()

try:
    # Loop through pages
    # Enter page range. Caution: Limit range to prevent blocking
    for page_number in range(100, 105):

        # SeekingAlpha API endpoint
        # Construct URL
        url = f"https://seekingalpha.com/api/v3/articles?filter[category]=earnings%3A%3Aearnings-call-transcripts&filter[since]=0&filter[until]=0&include=author%2CprimaryTickers%2CsecondaryTickers&isMounting=true&page[size]=50&page[number]={page_number}"

        # Navigate to the URL
        driver.get(url)
        time.sleep(randint(4, 9))  # Adjust this delay as needed to ensure the page loads completely

        # Extract page content - the browser renders the JSON response inside <body>
        page_content = driver.page_source
        soup = BeautifulSoup(page_content, 'html.parser')
        json_data = json.loads(soup.body.text)

        # Extract data and append to CSV (utf-8, matching how process_csv reads its input)
        with open('transcripts_list_data.csv', 'a', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['id', 'company_name', 'ticker', 'text', 'date']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            # In append mode the position is the current file size, so this is
            # true only for a brand-new/empty file: write the header exactly once.
            if csvfile.tell() == 0:
                writer.writeheader()

            for item in json_data['data']:
                article_id = item['id']  # avoid shadowing the builtin id()
                title = item['attributes']['title']
                company_name, ticker, text = _split_title(title)
                date = item['attributes']['publishOn']

                writer.writerow(
                    {'id': article_id, 'company_name': company_name,
                     'ticker': ticker, 'text': text, 'date': date})

        # Random pause between pages to reduce the risk of being blocked
        time.sleep(randint(4, 9))
finally:
    # Always release the browser, even if a page fails to load or parse
    driver.quit()

print("Data has been extracted and written to 'transcripts_list_data.csv'.")



Data has been extracted and written to 'transcripts_list_data.csv'.


In [1]:
# Chat GPT Fri 26 Apr 16.25
# To achieve scraping in batches with reconnection between each batch,
# you can modify your code to close and reopen the Selenium WebDriver after each batch.
# Here's the modified version of your code:
# Back to line 1323 for the manual version.

# 15 Jul: this works. A sample file, 'transcript_data_test_urls_data.csv',
# cut down to 4 rows, was used (directory OS(C)/users/mbjhi).
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from random import randint
from selenium.common.exceptions import WebDriverException


# Function to scrape text data from a single URL using BeautifulSoup
def scrape_text(driver, url):
    """Load ``url`` in ``driver`` and return the text of the page's <body>.

    Sleeps a random 4-9 seconds before navigating. Returns None when the page
    has no <body> or when reading/parsing the page source raises. Errors
    raised by ``driver.get`` itself are not caught here and propagate to the
    caller.
    """
    pause_seconds = randint(4, 9)  # Random sleep between 4 and 9 seconds
    time.sleep(pause_seconds)

    # Navigation happens outside the try block, so its exceptions propagate
    driver.get(url)

    try:
        parsed_page = BeautifulSoup(driver.page_source, 'html.parser')
        body = parsed_page.find('body')
        return body.get_text() if body else None
    except Exception as error:
        print(f"Error occurred while scraping text from URL: {url}")
        print(error)
        return None


# Function to process a batch of URLs and scrape text data
def process_batch(driver, batch, writer):
    """Scrape every row in ``batch`` and write the result through ``writer``.

    Each row must provide 'id', 'company_name', 'ticker', 'date' and 'text'.
    The scraped page text is cut into consecutive 8000-character slices and
    one output row is written per slice; when nothing was scraped a single
    row with an empty 'transcript_text' is written instead.
    """
    for record in batch:
        # Columns copied unchanged onto every output row for this article
        carried = {
            'id': record['id'],
            'company_name': record['company_name'],
            'ticker': record['ticker'],
            'date': record['date'],
            'text': record['text'],
        }

        url = f"https://seekingalpha.com/api/v3/articles/{carried['id']}?include=author%2CprimaryTickers%2CsecondaryTickers%2CotherTags%2Cpresentations%2Cpresentations.slides%2Cauthor.authorResearch%2Cauthor.userBioTags%2Cco_authors%2CpromotedService%2Csentiments"
        scraped_text = scrape_text(driver, url)

        if scraped_text:
            pieces = []
            for offset in range(0, len(scraped_text), 8000):
                pieces.append(scraped_text[offset:offset + 8000])
        else:
            # Nothing scraped: still emit one placeholder row for the article
            pieces = ['']

        for piece in pieces:
            writer.writerow({**carried, 'transcript_text': piece})


# Function to process the CSV file and scrape text data
def process_csv(input_csv, output_csv):
    """Scrape every article listed in ``input_csv`` and append to ``output_csv``.

    Rows are processed in batches of 15. After each full batch the browser is
    closed, the function sleeps for a random 7-9 minutes, and a fresh browser
    is opened before continuing.

    Args:
        input_csv: Path of a UTF-8 CSV with a header row containing the
            columns read by ``process_batch``.
        output_csv: Path of the CSV to append to. The header row is written
            only when the file is new or empty, so repeated runs do not
            insert duplicate headers in the middle of the data.

    Raises:
        Whatever ``webdriver.Chrome`` / ``process_batch`` raise; the browser
        is always closed before the exception propagates.
    """
    # Open the WebDriver
    driver = webdriver.Chrome()

    try:
        with open(input_csv, 'r', newline='', encoding='utf-8') as csvfile, \
                open(output_csv, 'a', newline='', encoding='utf-8') as outfile:
            reader = csv.DictReader(csvfile)
            fieldnames = ['id', 'company_name', 'ticker', 'date', 'text',
                          'transcript_text']
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)

            # In append mode the initial position equals the file size, so
            # this is true only when there is no existing content.
            if outfile.tell() == 0:
                writer.writeheader()

            batch = []
            for row in reader:
                batch.append(row)
                if len(batch) >= 15:
                    process_batch(driver, batch, writer)
                    batch = []
                    driver.quit()  # Close the WebDriver
                    driver = None  # nothing to close if the pause is interrupted
                    time.sleep(randint(420,
                                       540))  # Random sleep between 7 and 9 minutes (420 and 540 seconds)
                    driver = webdriver.Chrome()  # Reopen the WebDriver

            # Process any remaining URLs
            if batch:
                process_batch(driver, batch, writer)
    finally:
        # Close the WebDriver even when scraping fails part-way through
        if driver is not None:
            driver.quit()


try:
    # Scrape every article listed in the sample input file
    process_csv(input_csv='transcript_data_test_urls_data.csv',
                output_csv='transcriptscraped_test_data.csv')
except WebDriverException as driver_error:
    # Any Selenium failure is reported as a block by the site, then the details
    print("Blocked by website", driver_error, sep="\n")


In [2]:
# Test example line 776 in methods.py

import pandas as pd

df = pd.read_csv('transcripts_all_analysed_level1_grouped_modified_with_sector_updated.csv')

# Keep only the rows whose Call_Section is exactly 'Q&A Session'
is_qa_section = df['Call_Section'] == 'Q&A Session'
df_filtered = df[is_qa_section]

# Total the positive, negative and neutral counts for each quarter
# (group by 'Ticker' instead of 'QTR' for a per-company view)
sentiment_columns = ['Sum_of_Positive', 'Sum_of_Negative', 'Sum_of_Neutral']
df_grouped = (
    df_filtered
    .groupby('QTR')[sentiment_columns]
    .sum()
    .reset_index()
)
print('This is df grouped',df_grouped )


This is df grouped         QTR  Sum_of_Positive  Sum_of_Negative  Sum_of_Neutral
0   2021_01                0                0              13
1   2021_02                2                0              17
2   2021_03              147               19            1053
3   2021_04              739              135            5462
4   2022_01              819              193            6227
5   2022_02              897              273            6197
6   2022_03              600              210            4387
7   2022_04              831              266            5937
8   2023_01              931              275            5987
9   2023_02              953              324            6112
10  2023_03              844              268            6415
11  2023_04              752              190            5260
12  2024_01              179               38            1125
13  2024_02              130               33             666
14  2024_03               36                9      

In [8]:
import pandas as pd
import matplotlib.pyplot as plt

# Line 685 in methods.py

df = pd.read_csv('transcripts_all_analysed_level1_grouped_modified_with_sector_updated.csv')

# Number of distinct transcript IDs in each quarter, ordered by quarter label
unique_id_counts = (
    df.groupby('QTR')['ID']
    .nunique()
    .reset_index()
    .rename(columns={'ID': 'Count of Transcripts'})
    .sort_values(by='QTR')
)

# Create a bar chart. Uncomment below to show.
# plt.figure(figsize=(10, 6))
# plt.bar(unique_id_counts['QTR'], unique_id_counts['Count of Transcripts'])
# plt.xlabel('QTR')
# plt.ylabel('Count of Transcripts')
# plt.title('Count of Transcripts per QTR')
# plt.xticks(rotation=45)
# plt.tight_layout()

# # Save the plot to a file
# plot_file_path = 'unique_id_counts_per_qtr.png'
# plt.savefig('C:/Users/mbjhi/OneDrive/Desktop/BH/Thesis/Data/Production_data/Data_Analysis/unique_id_counts_per_qtr_new.png')
# plt.show()


In [9]:
# Drop rows that have no entry in the 'QTR' column; see
# https://www.aporia.com/resources/how-to/drop-rows-pandas-dataframe-column-vamue-nan/
df2 = df.dropna(subset=["QTR"])
by_quarter = df2.groupby('QTR')

# Overall and per-quarter totals of the positive / negative counts
sum_of_positive2 = df2['Sum_of_Positive'].sum()
sum_of_positive2_byQTR = by_quarter['Sum_of_Positive'].sum()
sum_of_negative2 = df2['Sum_of_Negative'].sum()
sum_of_negative2_byQTR = by_quarter['Sum_of_Negative'].sum()

print('sum_of_positive2', sum_of_positive2)
print('sum_of_positive2_byQTR', sum_of_positive2_byQTR)
print('sum_of_negative2', sum_of_negative2)
print('sum_of_negative2_byQTR', sum_of_negative2_byQTR)

# Tone = (positive - negative) / (positive + negative) over all retained rows
net_sentiment = sum_of_positive2 - sum_of_negative2
total_sentiment = sum_of_positive2 + sum_of_negative2
tone = net_sentiment / total_sentiment
print('This is tone',tone)


sum_of_positive2 35274
sum_of_positive2_byQTR QTR
2021_01       6
2021_02      11
2021_03     712
2021_04    3781
2022_01    3702
2022_02    3938
2022_03    2650
2022_04    4117
2023_01    3712
2023_02    3911
2023_03    3832
2023_04    3524
2024_01     745
2024_02     463
2024_03     170
Name: Sum_of_Positive, dtype: int64
sum_of_negative2 7758
sum_of_negative2_byQTR QTR
2021_01       0
2021_02       0
2021_03      73
2021_04     580
2022_01     738
2022_02     851
2022_03     656
2022_04     931
2023_01     880
2023_02    1006
2023_03     964
2023_04     771
2024_01     165
2024_02     106
2024_03      37
Name: Sum_of_Negative, dtype: int64
This is tone 0.639431121026213


In [None]:

# Qs and As STEP 1
# ChatGPT: Q and A extraction problem, Sun 08 Jul 09.36. Use this one. Note: this is the original code with an additional new function at the end.
# Third version, with headings added. This works. (The earlier two versions also worked, but:
#   No. 1 wrote the output to the new file in plain text, with no headings on either output file;
#   No. 2 wrote the output correctly in HTML format to the new file, but with no headings on either output file.)
# See the ChatGPT Word document!
# Back to line 2140 (this code goes to 2669).
# This version below has headers added and works well.
# It accesses the raw transcripts and extracts the Qs and As separately.
import pandas as pd
from bs4 import BeautifulSoup
import os


# Function to extract text between tags or provide notification if not found
def extract_text(data, start_tag, alt_start_tag, end_tag, alt_end_tag):
    """Return the plain text found between a start marker and an end marker.

    Args:
        data: The string to search (HTML source).
        start_tag: Preferred marker for the start of the section.
        alt_start_tag: Marker tried when ``start_tag`` is absent.
        end_tag: Preferred marker for the end of the section.
        alt_end_tag: Marker tried when ``end_tag`` is absent.

    Returns:
        The text from the start marker up to (not including) the end marker,
        with HTML markup removed and each text fragment stripped; or the
        string "Tag not found" when either marker cannot be located.

    The end marker is searched for only *after* the matched start marker, so
    an earlier occurrence of the end marker elsewhere in ``data`` can no
    longer produce an empty or wrong slice.
    """
    matched_start = start_tag
    start_index = data.find(start_tag)
    if start_index == -1:
        matched_start = alt_start_tag
        start_index = data.find(alt_start_tag)
        if start_index == -1:
            return "Tag not found"

    # Only accept an end marker that follows the start marker
    search_from = start_index + len(matched_start)
    end_index = data.find(end_tag, search_from)
    if end_index == -1:
        end_index = data.find(alt_end_tag, search_from)
        if end_index == -1:
            return "Tag not found"

    text = data[start_index:end_index]
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text(strip=True)


# Function to extract raw HTML between tags or provide notification if not found
def extract_html(data, start_tag, alt_start_tag, end_tag, alt_end_tag):
    """Return the raw substring from a start marker through an end marker.

    Args:
        data: The string to search (HTML source).
        start_tag: Preferred marker for the start of the section.
        alt_start_tag: Marker tried when ``start_tag`` is absent.
        end_tag: Preferred marker for the end of the section.
        alt_end_tag: Marker tried when ``end_tag`` is absent.

    Returns:
        ``data`` sliced from the start marker to the end of the end marker
        (both markers included), or the string "Tag not found" when either
        marker cannot be located.

    Two corrections over the earlier version: the end marker is searched for
    only after the matched start marker, and the slice is extended by the
    length of the marker that actually matched (previously ``len(end_tag)``
    was used even when ``alt_end_tag`` was the one found).
    """
    matched_start = start_tag
    start_index = data.find(start_tag)
    if start_index == -1:
        matched_start = alt_start_tag
        start_index = data.find(alt_start_tag)
        if start_index == -1:
            return "Tag not found"

    # Only accept an end marker that follows the start marker
    search_from = start_index + len(matched_start)

    matched_end = end_tag
    end_index = data.find(end_tag, search_from)
    if end_index == -1:
        matched_end = alt_end_tag
        end_index = data.find(alt_end_tag, search_from)
        if end_index == -1:
            return "Tag not found"

    return data[start_index:end_index + len(matched_end)]


# Define the chunk size
chunk_size = 8000


# Function to split text into chunks
def split_text_into_chunks(text):
    """Cut ``text`` into consecutive slices of at most ``chunk_size`` characters.

    Returns a list of the slices in order; an empty ``text`` gives an empty list.
    """
    pieces = []
    for offset in range(0, len(text), chunk_size):
        pieces.append(text[offset:offset + chunk_size])
    return pieces

# Read CSV file
# input_csv = 'transcriptscraped_test_data_testing.csv'
#BH Sun 07JUl
#Part1 below
# input_csv = 'transcriptscraped_test_data_master_Part1_12Apr.csv'
# input_csv = 'transcriptscraped_test_data_Sat03May_1930 to 2230_Part6_B.csv'
input_csv = 'transcriptscraped_test_data_master_Part1_12Apr.csv'
df = pd.read_csv(input_csv)

# Output file names
output_file = 'transcripts.csv'
output_qa_file = 'transcripts_Qs_and_As.csv'

# Remove output files if they exist (to avoid appending to old data during testing)
if os.path.exists(output_file):
    os.remove(output_file)
if os.path.exists(output_qa_file):
    os.remove(output_qa_file)

# Iterate over unique IDs
for id_val in df['id'].unique():
    # Filter data for the current ID
    df_id = df[df['id'] == id_val]

    # Get values (identical on every row of one transcript, so take the first)
    company_name_val = df_id['company_name'].iloc[0]
    ticker_val = df_id['ticker'].iloc[0]
    date_val = df_id['date'].iloc[0]
    text_val = df_id['text'].iloc[0]

    # Rebuild the full transcript from its rows. The rows are joined with NO
    # separator: joining with a space inserted a stray character at every row
    # boundary, which can fall inside a word or inside one of the marker tags
    # searched for below.
    # NOTE(review): assumes each row is a consecutive fixed-width slice of the
    # page text, as written by the scraping step (process_batch) - confirm.
    transcript_text = df_id['transcript_text'].str.cat(sep='')

    # Extract company statement text
    company_statement = extract_text(transcript_text,
                                     '<strong>Company Participants</strong>',
                                     '<strong>Corporate Participants</strong>',
                                     '<strong>Question-and-Answer Session</strong>',
                                     '<strong>Question-and-Answer Session</strong>')

    # Extract Q&A session text
    q_and_a = extract_text(transcript_text,
                           '<strong>Question-and-Answer Session</strong>',
                           '<strong>Question-and-Answer Session</strong>',
                           'twitContent',
                           'twitContent')

    # Split texts into chunks (a "Tag not found" result is written as a
    # one-chunk section, as before)
    company_chunks = split_text_into_chunks(company_statement)
    qa_chunks = split_text_into_chunks(q_and_a)

    # Create DataFrame for company statement (one row per chunk)
    df_company = pd.DataFrame({'ID': id_val,
                               'Company Name': company_name_val,
                               'Ticker': ticker_val,
                               'Date': date_val,
                               'text': text_val,
                               'call_section': 'company_statement',
                               'transcript_text': company_chunks})

    # Create DataFrame for Q&A session (one row per chunk)
    df_qa = pd.DataFrame({'ID': id_val,
                          'Company Name': company_name_val,
                          'Ticker': ticker_val,
                          'Date': date_val,
                          'text': text_val,
                          'call_section': 'Q&A Session',
                          'transcript_text': qa_chunks})

    # Concatenate DataFrames
    df_concatenated = pd.concat([df_company, df_qa], ignore_index=True)

    # Write DataFrame to CSV: header on first write, append without header after
    if not os.path.exists(output_file):
        df_concatenated.to_csv(output_file, mode='w', index=False, header=True)
    else:
        df_concatenated.to_csv(output_file, mode='a', index=False,
                               header=False)


# New function to write Q&A data to a separate CSV file
def write_q_and_a_html_to_csv():
    """Write the raw HTML of every transcript's Q&A section to ``output_qa_file``.

    Re-reads ``input_csv``, rebuilds each transcript from its rows, extracts
    the span from the Q&A heading through 'twitContent' with ``extract_html``,
    splits it with ``split_text_into_chunks`` and writes one row per chunk.
    The header row is written only when ``output_qa_file`` does not yet exist.
    """
    # Read the same input CSV file again
    df = pd.read_csv(input_csv)

    all_q_and_a_html = []

    for id_val in df['id'].unique():
        # Filter data for the current ID
        df_id = df[df['id'] == id_val]

        # Get values (identical on every row of one transcript, so take the first)
        company_name_val = df_id['company_name'].iloc[0]
        ticker_val = df_id['ticker'].iloc[0]
        date_val = df_id['date'].iloc[0]
        text_val = df_id['text'].iloc[0]

        # Rebuild the transcript with NO separator between rows: a space here
        # inserted a stray character at every row boundary, which can land
        # inside an HTML tag and corrupt the markup that is extracted below.
        # NOTE(review): assumes each row is a consecutive fixed-width slice of
        # the page text, as written by the scraping step - confirm.
        transcript_text = df_id['transcript_text'].str.cat(sep='')

        # Extract Q&A session HTML
        q_and_a_html = extract_html(transcript_text,
                                    '<strong>Question-and-Answer Session</strong>',
                                    '<strong>Question-and-Answer Session</strong>',
                                    'twitContent',
                                    'twitContent')

        # Split texts into chunks
        qa_chunks = split_text_into_chunks(q_and_a_html)

        # Create DataFrame for Q&A session HTML (one row per chunk)
        df_qa_html = pd.DataFrame({'ID': id_val,
                                   'Company Name': company_name_val,
                                   'Ticker': ticker_val,
                                   'Date': date_val,
                                   'text': text_val,
                                   'call_section': 'Q&A Session',
                                   'transcript_text': qa_chunks})

        # Collect all Q&A data
        all_q_and_a_html.append(df_qa_html)

    # Concatenate all Q&A data and write to a separate CSV with header
    if all_q_and_a_html:
        df_all_q_and_a_html = pd.concat(all_q_and_a_html, ignore_index=True)
        if not os.path.exists(output_qa_file):
            df_all_q_and_a_html.to_csv(output_qa_file, mode='w', index=False,
                                       header=True)
        else:
            df_all_q_and_a_html.to_csv(output_qa_file, mode='a', index=False,
                                       header=False)


# Call the new function to write Q&A data to the new CSV file
write_q_and_a_html_to_csv()


In [None]:

# Qs and As STEP 2
# ChatGPT remove specific text and lines from Q-A file Fri 05Jul 23.45
# go to line 2479 (this code goes to 2263)
import pandas as pd
from bs4 import BeautifulSoup
import re

# Load the CSV files into DataFrames
# df = pd.read_csv('transcriptscraped_test_data_testing.csv')
# df = pd.read_csv('C:/Users/mbjhi/PycharmProjects/Wk1Assign/Transcripts_Q_A/Part1/transcriptscraped_test_data_master_Part1_12Apr.csv')
# df = pd.read_csv('C:/Users/mbjhi/PycharmProjects/Wk1Assign/Transcripts_Q_A/Part1/transcriptscraped_test_data_testing.csv')
df = pd.read_csv('transcripts_Qs_and_As.csv')
additional_data = pd.read_csv('transcripts_all_analysed_level1_grouped_modified_with_sector_updated.csv')

# 'transcripts_Qs_and_As.csv' is written by STEP 1 with the identifier column
# named 'ID', while the rest of this cell uses 'id'. Normalise the name so the
# groupby below does not raise KeyError (a no-op for inputs that already use 'id').
df = df.rename(columns={'ID': 'id'})

# Combine text for each transcript ID. The rows are fixed-width slices of one
# HTML string, so they are rejoined with no separator (a space would be
# injected into the markup at every slice boundary); missing values are skipped
# rather than breaking the join.
combined_texts = (
    df.groupby('id')['transcript_text']
    .apply(lambda chunks: ''.join(chunks.dropna().astype(str)))
    .reset_index()
)

# Function to clean text  (BH This is the original that works just not perfectly!)
def clean_text(text):
    """Remove greeting and courtesy phrases from ``text``, ignoring case.

    The phrases removed are: good morning, good evening, good afternoon,
    good question, great question, thanks for taking the question,
    thanks for the question, thank you, thanks, great.

    Matches are anchored on word boundaries, so the phrases are removed only
    as whole words: "Greater" and "Thanksgiving" are left intact (previously
    the embedded "Great" / "Thanks" was cut out of them). Surrounding
    whitespace and punctuation are left as they are.
    """
    # Longer phrases come first so they win over their shorter prefixes
    courtesy_pattern = (
        r'\b(?:good morning|good evening|good afternoon|good question|great question|'
        r'thanks for taking the question|thanks for the question|thank you|thanks|great)\b'
    )
    return re.sub(courtesy_pattern, '', text, flags=re.IGNORECASE)

# Function to extract and clean questions and answers
def extract_questions_answers(html_content):
    """Pair up questions and answers found in the <p> elements of ``html_content``.

    A paragraph containing <strong><span class="..."> starts a question when
    the span's first class contains 'question' and an answer when it contains
    'answer'. Paragraphs without a <strong> element are appended to the
    current answer if there is one, otherwise to the current question.

    Returns a flat list of ('Q', text) / ('A', text) tuples, each text passed
    through ``clean_text``. A pair is emitted only once both a question and an
    answer have been collected.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    qa_list = []
    current_question = None
    current_answer = None

    def emit_pair(question, answer):
        # Question first, then its answer
        qa_list.append(('Q', clean_text(question)))
        qa_list.append(('A', clean_text(answer)))

    for paragraph in soup.find_all('p'):
        speaker_tag = paragraph.find('strong')

        if not speaker_tag:
            # Continuation paragraph: extend whichever part is in progress
            if current_answer:
                current_answer += " " + paragraph.get_text(strip=True)
            elif current_question:
                current_question += " " + paragraph.get_text(strip=True)
            continue

        role_span = speaker_tag.find('span')
        if not role_span:
            continue

        role_classes = role_span.get('class')
        if not role_classes:
            continue

        role = role_classes[0]
        if 'question' in role:
            # A new question closes the previous pair, if it was complete
            if current_question and current_answer:
                emit_pair(current_question, current_answer)
                current_answer = None
            current_question = paragraph.get_text(strip=True)
        elif 'answer' in role:
            current_answer = paragraph.get_text(strip=True)

    # Emit the final pair, if complete
    if current_question and current_answer:
        emit_pair(current_question, current_answer)

    return qa_list


# Initialize list to store the extracted data
extracted_data = []

# Walk through every combined transcript and collect its usable Q/A lines
for _, combined_row in combined_texts.iterrows():
    transcript_id = combined_row['id']
    html_content = combined_row['transcript_text']
    print(f"Processing ID: {transcript_id}")
    print(f"HTML Content: {html_content[:500]}...")  # first 500 characters, for debugging

    qa_list = extract_questions_answers(html_content)
    if not qa_list:
        print(f"No Q&A pairs found for ID: {transcript_id}")

    for q_a, text in qa_list:
        # Keep a line only if it does not mention 'next question' and has at least ten words
        if 'next question' not in text.lower() and len(text.split()) >= 10:
            print(f"{q_a}: {text}")  # debugging output
            extracted_data.append({
                'id': transcript_id,
                'Q/A': q_a,
                'Q_A_text': text
            })

# One row per retained question or answer
extracted_df = pd.DataFrame(extracted_data)

# Show the available columns of additional_data to verify their names
print("Columns in additional_data:", additional_data.columns)

# Use 'id' as the key name in additional_data as well
additional_data.rename(columns={'ID': 'id'}, inplace=True)

# Fail early if the key column is still missing
if 'id' not in additional_data.columns:
    raise KeyError("'id' column not found in additional_data")

# Metadata columns to attach to every Q/A row
columns_to_add = ['Company_Name', 'Ticker', 'GICS Sector', 'Text', 'QUARTER', 'QTR', 'day_date_formatted']

# Fail early if any of them is absent
missing_columns = [name for name in columns_to_add if name not in additional_data.columns]
if missing_columns:
    raise KeyError(f"Columns {missing_columns} not found in additional_data")

# Reduce additional_data to one row per 'id' (first value of each column)
aggregated_additional_data = (
    additional_data
    .groupby('id')[columns_to_add]
    .first()
    .reset_index()
)

# Left-join the metadata onto the Q/A rows
merged_df = extracted_df.merge(aggregated_additional_data, how='left', on='id')

# Put the metadata columns directly after 'id'
columns_order = ['id', *columns_to_add, 'Q/A', 'Q_A_text']
merged_df = merged_df[columns_order]

# Save the result
merged_df.to_csv('Transcripts_Qs_and_As_part1.csv', index=False)


In [17]:
# python
# Copy code
import zipfile
import os

# Define the path to the uploaded zip file (assuming it's in the same directory as the notebook)
zip_file_path = 'FinBERT output files combined.zip'
# Destination folder for the extracted files (same directory as the notebook)
unzip_folder_path = 'FinBERT output files combined'

# Make sure the destination exists; no error if it is already there
os.makedirs(unzip_folder_path, exist_ok=True)

# Extract every member of the archive into the destination folder
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=unzip_folder_path)

print(f"Files unzipped to: {unzip_folder_path}")



Files unzipped to: FinBERT output files combined


In [33]:
# Qs and As Step 3

# ChatGPT Transcripts Qs and As combine and group individual files Fri 12Jul 0659

# Mon 15Jul This is line 326 MGMT.py
# Update to place output in same folder as py.charm and change name of fields
# Pg8
import pandas as pd
import os

# Define the directory containing the CSV files (in the same location as the script)
directory = 'FinBERT output files combined'

# Key columns, count columns to total, and descriptive columns carried along
group_fields = ['ID', 'Call_Section']
sum_fields = ['count of +ve', 'count of -ve', 'count of neutral']
keep_fields = ['Company_AName', 'Ticker', 'Text']

# Grouping uses the key columns together with the descriptive columns
grouping_columns = group_fields + keep_fields

# Per-file grouped results
grouped_dataframes = []

# Every .csv file anywhere under the directory
csv_paths = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk(directory)
    for filename in filenames
    if filename.endswith('.csv')
]

for file_path in csv_paths:
    try:
        file_frame = pd.read_csv(file_path)

        # Force the count columns to numeric; unparseable values become NaN
        for field in sum_fields:
            file_frame[field] = pd.to_numeric(file_frame[field], errors='coerce')

        # Keep only the columns that are needed
        file_frame = file_frame[group_fields + sum_fields + keep_fields]

        # Total the counts within this file
        file_totals = file_frame.groupby(grouping_columns)[sum_fields].sum().reset_index()
        grouped_dataframes.append(file_totals)
    except Exception as e:
        # A file that cannot be processed is reported and skipped
        print(f"Error processing file {file_path}: {e}")

if grouped_dataframes:
    # Stack the per-file totals, total again across files, then rename the count columns
    combined_df = (
        pd.concat(grouped_dataframes)
        .groupby(grouping_columns)[sum_fields]
        .sum()
        .reset_index()
        .rename(columns={
            'count of +ve': 'sum of +ve',
            'count of -ve': 'sum of -ve',
            'count of neutral': 'sum of neutral'
        })
    )

    # Save to the current working directory
    output_file_path = 'Qs_and_As_grouped.csv'
    combined_df.to_csv(output_file_path, index=False)

    print(f"Grouping and combination of CSV files is complete. Output saved to '{output_file_path}'.")
else:
    print("No valid CSV files were processed.")


Processing file: FinBERT output files combined\FinBERT output files combined\earnings_calls_Kaggle_part1.csv
Processing file: FinBERT output files combined\FinBERT output files combined\earnings_calls_Kaggle_part10.csv
Processing file: FinBERT output files combined\FinBERT output files combined\earnings_calls_Kaggle_part2.csv
Processing file: FinBERT output files combined\FinBERT output files combined\earnings_calls_Kaggle_part3.csv
Processing file: FinBERT output files combined\FinBERT output files combined\earnings_calls_Kaggle_part4.csv
Processing file: FinBERT output files combined\FinBERT output files combined\earnings_calls_Kaggle_part5A.csv
Processing file: FinBERT output files combined\FinBERT output files combined\earnings_calls_Kaggle_part5B.csv
Processing file: FinBERT output files combined\FinBERT output files combined\earnings_calls_Kaggle_part6A.csv
Processing file: FinBERT output files combined\FinBERT output files combined\earnings_calls_Kaggle_part6B.csv
Processing fil

In [2]:
# Qs and As Step 4

# ChatGPT: add the new columns ‘GICS Sector’, ‘QTR’ and ‘day_date’ to the file ‘combined.csv’, Fri 12 Jul
# Line 393 in MGMT.py
import pandas as pd

# Read the CSV file
file_path = 'Qs_and_As_grouped.csv'
df = pd.read_csv(file_path)

# Add the new columns as empty-string placeholders (filled in by later steps).
# BH added tone, price_chng_5day, price_chng_2day, price_chng_1day Fri 12 Jul
placeholder_columns = (
    'GICS Sector', 'QTR', 'day_date',
    'tone', 'price_chng_5day', 'price_chng_2day', 'price_chng_1day',
)
for placeholder in placeholder_columns:
    df[placeholder] = ''

# Final column order: sector/quarter/date sit between 'Call_Section' and
# 'Company_AName'; tone and the price-change columns go at the end. (BH 15 Jul)
columns = [
    'ID', 'Call_Section',
    'GICS Sector', 'QTR', 'day_date',
    'Company_AName', 'Ticker', 'Text',
    'sum of +ve', 'sum of -ve', 'sum of neutral',
    'tone', 'price_chng_5day', 'price_chng_2day', 'price_chng_1day',
]
df = df[columns]

# Save the modified DataFrame to a new CSV file
output_file_path = 'updated_Qs_and_As_grouped.csv'
df.to_csv(output_file_path, index=False)

print(f"Updated file saved as {output_file_path}")


Updated file saved as updated_Qs_and_As_grouped.csv


In [3]:

# Qs and As Step 5

#ChatGPT Update new columns GICS Sector, Day_Date, QTR in file combined 12Jul20.28
# Line 430 in MGMT.py
import pandas as pd

# Load the CSV files into DataFrames
df_combined = pd.read_csv('updated_Qs_and_As_grouped.csv')
df_transcripts = pd.read_csv('transcripts_all_analysed_level1_grouped_modified_with_sector_updated.csv')

# Make 'ID' a string in both frames so the lookups compare like with like
for frame in (df_combined, df_transcripts):
    frame['ID'] = frame['ID'].astype(str)

# Index the transcript rows by 'ID' for per-row lookup
grouped_transcripts = df_transcripts.groupby('ID')

# Function to update row with new data if available
def update_row(row):
    """Fill 'GICS Sector', 'day_date' and 'QTR' on ``row`` from ``grouped_transcripts``.

    The values come from the first transcript row that shares the row's 'ID'.
    A row whose 'ID' has no matching group is returned unchanged.
    """
    transcript_id = row['ID']
    if transcript_id not in grouped_transcripts.groups:
        return row

    matches = grouped_transcripts.get_group(transcript_id)
    # (target column on row, source column in the transcripts data)
    column_pairs = (
        ('GICS Sector', 'GICS Sector'),
        ('day_date', 'day_date_formatted'),
        ('QTR', 'QTR'),
    )
    for target, source in column_pairs:
        # First occurrence is used for simplicity
        row[target] = matches[source].values[0]
    return row

# Apply the update_row function to each row in the combined DataFrame
# Run update_row over every row of the combined data
df_combined = df_combined.apply(update_row, axis='columns')

# Write the result to a new CSV file, without the index column
updated_output_path = 'updated_Qs_and_As_grouped_updated.csv'
df_combined.to_csv(updated_output_path, index=False)
