# Initial Set-up

In [13]:
# Installation of packages if needed
#!pip install nltk
#!pip install pandas
#!pip install openpyxl


In [1]:
# Import required packages
import pandas as pd
import os
import re
import nltk
nltk.download( 'vader_lexicon' )
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('punkt')
nltk.download('punkt_tab')


# base_url = "https://www.roic.ai/quote/{company}/transcripts/{year}-year/{quarter}-quarter"  -- for reference this is where the data is scraped from

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\agbea\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\agbea\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\agbea\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Get the current working directory. The relative "Text_Analytics/..." paths used
# by the read/write cells in this notebook are resolved against it.
    # Optional: uncomment to change directory before reading it -> os.chdir("c:")
current_directory = os.getcwd()
print(current_directory)

# Project

The team scraped earnings call transcripts from roic.ai. We pulled data from a variety of industries to analyze the change in sentiment toward AI over time. The timeframe ranged from Quarter (Q) 4 of 2021 to Q4 of 2024; however, data for Q4 of 2024 was not fully available at the time of extraction, so the final analysis ends with data from Q3 of 2024.

In [7]:
# Load the earnings-call transcripts: one row per quarter ('earnings_date'),
# one text column per company ticker.
### Put your own directory in the path below.
df = pd.read_csv(filepath_or_buffer="Text_Analytics/earnings_df.csv")
df.head(3)

Unnamed: 0,earnings_date,AAPL,NVDA,MSFT,PFE,GSK,MRNA,F,GM,TSLA,PYPL,JPM,SQ
0,2024_Q4,,Operator\n\nGood afternoon.\n\n \n\nMy name is...,Operator\n\nGreetings and welcome to the Micro...,,,,,,,,,
1,2024_Q3,\n\n\nSuhasini Chandramouli\n\nGood afternoon ...,Operator\n\nGood afternoon.\n\n \n\nMy name is...,Operator\n\nGreetings and welcome to the Micro...,,,,,,,,,
2,2024_Q2,"\n\n\nSuhasini Chandramouli\n\nGood Afternoon,...",Operator\n\nGood afternoon.\n\n \n\nMy name is...,"Operator\n\nGreetings, and welcome to the Micr...","Operator\n\nGood day, everyone, and welcome to...",\n\n\nNick Stone\n\nHello everyone.\n\n \n\nWe...,Operator\n\nGood day and thank you for standin...,"Operator\n\nGood day, everyone.\n\n \n\nMy nam...","Operator\n\nGood morning, and welcome to the G...","\n\n\nTravis Axelrod\n\nGood afternoon, everyo...",Operator\n\nGood morning and welcome to PayPal...,Operator\n\nGood morning ladies and gentlemen....,"Operator\n\nGood day, ladies and gentlemen, an..."


# Define Function

This function reads a specific company column from the dataframe. It makes each sentence its own observation and converts everything to lowercase for consistency. The data is then filtered to just the sentences that contain the word "ai" or the phrase "artificial intelligence".

Also, within the function, the SentimentIntensityAnalyzer() from NLTK is applied to the final sentences containing AI-related terms. A sentiment score is given to each sentence.

In [8]:
# DEFINE A FUNCTION TO USE ON EACH COLUMN FOR SENTIMENT ANALYSIS.
def AI_sentiment_extraction(column, data=None):
    """Extract the AI-related sentences of one company and score their sentiment.

    The transcript text in ``column`` is lowercased, split into sentences with
    ``nltk.sent_tokenize`` and searched for the whole word "ai" and the phrase
    "artificial intelligence". Matching sentences are scored with VADER's
    compound polarity score.

    Parameters
    ----------
    column : str
        Name of the company column that holds the transcript text.
    data : pandas.DataFrame, optional
        Frame containing an 'earnings_date' column and ``column``. When
        omitted, the notebook-level ``df`` is used (the original behaviour).

    Returns
    -------
    company_split : pandas.DataFrame
        One row per sentence that mentions an AI term, with the columns
        'index' (row position of the sentence among all of the company's
        sentences), 'earnings_date', 'sentences', 'AI_ct', 'Artificial_ct',
        'Total_AI_Terms_ct' and 'sentiment'.
    company_overtime : pandas.DataFrame
        'Total_AI_Terms_ct' summed over ALL sentences per 'earnings_date'.
    """
    # Fall back to the notebook global only when no frame is passed in.
    source = df if data is None else data

    #~~~~~~~~~~~~~~CLEANING~~~~~~~~~~~~~~~#
    # Drop NaNs because they cause problems during the sentiment analysis.
    # The explicit copy means the column assignments below never write into a
    # slice of `source` (this is what triggers SettingWithCopyWarning).
    company_df = source[['earnings_date', column]].dropna().copy()

    # Lowercase everything to avoid case matching. astype(str) is a no-op for
    # text but keeps .str usable when the column is entirely NaN (float dtype)
    # and therefore empty after dropna().
    company_df[column] = company_df[column].astype(str).str.lower()

    #~~~~~~~~~~~SPLIT INTO SENTENCES~~~~~~~~~~~~~~~#
    # Tokenize each transcript, then explode so every sentence gets its own row.
    company_df['sentences'] = company_df[column].apply(nltk.sent_tokenize)
    sentences_df = (
        company_df[['earnings_date', 'sentences']]
        .explode('sentences')
        .reset_index(drop=True)
    )

    #~~~~~~~~~~~~~~~~~~~~COUNT BY KEY AI WORDS~~~~~~~~~~~~~~~~~~~#
    # \b word boundaries stop "ai" from matching inside words such as "said".
    ai_ct = sentences_df['sentences'].str.count(r'\bai\b')
    artificial_ct = sentences_df['sentences'].str.count(r'\bartificial intelligence\b')
    sentences_df = sentences_df.assign(
        AI_ct=ai_ct,
        Artificial_ct=artificial_ct,
        Total_AI_Terms_ct=ai_ct + artificial_ct,
    )

    # Filter to just the sentences with AI terms. reset_index() (without drop)
    # is kept on purpose so the returned columns match the original output.
    company_split = sentences_df[sentences_df['Total_AI_Terms_ct'] > 0].reset_index()

    #~~~~~~~~~~~~~~~~~~~ANALYZE SENTIMENTS~~~~~~~~~~~~~~~~~#
    analyzer = SentimentIntensityAnalyzer()
    company_split['sentiment'] = company_split['sentences'].apply(
        lambda text: analyzer.polarity_scores(text)['compound']
    )

    #~~~~~~~~~~~~~~~~~~COUNTS OVERTIME~~~~~~~~~~~~~~~~~~~~~#
    # Counts by quarter use every sentence, not only the filtered ones.
    company_overtime = (
        sentences_df
        .groupby("earnings_date")['Total_AI_Terms_ct']
        .sum()
        .reset_index()
    )

    #~~~~~~~~~~~~~~~~~RETURN DF for ANALYSIS~~~~~~~~~~~~~~~~#
    return company_split, company_overtime


# Apply sentiment function to each company
This runs the above function on each column/company to clean and apply a sentiment score to each company.


In [10]:
# Tickers to analyse; each one must be a column of `df`. Add more as needed.
companies = [
    'AAPL', 'NVDA', 'MSFT',
    'PFE', 'GSK', 'MRNA',
    'F', 'GM', 'TSLA',
    'PYPL', 'JPM', 'SQ',
]

# Ticker -> industry lookup used to label every result row
industry_map = {
    'AAPL': 'tech', 'NVDA': 'tech', 'MSFT': 'tech',
    'PFE': 'pharma', 'GSK': 'pharma', 'MRNA': 'pharma',
    'F': 'automotive', 'GM': 'automotive', 'TSLA': 'automotive',
    'PYPL': 'financial services', 'JPM': 'financial services', 'SQ': 'financial services',
}

# Per-company results, collected here and stacked after the loop
sentiment_dfs = []
overtime_dfs = []

for company in companies:
    # Run the extraction, then tag both result frames with industry and company.
    # Tickers missing from industry_map are labelled 'unknown'.
    sentiment, overtime = (
        frame.assign(
            industry=industry_map.get(company, 'unknown'),
            company=company,
        )
        for frame in AI_sentiment_extraction(company)
    )

    sentiment_dfs.append(sentiment)
    overtime_dfs.append(overtime)

# Stack the per-company frames into one sentiment table and one counts table
all_sentiment = pd.concat(sentiment_dfs, ignore_index=True)
all_overtime = pd.concat(overtime_dfs, ignore_index=True)



In [11]:
# Export the sentence-level sentiment table for all companies (index column omitted)
all_sentiment.to_excel(
    "Text_Analytics/all_company_sentiments.xlsx",
    sheet_name='All Company Sentiment',
    index=False,
)

In [12]:
# Export the per-quarter AI-term counts for all companies (index column omitted)
all_overtime.to_excel(
    "Text_Analytics/all_company_overtime.xlsx",
    sheet_name='Overtime',
    index=False,
)