# __Group Assignment__

__Submission Date:__ 26/03/2025

__Assignment: NLP-2024-2025: Assignment 1__

__Group Members:__
- ANZALONE Gabriel
- MBENGUE Ndèye Arame

__Firm level topic of discussion:__ Technological Disruption

__Task__ : Measure <u>Technological Disruption</u> with transcripts of Conference Calls.

## __1- Constructing a measure related to <u> Technological Disruption</u> for each firm and each quarter.__

### 1.A- Data Import and Preparation

Loading data

In [None]:
# Let's say that we just want to focus on the management presentation section of the Earnings Calls
# We import Earnings Calls of S&P500 from 2015 to 2021

import pandas as pd

# Earnings-call sample and the matching management-presentation texts, both fetched from Dropbox.
Sample_EC = pd.read_csv("https://www.dropbox.com/scl/fi/2p7ahxroqj9pwf98ni5an/Sample_Calls.csv?rlkey=zfieicvz891u4e3z0aroeg0u7&dl=1")
Sample_Presentations = pd.read_feather("https://www.dropbox.com/scl/fi/uceh2xva5g4apbmt92cgt/Sample_Calls_Presentations.feather?rlkey=ln4nzsa4nenqyvm0pg2cur9sp&dl=1")

# Keep only the presentation rows whose text has more than 50 whitespace-separated words.
is_long_enough = Sample_Presentations['presentation'].str.split().apply(len) > 50
Sample_Presentations = Sample_Presentations[is_long_enough].reset_index(drop=True)

# One row per call: all presentation rows sharing a file_name are joined with single spaces.
Pres = (
    Sample_Presentations
    .groupby("file_name")["presentation"]
    .agg(" ".join)
    .reset_index()
)
Pres.columns = ["file_name", "text"]

Preprocessing the text

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used further down: tokenizer data (punkt / punkt_tab)
# for word_tokenize, the English stop-word list, and WordNet for the lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Step 1 of the cleaning: tokenize and drop stop words.
# The English stop words are held in a set so membership tests are cheap.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lowercase and tokenize `text`, keeping only alphanumeric tokens that are not stop words.

    Returns the surviving tokens as a list, in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue  # punctuation or any token containing a non-alphanumeric character
        if token in stop_words:
            continue
        kept.append(token)
    return kept

Pres['tokens'] = Pres['text'].apply(preprocess_text)

In [None]:
# Then we lemmatize to have a unified format
# Either use WordNetLemmatizer or Spacy -> better to use the latter to take the context of each word into account (POS Tagging)
# But it is way Longer than WordNet to run

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize_text(tokens):
    """Return a new list holding the WordNet lemma of each token, in the same order."""
    return list(map(lemmatizer.lemmatize, tokens))

# Replace the raw tokens with their lemmatized form.
Pres['tokens'] = Pres['tokens'].apply(lemmatize_text)

In [None]:
# Preview the token lists of the first ten calls.
Pres['tokens'].iloc[:10]

Unnamed: 0,tokens
0,"[lady, gentleman, thank, standing, welcome, fi..."
1,"[welcome, prudential, quarterly, earnings, cal..."
2,"[lady, gentleman, thank, standing, welcome, pr..."
3,"[lady, gentleman, thank, standing, welcome, q1..."
4,"[lady, gentleman, thank, standing, welcome, q2..."
5,"[thank, tom, good, morning, everyone, welcome,..."
6,"[welcome, southwest, airline, second, quarter,..."
7,"[thank, tom, good, morning, everyone, welcome,..."
8,"[lady, gentleman, thank, standing, welcome, an..."
9,"[lady, gentleman, thank, standing, welcome, an..."


In [None]:
# Sanity check: number of calls with a missing text (expected: 0).
Pres['text'].isna().sum()

0

### 1 - Using Cosine Similarity

Working with Google Word2Vec API

In [None]:
%%capture
# Remove and reinstall numpy / scipy / gensim / smart-open; only numpy is pinned (1.25.2).
# NOTE(review): presumably this works around an incompatibility between the preinstalled
# numpy and gensim — confirm. scipy, gensim and smart-open are left unpinned.
!pip uninstall -y numpy gensim scipy smart-open
!pip install numpy==1.25.2 scipy gensim smart-open
# After this, restart session (don't delete runtime)

In [None]:
import gensim.downloader as api
# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader API.
google_model = api.load("word2vec-google-news-300")

In [None]:
# We select a set of words based on their similarity with "technological disruption" and add them to our tech dictionary.
# First step: the `most_similar` function of google_model.
# Limitation noted here: each query is made with a single word.
print(google_model.most_similar("technological", topn=5))

[('technological_advancement', 0.6745783090591431), ('technological_advances', 0.6342653036117554), ('technological_innovations', 0.6328529715538025), ('technologic', 0.6312510967254639), ('DISABILITY_RESOURCE_CENTER', 0.6264010667800903)]


In [None]:
# Five nearest neighbours of "disruption" according to google_model.
print(google_model.most_similar("disruption", topn=5))

[('disruptions', 0.8658042550086975), ('interruption', 0.658836841583252), ('interruptions', 0.6469897627830505), ('distruption', 0.6304445862770081), ('disrupted', 0.6218001842498779)]


In [None]:
# Five nearest neighbours of the phrase token "technological_innovations" according to google_model.
print(google_model.most_similar("technological_innovations", topn=5))

[('innovations', 0.7695900797843933), ('technological_advances', 0.7287052869796753), ('technological_advancements', 0.7118793725967407), ('technological_advancement', 0.6809303760528564), ('technologies', 0.6743346452713013)]


In [None]:
# Seed vocabulary for technological disruption: a hand-picked mix of words
# suggested by the Word2Vec neighbour queries and of our own additions.
keywords = [
    "disruption",
    "technological",
    "interruption",
    "advancement",
    "innovation",
    "ai",
    "automation",
    "robotics",
    "technology",
]

Computing the cosine similarity between each call and the mean keyword vector

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Reference vector: element-wise mean of the embeddings of the keywords known to the model.
tech_vectors = np.mean(
    [google_model[word] for word in keywords if word in google_model],
    axis=0,
)

def compute_similarity(tokens):
    """Cosine similarity between the mean embedding of `tokens` and `tech_vectors`.

    Tokens that are not in `google_model` are skipped. Returns 0 when none of
    the tokens is in the model.
    """
    in_vocab = [google_model[word] for word in tokens if word in google_model]
    if not in_vocab:
        return 0  # no token found in the model
    doc_vector = np.mean(in_vocab, axis=0)
    return cosine_similarity([doc_vector], [tech_vectors])[0][0]

# One score per call.
Pres['tech_disruption_score'] = Pres['tokens'].apply(compute_similarity)

Filtering Companies based on their score

In [None]:
# Attach the company name to each call.
# - The lookup is de-duplicated on file_name so the left merge cannot multiply Pres rows.
# - A co_conm column left over from a previous run of this cell is dropped first; otherwise
#   re-running would produce co_conm_x / co_conm_y and the column selection below would fail.
company_names = Sample_EC[['file_name', 'co_conm']].drop_duplicates(subset='file_name')
Pres = Pres.drop(columns='co_conm', errors='ignore').merge(company_names, on='file_name', how='left')

# Highest scores first.
sorted_pres = Pres.sort_values(by='tech_disruption_score', ascending=False)

ranking_columns = ['file_name', 'co_conm', 'tech_disruption_score']
top_10 = sorted_pres[ranking_columns].head(10)
bottom_10 = sorted_pres[ranking_columns].tail(10)

In [None]:
# Print both ends of the ranking; the second heading starts with a newline to separate the tables.
for heading, table in (
    ("These are the Top 10 companies related to Technological Disruption:", top_10),
    ("\nThese are the Bottom 10 companies related to Technological Disruption:", bottom_10),
):
    print(heading)
    print(table)

These are the Top 10 companies related to Technological Disruption:
                                              file_name  \
2557  Download ECC/SE/TRANSCRIPT/XMLStd/Archive/2020...   
1948  Download ECC/SE/TRANSCRIPT/XMLStd/Archive/2019...   
446   Download ECC/SE/TRANSCRIPT/XMLStd/Archive/2016...   
2805  Download ECC/SE/TRANSCRIPT/XMLStd/Archive/2020...   
2595  Download ECC/SE/TRANSCRIPT/XMLStd/Archive/2020...   
695   Download ECC/SE/TRANSCRIPT/XMLStd/Archive/2017...   
780   Download ECC/SE/TRANSCRIPT/XMLStd/Archive/2017...   
1996  Download ECC/SE/TRANSCRIPT/XMLStd/Archive/2019...   
1464  Download ECC/SE/TRANSCRIPT/XMLStd/Archive/2018...   
510   Download ECC/SE/TRANSCRIPT/XMLStd/Archive/2017...   

                       co_conm  tech_disruption_score  
2557               NVIDIA CORP               0.597319  
1948                 ANSYS INC               0.596867  
446                NVIDIA CORP               0.591857  
2805               NVIDIA CORP               0.588900  
25

<b>Comments on this:</b>
- We see pure technological players (as we would expect): Nvidia, Synopsys (operating in the semiconductor field)
- But we also observe Netflix within the Bottom 10, even though it actually is related to technological disruption.

Piste d'amélioration à ce niveau : améliorer la recherche des keywords ?

In [None]:
# Cosine similarity between the mean keyword vector and every word of the model.
def get_most_similar_words_to_avg_vector(model, avg_vector, top_n=10):
    """Return the `top_n` vocabulary words closest (cosine similarity) to `avg_vector`.

    Parameters
    ----------
    model : object exposing `index_to_key` (sequence of words) and `model[word]`
        lookup. If it also exposes a 2-D `vectors` array with one row per entry
        of `index_to_key`, that array is used directly instead of one lookup
        per word.
    avg_vector : array-like
        Query vector; flattened to 1-D before use.
    top_n : int, default 10
        Number of (word, similarity) pairs to return.

    Returns
    -------
    list of (word, similarity) tuples sorted by decreasing similarity; words with
    equal similarity keep their `index_to_key` order. A zero-norm vector gets a
    similarity of 0.
    """
    words = model.index_to_key
    if len(words) == 0:
        return []

    # Use the model's embedding matrix when it lines up with the vocabulary;
    # otherwise stack the per-word vectors.
    matrix = getattr(model, "vectors", None)
    if matrix is not None:
        matrix = np.asarray(matrix)
    if matrix is None or matrix.ndim != 2 or matrix.shape[0] != len(words):
        matrix = np.array([model[word] for word in words])

    query = np.asarray(avg_vector).ravel()

    # Row norms via einsum: avoids materialising a full-size temporary of squared values.
    row_norms = np.sqrt(np.einsum("ij,ij->i", matrix, matrix))
    denominators = row_norms * np.linalg.norm(query)
    denominators[denominators == 0] = 1.0  # zero-norm vectors -> similarity 0, not NaN

    similarities = (matrix @ query) / denominators

    # Stable sort on the negated scores: descending order, ties in vocabulary order.
    order = np.argsort(-similarities, kind="stable")[:top_n]
    return [(words[i], similarities[i]) for i in order]

# Ten words closest to the mean of the keyword vectors.
most_similar_words = get_most_similar_words_to_avg_vector(google_model, tech_vectors, top_n=10)

# Print one "word: similarity" line per neighbour.
print("Most similar words to the average of technological disruption keywords:")
neighbour_lines = [f"{word}: {similarity}" for word, similarity in most_similar_words]
for line in neighbour_lines:
    print(line)

In [None]:
import matplotlib.pyplot as plt

# Add a 'quarter' column from the call date.
# The date is looked up by file_name. Assigning the Sample_EC column directly aligns on the
# row index, and Pres (filtered, grouped, merged) is not guaranteed to share Sample_EC's row
# order, so dates could be attached to the wrong calls.
call_dates = Sample_EC.drop_duplicates(subset='file_name').set_index('file_name')['date_rdq']
Pres['quarter'] = pd.to_datetime(Pres['file_name'].map(call_dates)).dt.to_period('Q')

# Average technological disruption score per quarter.
quarterly_scores = Pres.groupby('quarter')['tech_disruption_score'].mean()

# Plot the quarterly averages.
plt.figure(figsize=(10, 6))
quarterly_scores.plot(kind='line')
plt.title("Technological Disruption Over Time")
plt.xlabel("Quarter")
plt.ylabel("Average Technological Disruption Score")
plt.grid(True)
plt.show()


In [None]:
# 4. Explore the market reaction, using CAR-11-Carhart as the return variable.
# The return is looked up by file_name. A direct `Pres[...] = Sample_EC[...]` assignment
# aligns on the row index rather than on the call, which can pair scores with the returns
# of other calls.
car_by_call = Sample_EC.drop_duplicates(subset='file_name').set_index('file_name')['CAR-11-Carhart']
Pres['stock_return'] = Pres['file_name'].map(car_by_call)

# Correlation between the technological disruption score and the return.
correlation = Pres['tech_disruption_score'].corr(Pres['stock_return'])
print(f"Correlation between tech disruption score and stock return: {correlation}")

In [None]:
import statsmodels.api as sm

# 1. Prepare the regression data
# Dependent variable  : CAR-11-Carhart (stored as 'stock_return')
# Independent variable: tech_disruption_score

# Look the return up by file_name: a direct `Pres[...] = Sample_EC[...]` assignment aligns
# on the row index rather than on the call. Both columns are coerced to numeric BEFORE the
# missing-value filter, so values that fail to parse are dropped instead of reaching OLS as NaN.
car_by_call = Sample_EC.drop_duplicates(subset='file_name').set_index('file_name')['CAR-11-Carhart']
Pres['stock_return'] = pd.to_numeric(Pres['file_name'].map(car_by_call), errors='coerce')
Pres['tech_disruption_score'] = pd.to_numeric(Pres['tech_disruption_score'], errors='coerce')

# Drop calls with a missing score or return (this removes the rows from Pres itself).
Pres.dropna(subset=['tech_disruption_score', 'stock_return'], inplace=True)

# Regressors: a constant plus the technological disruption score.
X = sm.add_constant(Pres[['tech_disruption_score']])

# Dependent variable; it shares Pres's index with X, so no re-alignment is needed.
y = Pres['stock_return']

# 2. Fit the OLS model
ols_model = sm.OLS(y, X).fit()

# 3. Show the regression summary
print(ols_model.summary())


In [None]:
# Result: R-squared is approximately 0 and the coefficient is not statistically significant.

2 - Using Doc2Vec