# Importation des données

In [49]:
# pip install google-colab
# !pip install nltk
# nltk.download()

In [50]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
# Load the raw company table. 'Unnamed: 0' is the unnamed first CSV column
# (presumably an index saved by an earlier export) and carries no information.
file_path = "../Data/companies_description.csv"
data = pd.read_csv(file_path).drop(columns='Unnamed: 0')

# Keep only companies that have a name, a description and a country,
# then renumber the remaining rows from 0.
data = (
    data
    .dropna(subset=['companyName', 'description', 'country'])
    .reset_index(drop=True)
)

In [52]:
data

Unnamed: 0,companyName,exchange,industry,website,description,sector,country
0,Visa Inc.,New York Stock Exchange,Credit Services,https://usa.visa.com,Visa Inc. operates as a payments technology co...,Financial Services,US
1,"Texas Roadhouse, Inc.",NASDAQ Global Select,Restaurants,https://www.texasroadhouse.com,"Texas Roadhouse, Inc., together with its subsi...",Consumer Cyclical,US
2,"Lamb Weston Holdings, Inc.",New York Stock Exchange,Packaged Foods,https://www.lambweston.com,"Lamb Weston Holdings, Inc. produces, distribut...",Consumer Defensive,US
3,Intact Financial Corporation,Toronto Stock Exchange,Insurance—Property & Casualty,https://www.intactfc.com,"Intact Financial Corporation, through its subs...",Financial Services,CA
4,"OPKO Health, Inc.",NASDAQ Global Select,Diagnostics & Research,https://www.opko.com,"OPKO Health, Inc., a healthcare company, engag...",Healthcare,US
...,...,...,...,...,...,...,...
37579,Relativity Acquisition Corp.,NASDAQ Global Market,Shell Companies,https://www.relativityacquisitions.com,Relativity Acquisition Corp. does not have sig...,Financial Services,US
37580,Prima Industrie SpA,Milan,Specialty Industrial Machinery,https://www.primaindustrie.com,"Prima Industrie SpA develops, manufactures, an...",Industrials,IT
37581,PT Putra Rajawali Kencana Tbk,Jakarta Stock Exchange,Trucking,https://puratrans.com,PT Putra Rajawali Kencana Tbk engages in the t...,Industrials,ID
37582,"Agritek Holdings, Inc.",Other OTC,Real Estate Services,https://www.agritekholdings.com,"Agritek Holdings, Inc. distributes hemp and ca...",Real Estate,US


# Tokenization : plus besoin de le run, car le fichier data_tokenized.csv est créé

## NLTK

In [53]:
import nltk
from nltk.corpus import stopwords
import string
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Moham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Moham\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [54]:
# desc0 = data['description'].tolist()[0]

In [55]:
# sentences = sent_tokenize(desc0)
# words = word_tokenize(desc0)

In [56]:
# [wrd.lower() for wrd in words if wrd.lower() not in stopwords.words('english') and wrd not in string.punctuation]

(avec stemming) temps d'execution pour 37 000 entreprises : 13 minutes
(sans stemming) temps d'execution pour 37 000 entreprises : 11 minutes


In [57]:
# stemmer = nltk.stem.SnowballStemmer('english')
# pstemmer = nltk.stem.PorterStemmer()
# lemma = nltk.wordnet.WordNetLemmatizer()
# lemma.lemmatize('walking')
# stemmer.stem('walking')
# opko, health, inc., healthcare, company

In [58]:
stemmer = nltk.stem.SnowballStemmer('english')

# Built once at definition time. The original evaluated
# stopwords.words('english') for every single token, rebuilding the list and
# scanning it linearly each time; a frozenset gives constant-time lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess_description(description, stem=True):
  """Tokenize a company description and drop stopwords and punctuation.

  Args:
    description: Raw description text (a single string).
    stem: When True (default) each kept token is reduced to its stem with the
      module-level Snowball `stemmer`; when False tokens are only lowercased.

  Returns:
    list[str]: The lowercased (and optionally stemmed) tokens, in order.
  """
  filtered_words = []
  for word in word_tokenize(description):
    lowered = word.lower()
    # `word in string.punctuation` is a substring test on the punctuation
    # string, exactly as before: single punctuation characters are removed,
    # multi-character tokens such as "''" are kept.
    if lowered in _ENGLISH_STOPWORDS or word in string.punctuation:
      continue
    filtered_words.append(stemmer.stem(lowered) if stem else lowered)

  return filtered_words

# data = data.iloc[:100]
# Replace every raw description string by its token list (stemming on, the
# function's default). This overwrites the column in place, so re-running the
# cell would hand token lists, not strings, to word_tokenize.
# The notes above report roughly 11-13 minutes for the full dataset.
data['description'] = data['description'].apply(preprocess_description)

KeyboardInterrupt: 

In [None]:
data.head()

Unnamed: 0,companyName,exchange,industry,website,description,sector,country
0,Visa Inc.,New York Stock Exchange,Credit Services,https://usa.visa.com,"[visa, inc., oper, payment, technolog, compani...",Financial Services,US
1,"Texas Roadhouse, Inc.",NASDAQ Global Select,Restaurants,https://www.texasroadhouse.com,"[texa, roadhous, inc., togeth, subsidiari, ope...",Consumer Cyclical,US
2,"Lamb Weston Holdings, Inc.",New York Stock Exchange,Packaged Foods,https://www.lambweston.com,"[lamb, weston, hold, inc., produc, distribut, ...",Consumer Defensive,US
3,Intact Financial Corporation,Toronto Stock Exchange,Insurance—Property & Casualty,https://www.intactfc.com,"[intact, financi, corpor, subsidiari, provid, ...",Financial Services,CA
4,"OPKO Health, Inc.",NASDAQ Global Select,Diagnostics & Research,https://www.opko.com,"[opko, health, inc., healthcar, compani, engag...",Healthcare,US


In [None]:
type(data.iloc[0]['description'])

list

In [None]:
data.to_csv("../Data/data_tokenized.csv", index=True)

In [None]:
old_data = data.copy()

# Vectorisation

temps d'exe pour 37000 : 8 sec

In [59]:
def load_preprocess_data_tokenized(path="../Data/data_tokenized.csv"):
    """Load the tokenized company table and prepare it for vectorisation.

    Steps, in order:
      1. read the CSV and drop the unnamed first column;
      2. parse the `description` column back from its string form to a list;
      3. keep the first row of every `companyName`;
      4. append the company's raw country code and sector to its token list
         (missing values are skipped instead of being appended as NaN);
      5. drop the `exchange` and `website` columns;
      6. fold a few territory codes of the `country` column into a parent
         country code.

    Args:
        path: Location of the tokenized CSV. Defaults to the file written by
            the tokenization section of this notebook.

    Returns:
        pandas.DataFrame with columns companyName, industry, description,
        sector, country (any other input column is passed through).
    """
    import ast  # local import: only this function needs it

    data = pd.read_csv(path)
    data = data.drop(columns='Unnamed: 0')

    # literal_eval only accepts Python literals, unlike eval which would run
    # arbitrary code found in the CSV.
    data['description'] = data['description'].apply(ast.literal_eval)
    data = data.drop_duplicates('companyName')

    # The tokens receive the country code as stored in the file, i.e. before
    # the territory normalisation applied to the `country` column below.
    data['description'] = data.apply(
        lambda row: row['description']
        + [tag for tag in (row['country'], row['sector']) if pd.notna(tag)],
        axis=1,
    )
    data = data.drop(columns=['exchange', 'website'])

    # Territories folded into a parent country code. Guernsey (GG) is a
    # British Crown dependency and Martinique (MQ) is French; the two targets
    # were swapped in the previous version.
    data['country'] = data['country'].replace({'GB': 'UK',
                                               'TW': 'CN',
                                               'GG': 'UK',
                                               'RE': 'FR',
                                               'GF': 'FR',
                                               'MQ': 'FR',
                                               'JE': 'UK',
                                               'AI': 'UK',
                                               'AN': 'NL',
                                               'GI': 'UK'})
    return data

In [60]:
load_preprocess_data_tokenized()

Unnamed: 0,companyName,industry,description,sector,country
0,Visa Inc.,Credit Services,"[visa, inc., operates, payments, technology, c...",Financial Services,US
1,"Texas Roadhouse, Inc.",Restaurants,"[texas, roadhouse, inc., together, subsidiarie...",Consumer Cyclical,US
2,"Lamb Weston Holdings, Inc.",Packaged Foods,"[lamb, weston, holdings, inc., produces, distr...",Consumer Defensive,US
3,Intact Financial Corporation,Insurance—Property & Casualty,"[intact, financial, corporation, subsidiaries,...",Financial Services,CA
4,"OPKO Health, Inc.",Diagnostics & Research,"[opko, health, inc., healthcare, company, enga...",Healthcare,US
...,...,...,...,...,...
37577,"Global Brokerage, Inc.",Financial Data & Stock Exchanges,"[global, brokerage, inc., subsidiaries, provid...",Financial Services,US
37580,Prima Industrie SpA,Specialty Industrial Machinery,"[prima, industrie, spa, develops, manufactures...",Industrials,IT
37581,PT Putra Rajawali Kencana Tbk,Trucking,"[pt, putra, rajawali, kencana, tbk, engages, t...",Industrials,ID
37582,"Agritek Holdings, Inc.",Real Estate Services,"[agritek, holdings, inc., distributes, hemp, c...",Real Estate,US


In [None]:
# data['description'] = data['description'].apply(lambda x: [token for token in x if not token.isdigit()])

Temps d'execution pour 37 000 entreprises : 13s

In [61]:
from scipy import sparse

def build_tfidf(df, max_features=5000, save_path="../Data/tfidf_matrix.npz"):
  """Fit a TF-IDF model on the `description` column and attach one vector per row.

  The input frame is modified in place (a `vec_tfidf` column is added) and
  also returned, so both `build_tfidf(df)` and `df = build_tfidf(df)` work.

  Args:
    df: DataFrame with a `description` column holding token lists.
    max_features: Vocabulary size passed to TfidfVectorizer.
    save_path: Where the sparse matrix is written with scipy's save_npz;
      pass None to skip writing the file.

  Returns:
    The same DataFrame, with `vec_tfidf` holding one sparse row per company.
  """
  tfidf_vectorizer = TfidfVectorizer(max_features=max_features)

  # Each token list is rendered with str() and re-tokenized by the
  # vectorizer's own default analyzer.
  tfidf_matrix = tfidf_vectorizer.fit_transform(df['description'].astype(str))

  if save_path is not None:
    sparse.save_npz(save_path, tfidf_matrix)

  # Iterating the matrix yields its rows one by one. The previous version
  # also built this list a first time and threw it away.
  df['vec_tfidf'] = list(tfidf_matrix)
  return df

data = load_preprocess_data_tokenized()
data = build_tfidf(data)

In [64]:

# This bare expression is not the last line of the cell, so its value is
# computed and then discarded.
data['vec_tfidf'].iloc[0].toarray()
# Print the first company's TF-IDF row converted to a dense array.
print(data['vec_tfidf'].iloc[0].toarray())   

5000


In [65]:
# data['vec_tfidf'].values[0].toarray()

## Word2Vec

In [66]:
desc0 = data['description'][0]
desc0

['visa',
 'inc.',
 'operates',
 'payments',
 'technology',
 'company',
 'worldwide',
 'company',
 'facilitates',
 'digital',
 'payments',
 'among',
 'consumers',
 'merchants',
 'financial',
 'institutions',
 'businesses',
 'strategic',
 'partners',
 'government',
 'entities',
 'operates',
 'visanet',
 'transaction',
 'processing',
 'network',
 'enables',
 'authorization',
 'clearing',
 'settlement',
 'payment',
 'transactions',
 'addition',
 'company',
 'offers',
 'card',
 'products',
 'platforms',
 'value-added',
 'services',
 'provides',
 'services',
 'visa',
 'visa',
 'electron',
 'interlink',
 'vpay',
 'plus',
 'brands',
 'visa',
 'inc.',
 'strategic',
 'agreement',
 'ooredoo',
 'provide',
 'enhanced',
 'payment',
 'experience',
 'visa',
 'cardholders',
 'ooredoo',
 'customers',
 'qatar',
 'visa',
 'inc.',
 'founded',
 '1958',
 'headquartered',
 'san',
 'francisco',
 'california',
 'US',
 'Financial Services']

In [71]:
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api
from gensim.downloader import load

# gensim's downloader fetches the vectors when they are not cached yet and
# returns them already loaded in memory.
model = load('word2vec-google-news-300')

# Location of the cached model file (no second load happens here).
path = api.load("word2vec-google-news-300", return_path=True)

# Same object under the second name used by this cell. The previous version
# re-read the binary file with KeyedVectors.load_word2vec_format, which held a
# second full copy of the embeddings in memory that no visible cell uses.
word2vec_model = model

def calculate_average_word2vec(tokens, model, num_features):
    """Average the embedding vectors of the tokens known to `model`.

    Args:
        tokens: Iterable of tokens.
        model: Mapping-like object supporting `token in model` and `model[token]`.
        num_features: Length of the embedding vectors.

    Returns:
        The mean vector of the known tokens, or a float32 zero vector of
        length `num_features` when no token is known.
    """
    running_sum = np.zeros((num_features,), dtype="float32")
    known_tokens = [token for token in tokens if token in model]

    for token in known_tokens:
        running_sum = running_sum + model[token]

    if not known_tokens:
        return running_sum
    return running_sum / len(known_tokens)

# One averaged 300-dimensional Word2Vec vector per company, computed from the
# tokens of its description.
data['description_word2vec'] = data['description'].apply(
    lambda x: calculate_average_word2vec(x, model, num_features=300)
)



# Calcul de distances

### Cosine distance

In [None]:
# data.loc[data['companyName'] == 'Visa Inc.']['vec_tfidf'].iloc[0].toarray()
# data

In [72]:
from numpy.linalg import norm

def euclidean_distance(vec1, vec2):
    """Return the norm of the difference between `vec1` and `vec2`."""
    difference = vec1 - vec2
    return norm(difference)

Temps d'execution pour 37 000 entreprises : plante car la RAM est insuffisante (13 gb)

In [88]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.spatial import distance
import pandas as pd

def _stack_vectors(vectors):
    """Stack per-company vectors into one 2-D matrix, staying sparse when every row is sparse."""
    from scipy import sparse  # local import keeps this cell self-contained

    if vectors and all(sparse.issparse(vector) for vector in vectors):
        return sparse.vstack(vectors, format='csr')
    return np.vstack([
        vector.toarray()[0] if hasattr(vector, 'toarray') else np.asarray(vector)
        for vector in vectors
    ])


def find_similar_companies(target_company, df, vector_column='vec_tfidf', n=5, method='tfidf', metric='cosine'):
    """Rank the companies of `df` by closeness to `target_company`.

    Args:
        target_company: Exact value to look up in `df['companyName']`.
        df: DataFrame with a `companyName` column and a vector column.
        vector_column: Column holding one vector per company (sparse rows or
            1-D arrays).
        n: Number of companies to return.
        method: Kept for backward compatibility; the vector type is detected
            from the data itself.
        metric: 'cosine' (column 'Similarity', highest first) or 'euclidean'
            (column 'Distance', smallest first).

    Returns:
        DataFrame with `companyName` and the score column, target excluded,
        limited to `n` rows. An empty DataFrame when the company is unknown.

    Raises:
        ValueError: If `metric` is neither 'cosine' nor 'euclidean'.
    """
    # Ensure the target company exists in the DataFrame
    is_target = (df['companyName'] == target_company).to_numpy()
    if not is_target.any():
        print("Vérifier le nom de l'entreprise.")
        return pd.DataFrame()

    if metric not in ('cosine', 'euclidean'):
        raise ValueError(f"Unsupported metric {metric!r}; expected 'cosine' or 'euclidean'.")

    # Sparse rows are stacked into one sparse matrix instead of densifying
    # every company vector, which is what exhausted memory on the full dataset.
    matrix = _stack_vectors(list(df[vector_column]))
    target_position = int(np.flatnonzero(is_target)[0])
    target_vector = matrix[target_position:target_position + 1]  # slice keeps it 2-D

    if metric == 'cosine':
        score_column = 'Similarity'
        scores = cosine_similarity(target_vector, matrix)[0]
        sort_ascending = False
    else:
        # Raw distances, nearest first. The previous version stored inverted
        # distances under 'Distance' and still sorted ascending, which
        # returned the most distant companies.
        score_column = 'Distance'
        scores = euclidean_distances(target_vector, matrix)[0]
        sort_ascending = True

    similar_companies = pd.DataFrame({'companyName': df['companyName'], score_column: scores})
    similar_companies = similar_companies.sort_values(by=score_column, ascending=sort_ascending)

    # Exclude the target company from the results
    similar_companies = similar_companies[similar_companies['companyName'] != target_company]

    return similar_companies.head(n)

# Example usage
result_tfidf_cosine = find_similar_companies("Visa Inc.", data, 'vec_tfidf', n=10, method='tfidf', metric='cosine')
result_tfidf_euclidean = find_similar_companies("Visa Inc.", data, 'vec_tfidf', n=10, method='tfidf', metric='euclidean')

result_word2vec_cosine = find_similar_companies("Visa Inc.", data, 'description_word2vec', n=10, method='word2vec', metric='cosine')
result_word2vec_euclidean = find_similar_companies("Visa Inc.", data, 'description_word2vec', n=10, method='word2vec', metric='euclidean')

print("TF-IDF Cosine Similarity:\n", result_tfidf_cosine)

print("\nWord2Vec Cosine Similarity:\n", result_word2vec_cosine)


TF-IDF Cosine Similarity:
                               companyName  Similarity
18055  BLS International Services Limited    0.415832
1077                            Mogo Inc.    0.311937
4973                           Adyen N.V.    0.292689
916               Mastercard Incorporated    0.292129
5910                           Usio, Inc.    0.268081
13926                            XTM Inc.    0.261167
1153               AppTech Payments Corp.    0.247371
15013                       Isracard Ltd.    0.245550
11669                   Plexian AB (publ)    0.244023
1553                  ACI Worldwide, Inc.    0.231816

Word2Vec Cosine Similarity:
                      companyName  Similarity
916      Mastercard Incorporated    0.901665
1077                   Mogo Inc.    0.900398
5910                  Usio, Inc.    0.889770
28560  GMO Payment Gateway, Inc.    0.889736
2600               EVERTEC, Inc.    0.889080
1975                Fiserv, Inc.    0.886142
887           i3 Verticals, Inc.  

In [86]:
# Same comparison for Apple, top five companies, cosine similarity only.
# Reuses (and overwrites) the result variables of the Visa example.
result_tfidf_cosine = find_similar_companies("Apple Inc.", data, 'vec_tfidf', n=5, method='tfidf', metric='cosine')

result_word2vec_cosine = find_similar_companies("Apple Inc.", data, 'description_word2vec', n=5, method='word2vec', metric='cosine')

print("TF-IDF Cosine Similarity:\n", result_tfidf_cosine)

print("\nWord2Vec Cosine Similarity:\n", result_word2vec_cosine)


TF-IDF Cosine Similarity:
                           companyName  Similarity
10351        Apple Rush Company, Inc.    0.581192
2550               Jamf Holding Corp.    0.461622
19301  S P V I Public Company Limited    0.427754
34203                    Simply, Inc.    0.312205
15466             Quest Holdings S.A.    0.296608

Word2Vec Cosine Similarity:
                   companyName  Similarity
9950      Electronic Arts Inc    0.909886
37         Best Buy Co., Inc.    0.907601
35           Amazon.com, Inc.    0.907427
370         T-Mobile US, Inc.    0.903812
29721  Sourcenext Corporation    0.902151


In [None]:
# def from_token_to_ranking(name="Visa Inc."):
#   """
#   fonction de test uniquement, il ne faut pas load le csv comme cela, car il y a des traitements à faire dessus.
#   """

#   df = pd.read_csv("/content/drive/MyDrive/S9/DDEFI/Projet_DDEFI/code/data_tokenized.csv")
#   df.drop(columns='Unnamed: 0', inplace=True)
#   df=df.iloc[:10]

#   df = build_tfidf(df)

#   result_tfidf = find_similar_companies(name, df, 'vec_tfidf', n=20)
#   return result_tfidf

# # from_token_to_ranking("Visa Inc.")
# from_token_to_ranking("Texas Roadhouse, Inc.")
# #from_token_to_ranking("Lamb Weston Holdings, Inc.")
# #from_token_to_ranking("Ferroglobe PLC")

# GDP per capita bis (World Bank)

In [81]:
# Economy name (as spelled in the World Bank GDP file) -> two-letter code used
# in the companies' `country` column. The United Kingdom is 'UK' to match the
# GB -> UK normalisation done when the company data is loaded.
# Every code must be unique: the GDP similarity matrix is labelled with these
# codes and looked up with .loc, so a duplicated code returns several rows.
dict_mapping_name_abb = {
    'Aruba': 'AW',
    'Afghanistan': 'AF',
    'Angola': 'AO',
    'Albania': 'AL',
    'Andorra': 'AD',
    'United Arab Emirates': 'AE',
    'Argentina': 'AR',
    'Armenia': 'AM',
    'American Samoa': 'AS',
    'Antigua and Barbuda': 'AG',
    'Australia': 'AU',
    'Austria': 'AT',
    'Azerbaijan': 'AZ',
    'Burundi': 'BI',
    'Belgium': 'BE',
    'Benin': 'BJ',
    'Burkina Faso': 'BF',
    'Bangladesh': 'BD',
    'Bulgaria': 'BG',
    'Bahrain': 'BH',
    'Bahamas, The': 'BS',
    'Bosnia and Herzegovina': 'BA',
    'Belarus': 'BY',
    'Belize': 'BZ',
    'Bermuda': 'BM',
    'Bolivia': 'BO',
    'Brazil': 'BR',
    'Barbados': 'BB',
    'Brunei Darussalam': 'BN',
    'Bhutan': 'BT',
    'Botswana': 'BW',
    'Central African Republic': 'CF',
    'Canada': 'CA',
    'Switzerland': 'CH',
    # Was 'XK', which collided with Kosovo below. 'JG' is the two-letter code
    # the World Bank assigns to the Channel Islands aggregate.
    'Channel Islands': 'JG',
    'Chile': 'CL',
    'China': 'CN',
    "Cote d'Ivoire": 'CI',
    'Cameroon': 'CM',
    'Congo, Dem. Rep.': 'CD',
    'Congo, Rep.': 'CG',
    'Colombia': 'CO',
    'Comoros': 'KM',
    'Cabo Verde': 'CV',
    'Costa Rica': 'CR',
    'Cuba': 'CU',
    'Curacao': 'CW',
    'Cayman Islands': 'KY',
    'Cyprus': 'CY',
    'Czechia': 'CZ',
    'Germany': 'DE',
    'Djibouti': 'DJ',
    'Dominica': 'DM',
    'Denmark': 'DK',
    'Dominican Republic': 'DO',
    'Algeria': 'DZ',
    'Ecuador': 'EC',
    'Egypt, Arab Rep.': 'EG',
    'Eritrea': 'ER',
    'Spain': 'ES',
    'Estonia': 'EE',
    'Ethiopia': 'ET',
    'Finland': 'FI',
    'Fiji': 'FJ',
    'France': 'FR',
    'Faroe Islands': 'FO',
    'Micronesia, Fed. Sts.': 'FM',
    'Gabon': 'GA',
    'United Kingdom': 'UK',
    'Georgia': 'GE',
    'Ghana': 'GH',
    'Gibraltar': 'GI',
    'Guinea': 'GN',
    'Gambia, The': 'GM',
    'Guinea-Bissau': 'GW',
    'Equatorial Guinea': 'GQ',
    'Greece': 'GR',
    'Grenada': 'GD',
    'Greenland': 'GL',
    'Guatemala': 'GT',
    'Guam': 'GU',
    'Guyana': 'GY',
    'Hong Kong SAR, China': 'HK',
    'Honduras': 'HN',
    'Croatia': 'HR',
    'Haiti': 'HT',
    'Hungary': 'HU',
    'Indonesia': 'ID',
    'Isle of Man': 'IM',
    'India': 'IN',
    'Ireland': 'IE',
    'Iran, Islamic Rep.': 'IR',
    'Iraq': 'IQ',
    'Iceland': 'IS',
    'Israel': 'IL',
    'Italy': 'IT',
    'Jamaica': 'JM',
    'Jordan': 'JO',
    'Japan': 'JP',
    'Kazakhstan': 'KZ',
    'Kenya': 'KE',
    'Kyrgyz Republic': 'KG',
    'Cambodia': 'KH',
    'Kiribati': 'KI',
    'St. Kitts and Nevis': 'KN',
    'Korea, Rep.': 'KR',
    'Kuwait': 'KW',
    'Lao PDR': 'LA',
    'Lebanon': 'LB',
    'Liberia': 'LR',
    'Libya': 'LY',
    'St. Lucia': 'LC',
    'Liechtenstein': 'LI',
    'Sri Lanka': 'LK',
    'Lesotho': 'LS',
    'Lithuania': 'LT',
    'Luxembourg': 'LU',
    'Latvia': 'LV',
    'Macao SAR, China': 'MO',
    'St. Martin (French part)': 'MF',
    'Morocco': 'MA',
    'Monaco': 'MC',
    'Moldova': 'MD',
    'Madagascar': 'MG',
    'Maldives': 'MV',
    'Mexico': 'MX',
    'Marshall Islands': 'MH',
    'North Macedonia': 'MK',
    'Mali': 'ML',
    'Malta': 'MT',
    'Myanmar': 'MM',
    'Montenegro': 'ME',
    'Mongolia': 'MN',
    'Northern Mariana Islands': 'MP',
    'Mozambique': 'MZ',
    'Mauritania': 'MR',
    'Mauritius': 'MU',
    'Malawi': 'MW',
    'Malaysia': 'MY',
    'Namibia': 'NA',
    'New Caledonia': 'NC',
    'Niger': 'NE',
    'Nigeria': 'NG',
    'Nicaragua': 'NI',
    'Netherlands': 'NL',
    'Norway': 'NO',
    'Nepal': 'NP',
    'Nauru': 'NR',
    'New Zealand': 'NZ',
    'Oman': 'OM',
    'Pakistan': 'PK',
    'Panama': 'PA',
    'Peru': 'PE',
    'Philippines': 'PH',
    'Palau': 'PW',
    'Papua New Guinea': 'PG',
    'Poland': 'PL',
    'Puerto Rico': 'PR',
    "Korea, Dem. People's Rep.": 'KP',
    'Portugal': 'PT',
    'Paraguay': 'PY',
    'West Bank and Gaza': 'PS',
    'French Polynesia': 'PF',
    'Qatar': 'QA',
    'Romania': 'RO',
    'Russian Federation': 'RU',
    'Rwanda': 'RW',
    'Saudi Arabia': 'SA',
    'Sudan': 'SD',
    'Senegal': 'SN',
    'Singapore': 'SG',
    'Solomon Islands': 'SB',
    'Sierra Leone': 'SL',
    'El Salvador': 'SV',
    'San Marino': 'SM',
    'Somalia': 'SO',
    'Serbia': 'RS',
    'South Sudan': 'SS',
    'Sao Tome and Principe': 'ST',
    'Suriname': 'SR',
    'Slovak Republic': 'SK',
    'Slovenia': 'SI',
    'Sweden': 'SE',
    'Eswatini': 'SZ',
    'Sint Maarten (Dutch part)': 'SX',
    'Seychelles': 'SC',
    'Syrian Arab Republic': 'SY',
    'Turks and Caicos Islands': 'TC',
    'Chad': 'TD',
    'Togo': 'TG',
    'Thailand': 'TH',
    'Tajikistan': 'TJ',
    'Turkmenistan': 'TM',
    'Timor-Leste': 'TL',
    'Tonga': 'TO',
    'Trinidad and Tobago': 'TT',
    'Tunisia': 'TN',
    'Turkiye': 'TR',
    'Tuvalu': 'TV',
    'Tanzania': 'TZ',
    'Uganda': 'UG',
    'Ukraine': 'UA',
    'Uruguay': 'UY',
    'United States': 'US',
    'Uzbekistan': 'UZ',
    'St. Vincent and the Grenadines': 'VC',
    'Venezuela, RB': 'VE',
    'British Virgin Islands': 'VG',
    'Virgin Islands (U.S.)': 'VI',
    'Vietnam': 'VN',
    'Vanuatu': 'VU',
    'Samoa': 'WS',
    'Kosovo': 'XK',
    'Yemen, Rep.': 'YE',
    'South Africa': 'ZA',
    'Zambia': 'ZM',
    'Zimbabwe': 'ZW'
}

In [None]:
def build_GDP_similarity_matrix(file_path="../Data/GDP per capita (current US$).csv", name_to_code=None):
    """Build a country-by-country similarity matrix from GDP per capita.

    For two economies with GDP per capita a and b:
        distance   = |a - b| / (a + b)
        similarity = 1 / (1 + distance)
    so identical values give 1.0 and the matrix is symmetric.

    Args:
        file_path: CSV with the columns 'Economy', 'Economy Code', 'Year' and
            'GDP per capita (current US$)'.
        name_to_code: Mapping from economy name to the code used as row and
            column label. Defaults to the notebook-level
            `dict_mapping_name_abb`.

    Returns:
        pandas.DataFrame indexed and labelled by country code. Economies
        without a code in the mapping are left out; previously they all ended
        up under a NaN label. Economies with a missing GDP value keep a row
        and column of NaN.
    """
    if name_to_code is None:
        name_to_code = dict_mapping_name_abb

    data_GDP = pd.read_csv(file_path)
    data_GDP = data_GDP.rename(columns={'GDP per capita (current US$)': 'GDPpc'})
    data_GDP = data_GDP.drop(columns=['Economy Code', 'Year'])

    # These two economies are removed explicitly, as in the original version.
    excluded = data_GDP['Economy'].isin(['Eritrea', "Korea, Dem. People's Rep."])
    data_GDP = (
        data_GDP[~excluded]
        .assign(Economy=lambda frame: frame['Economy'].map(name_to_code))
        .dropna(subset=['Economy'])
    )

    # Broadcasting a column against a row evaluates every pair at once,
    # replacing the nested Python loops.
    gdp = data_GDP['GDPpc'].to_numpy(dtype=float)
    gdp_rows, gdp_cols = gdp[:, None], gdp[None, :]
    distance = np.abs(gdp_rows - gdp_cols) / (gdp_rows + gdp_cols)
    similarity_matrix = 1 / (1 + distance)

    return pd.DataFrame(similarity_matrix, index=data_GDP['Economy'], columns=data_GDP['Economy'])

GDP_similarity_matrix = build_GDP_similarity_matrix()

# liste de keywords

- problème du stemming : on propose des stemming ou des tokens entiers ?

In [None]:
# NOTE: in the visible notebook `df_full` is only assigned in a later cell, so
# this cell depends on that cell having been run first.
all_tokens = df_full['description'].explode().tolist() # flatten all the token lists into one list
unique_tokens = set(all_tokens) # pool of distinct tokens
token_counts = pd.Series(all_tokens).value_counts() # number of occurrences of each token

In [None]:
len(all_tokens)/df_full.shape[0]  # average number of tokens per description (about 85 in the recorded run)

85.56162316607275

In [None]:
token_counts.quantile(0.9999)/df_full.shape[0]*100 # upper cutoff: 99.99th-percentile token count as a percentage of the number of companies (about 56% in the recorded run)

56.4085190535757

In [None]:
token_counts.quantile(0.7)/df_full.shape[0]*100 # lower cutoff: 70th-percentile token count as a percentage of the number of companies (about 0.0064% in the recorded run)

0.006420751870043982

In [None]:
# Candidate keywords: tokens whose total count lies between the 70th and the
# 99.99th percentile of all token counts (both bounds included).
list_keywords = list(token_counts[(token_counts.quantile(0.7)<=token_counts)&(token_counts<=token_counts.quantile(0.9999))].index)

# filtrage par secteur et keywords

- Comment appliquer un stemming sur les keywords tout en proposant les mots en entier ?
- problème de filtrage : diagramme de Venn ET/OU ?
- faire une liste de keywords en triant les tokens de description par fréquence d'apparition
- une fois qu'on a fait la liste des keywords possibles à partir du dataframe, demander à ChatGPT si les keywords peuvent aussi correspondre à d'autres entreprises (dans le cas où une description est trop pauvre en mots)

In [None]:
THRESHOLD_SIZE_FINAL_GROUP=1000  # taille du groupe que l'on veut avoir pour commencer l'analyse avec les données financières

In [None]:
data = load_preprocess_data_tokenized()
data = build_tfidf(data)
df=data.copy()[:100]

In [None]:
df_full = data.copy()

## Filtrage secteur

In [None]:
df_full

Unnamed: 0,companyName,industry,description,sector,country,vec_tfidf
0,Visa Inc.,Credit Services,"[visa, inc., oper, payment, technolog, compani...",Financial Services,US,"(0, 69507)\t0.04329421873343035\n (0, 28536..."
1,"Texas Roadhouse, Inc.",Restaurants,"[texa, roadhous, inc., togeth, subsidiari, ope...",Consumer Cyclical,US,"(0, 20183)\t0.06172506483002616\n (0, 18440..."
2,"Lamb Weston Holdings, Inc.",Packaged Foods,"[lamb, weston, hold, inc., produc, distribut, ...",Consumer Defensive,US,"(0, 21254)\t0.06293066474975732\n (0, 24134..."
3,Intact Financial Corporation,Insurance—Property & Casualty,"[intact, financi, corpor, subsidiari, provid, ...",Financial Services,CA,"(0, 13404)\t0.03851612061606596\n (0, 78417..."
4,"OPKO Health, Inc.",Diagnostics & Research,"[opko, health, inc., healthcar, compani, engag...",Healthcare,US,"(0, 34610)\t0.03688475698854317\n (0, 29088..."
...,...,...,...,...,...,...
37577,"Global Brokerage, Inc.",Financial Data & Stock Exchanges,"[global, brokerag, inc., subsidiari, provid, o...",Financial Services,US,"(0, 30460)\t0.1793094610567329\n (0, 22281)..."
37580,Prima Industrie SpA,Specialty Industrial Machinery,"[prima, industri, spa, develop, manufactur, ma...",Industrials,IT,"(0, 17803)\t0.10854261371782986\n (0, 62937..."
37581,PT Putra Rajawali Kencana Tbk,Trucking,"[pt, putra, rajawali, kencana, tbk, engag, tra...",Industrials,ID,"(0, 64075)\t0.18442821739381648\n (0, 42205..."
37582,"Agritek Holdings, Inc.",Real Estate Services,"[agritek, hold, inc., distribut, hemp, cannabi...",Real Estate,US,"(0, 49001)\t0.1490741453938388\n (0, 34947)..."


In [None]:
def _has_keyword(frame, keyword):
    """Boolean mask: True where the row's `description` token list contains `keyword`."""
    return np.fromiter(
        (keyword in tokens for tokens in frame['description']),
        dtype=bool,
        count=len(frame),
    )


def reduce_df_size(df, sector, list_keywords, max_size=None):
    """Select a peer group for a sector, widened then narrowed by keywords.

    1. Widening: keep every company of `sector`, plus every company of another
       sector whose description contains at least one keyword (an OR over the
       keywords). Each company is listed once, even if it matches several
       keywords.
    2. Narrowing: while the group is larger than `max_size`, take the next
       keyword (in list order, so put the most important first) and keep only
       the rows containing it. This filter applies to the whole group, so it
       can also remove companies of `sector`. When the keywords run out, the
       user is asked for a new one on standard input.

    Args:
        df: DataFrame with `sector` and `description` (token list) columns.
        sector: Sector whose companies seed the group.
        list_keywords: Keywords, most important first. Not modified.
        max_size: Largest accepted group size. Defaults to the notebook-level
            THRESHOLD_SIZE_FINAL_GROUP.

    Returns:
        The filtered DataFrame (a new object; `df` is left untouched).
    """
    if max_size is None:
        max_size = THRESHOLD_SIZE_FINAL_GROUP
    pending_keywords = list(list_keywords)

    same_sector = (df['sector'] == sector).to_numpy()
    print('1', df[same_sector].shape)

    # Widening: OR of the keyword matches, computed as a single mask so that a
    # company matching several keywords is not added several times.
    matches_any_keyword = np.zeros(len(df), dtype=bool)
    for keyword in pending_keywords:
        matches_any_keyword |= _has_keyword(df, keyword)
    filtered_df = pd.concat([df[same_sector], df[~same_sector & matches_any_keyword]])

    print('2', filtered_df.shape)

    # Narrowing: conjunctive (AND) filtering, one keyword at a time.
    while filtered_df.shape[0] > max_size:
        if len(pending_keywords) == 0:
            new_keyword = input("Il n'y a pas assez de keyword pour réduire la zone de recherche (TAILLE ZONE = {}/TAILLE MAX = {}), veuillez en ajouter un : ".format(filtered_df.shape[0], max_size))
            pending_keywords.append(new_keyword)
        keyword = pending_keywords.pop(0)
        filtered_df = filtered_df[_has_keyword(filtered_df, keyword)]

    print('3', filtered_df.shape)

    return filtered_df



def filter_on_keyword(df, keyword):
    """Return the rows of `df` whose `description` contains `keyword`."""
    keyword_present = df['description'].apply(lambda tokens: keyword in tokens)
    return df[keyword_present]

In [None]:
df_full.sector.unique()

array(['Financial Services', 'Consumer Cyclical', 'Consumer Defensive',
       'Healthcare', 'Basic Materials', 'Technology', 'Energy',
       'Utilities', 'Industrials', 'Communication Services',
       'Real Estate', nan], dtype=object)

In [None]:
import random

# test_list_keyword = random.sample(list_keywords, 15)
# test_list_keyword = [
#     # 'company', 
#                    # 'inc.', 'product',
#                     'produces', 
#                    'healthcare'
#                   ]

# Sector names can be used as keywords here because
# load_preprocess_data_tokenized appends each company's sector to its
# description tokens.
test_list_keyword = ['Utilities', 'Communication Services']






test_sector = 'Industrials'

# The three lines printed by reduce_df_size are the group shapes after each stage.
filtered_df = reduce_df_size(df_full, sector=test_sector, list_keywords=test_list_keyword)
filtered_df

1 (5293, 6)
2 (7356, 6)
3 (640, 6)


Unnamed: 0,companyName,industry,description,sector,country,vec_tfidf
18,Xcel Energy Inc.,Utilities—Regulated Electric,"[xcel, energi, inc., subsidiari, generat, purc...",Utilities,US,"(0, 81195)\t0.0761761948764454\n (0, 50486)..."
45,PG&E Corporation,Utilities—Regulated Electric,"[pg, e, corpor, subsidiari, pacif, gas, electr...",Utilities,US,"(0, 702)\t0.11030611729742826\n (0, 3964)\t..."
59,"American Electric Power Company, Inc.",Utilities—Regulated Electric,"[american, electr, power, compani, inc., elect...",Utilities,US,"(0, 17895)\t0.14865691524689953\n (0, 703)\..."
61,Emera Incorporated,Utilities—Regulated Electric,"[emera, incorpor, energi, servic, compani, sub...",Utilities,CA,"(0, 33858)\t0.1011243161084195\n (0, 3755)\..."
67,Brookfield Renewable Corporation,Utilities—Renewable,"[brookfield, renew, corpor, own, oper, portfol...",Utilities,US,"(0, 2093)\t0.3154653083496897\n (0, 17836)\..."
...,...,...,...,...,...,...
37347,PETRONAS Gas Berhad,Utilities—Regulated Gas,"[petrona, gas, berhad, oper, gas, infrastructu...",Utilities,MY,"(0, 59472)\t0.097547969435249\n (0, 31143)\..."
37423,New Energy Solar Limited,Utilities—Renewable,"[new, energi, solar, limit, acquir, own, manag...",Utilities,AU,"(0, 7711)\t0.10777822145067113\n (0, 68269)..."
37448,PT. Terregra Asia Energy Tbk,Utilities—Renewable,"[pt, terregra, asia, energi, tbk, focus, devel...",Utilities,ID,"(0, 49119)\t0.1975860383144303\n (0, 76794)..."
37501,Volt Power Group Limited,Utilities—Renewable,"[volt, power, group, limit, togeth, subsidiari...",Utilities,AU,"(0, 42374)\t0.18883685613398693\n (0, 25786..."


# Join description and GDP similarities

In [None]:
data = load_preprocess_data_tokenized()
data = build_tfidf(data)
df=data.copy()[:100]

In [None]:
# df

In [None]:
def find_similar_companies(target_company, df, vector_column='vec_tfidf', n=5, similarity='cosine'):
    """Rank every company of `df` by cosine similarity to `target_company`.

    This definition replaces any earlier `find_similar_companies` in the
    notebook. Unlike that one, it keeps the target in the result and also
    returns each company's `country`.

    Args:
        target_company: Exact value to look up in `df['companyName']`.
        df: DataFrame with `companyName`, `country` and a vector column.
        vector_column: Column holding one vector per company (sparse rows or
            1-D arrays).
        n: Accepted for signature compatibility; the full ranking is returned.
        similarity: Only 'cosine' is implemented.

    Returns:
        DataFrame with `companyName`, `Similarity` and `country`, sorted by
        decreasing similarity, or -1 when the company is not found.

    Raises:
        ValueError: If `similarity` is not 'cosine' (previously this ended in
            an UnboundLocalError at the return statement).
    """
    # Look the target up before doing any per-row work.
    target_rows = df.loc[df['companyName'] == target_company, vector_column]
    if target_rows.shape[0] == 0:
        print("Vérifier le nom de l'entreprise.")
        return -1

    if similarity != 'cosine':
        raise ValueError(f"Unsupported similarity {similarity!r}; only 'cosine' is implemented.")

    def as_dense_row(vector):
        # Sparse rows are flattened to 1-D; dense vectors pass through.
        return vector.toarray()[0] if hasattr(vector, 'toarray') else np.asarray(vector)

    target_vector = as_dense_row(target_rows.iloc[0])
    all_vectors = [as_dense_row(vector) for vector in df[vector_column]]
    similarities = cosine_similarity([target_vector], all_vectors)

    similar_companies = pd.DataFrame({'companyName': df['companyName'], 'Similarity': similarities[0], 'country': df['country']})
    return similar_companies.sort_values(by='Similarity', ascending=False)

# find_similar_companies(target_company = 'Visa Inc.', df=df)

In [None]:
# ranking_similarity = find_similar_companies('Visa Inc.', df)
# display(ranking_similarity)

def build_similarity_GDP_column(ranking_similarity_description):
    """Add a GDP-per-capita similarity column to a description ranking.

    The country of the first row is used as the target country, so the
    ranking is expected to start with the target company itself. The GDP
    similarity matrix is rebuilt on every call.

    Args:
        ranking_similarity_description: DataFrame with `companyName` and
            `country` columns. Not modified.

    Returns:
        A copy of the ranking with a `similarity_GDP` column added and the
        `country` column removed.
    """
    ranking = ranking_similarity_description.copy()
    gdp_matrix = build_GDP_similarity_matrix()

    # Only read the target country when there is at least one row.
    target_country = ranking.iloc[0]['country'] if len(ranking) else None

    gdp_by_company = {}
    for company, country in zip(ranking['companyName'], ranking['country']):
        gdp_by_company[company] = gdp_matrix.loc[target_country, country]

    ranking['similarity_GDP'] = [gdp_by_company[company] for company in ranking['companyName']]
    return ranking.drop(columns=['country'])

In [None]:
def combine_similarities(ranking_similarity, beta=0.5):
    """Blend description and GDP similarity into a single `Similarity` score.

    The result is (Similarity + beta * similarity_GDP) / (1 + beta), i.e. a
    weighted mean in which GDP similarity has weight `beta` relative to the
    description similarity.

    Args:
        ranking_similarity: DataFrame with `Similarity` and `similarity_GDP`
            columns. Not modified.
        beta: Relative weight of the GDP similarity.

    Returns:
        A copy with `Similarity` replaced by the blended score and the
        `similarity_GDP` column removed.
    """
    combined = ranking_similarity.copy()
    normaliser = 1 / (1 + beta)
    combined['Similarity'] = normaliser * (combined['Similarity'] + beta * combined['similarity_GDP'])
    return combined.drop(columns=['similarity_GDP'])

In [None]:
def compute_similarity(df, target_name='Visa Inc.'):
    """Rank the companies of `df` against `target_name` on description and GDP.

    Chains the three steps of the pipeline: description ranking, GDP
    similarity column, then blending of the two scores with beta = 0.5.
    """
    working_frame = df.copy()
    description_ranking = find_similar_companies(target_name, working_frame)
    ranking_with_gdp = build_similarity_GDP_column(description_ranking)
    return combine_similarities(ranking_with_gdp, beta=0.5)

In [None]:
filtered_df.iloc[0].companyName

'Xcel Energy Inc.'

In [None]:
compute_similarity(filtered_df, target_name='Xcel Energy Inc.')

Unnamed: 0,companyName,Similarity
18,Xcel Energy Inc.,1.000000
1211,Black Hills Corporation,0.679636
2407,"CenterPoint Energy, Inc.",0.663868
746,Duke Energy Corporation,0.662362
960,Exelon Corporation,0.661961
...,...,...
14433,Agripower France SA,0.266533
17290,Dlaboratory Sweden AB (publ),0.299005
10717,Ekopak NV,0.284765
5006,Companhia de Saneamento de Minas Gerais,0.193931
