# METHOD 1

In [2]:

import pandas as pd
import spacy
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os



In [4]:
# Load data
df_train = pd.read_excel("TrainingSet.xlsx")
df_capabilities = pd.read_excel("List of capabilities.xlsx")

# Load Spacy Model for preprocessing
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [5]:
def preprocess_text(doc):
    return [token.lemma_.lower() for token in doc if token.is_alpha and not token.is_stop]

# Tokenize and preprocess texts
df_train['tokens'] = df_train['Target Business Description'].apply(nlp).apply(preprocess_text)
df_capabilities['tokens'] = df_capabilities['Technologies/Skills'].astype(str).apply(nlp).apply(preprocess_text)

In [6]:
# Train Word2Vec model
model = Word2Vec(df_train['tokens'].tolist() + df_capabilities['tokens'].tolist(), vector_size=100, window=5, min_count=1, workers=4)

In [7]:
# Function to create averaged word vector for phrases
def vectorize(tokens):
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Vectorize capabilities
capability_vectors = np.array([vectorize(tokens) for tokens in df_capabilities['tokens']])

In [8]:
# Identify new capabilities
new_capabilities = set()
similarity_threshold = 0.5  # Adjust as necessary

for tokens in df_train['tokens']:
    for token in tokens:
        token_vec = vectorize([token])
        if token_vec.any():
            sims = cosine_similarity([token_vec], capability_vectors)
            if all(sim < similarity_threshold for sim in sims[0]):
                new_capabilities.add(token)

print("New capabilities to consider adding:", new_capabilities)

New capabilities to consider adding: {'saddlehorn', 'manali', 'chuo', 'isimaren', 'fig', 'denbury', 'greenagro', 'inversora', 'castor', 'bangladesh', 'tw', 'noordzeewind', 'cheshire', 'cupiagua', 'aberdeen', 'adco', 'gi', 'totalenergies', 'archaea', 'creation', 'liege', 'dow', 'nbl', 'neustadt', 'convergent', 'zap', 'vinythai', 'polyprophylene', 'isemaren', 'cryogenic', 'companhia', 'lights', 'raiznext', 'suncor', 'biobutanol', 'comillas', 'coogee', 'pitpoint', 'durango', 'ferro', 'obligation', 'energas', 'phosphate', 'eren', 'jehier', 'mahalo', 'reform', 'redivivus', 'tripoli', 'zhuhai', 'solarbridge', 'strasbourg', 'adani', 'hc', 'oyj', 'bonaparte', 'gray', 'courbevoie', 'enerkem', 'moon', 'rontec', 'aspirant', 'emulsion', 'mereenie', 'polythene', 'palmerston', 'b', 'sapa', 'woven', 'soypower', 'gunflint', 'refinement', 'iec', 'franklin', 'origin', 'limejump', 'samil', 'labo', 'pearl', 'ecooils', 'basell', 'sunseap', 'lubrilog', 'bardahl', 'dublin', 'gevo', 'frankfurt', 'engenharia',

In [9]:
new_capabilities 


{'aarhus',
 'aasta',
 'aberdeen',
 'accumulation',
 'acetyl',
 'activities',
 'acure',
 'adani',
 'adco',
 'additives',
 'afab',
 'albacete',
 'alfa',
 'alges',
 'alkalis',
 'ampere',
 'anchor',
 'andgas',
 'anglo',
 'angolan',
 'antrim',
 'apico',
 'aps',
 'aqua',
 'araxa',
 'archaea',
 'argentino',
 'arizona',
 'asahi',
 'asparuh',
 'aspirant',
 'attiki',
 'avon',
 'b',
 'bakery',
 'bangalore',
 'bangladesh',
 'bardahl',
 'basell',
 'bay',
 'beach',
 'belem',
 'belemaoil',
 'bengal',
 'beringen',
 'biobutanol',
 'bioforming',
 'biojet',
 'biorefinery',
 'blueprint',
 'bonaparte',
 'bongkot',
 'booster',
 'bottleneck',
 'boulder',
 'brake',
 'brasiliano',
 'brenplast',
 'brigham',
 'bristol',
 'bucuresti',
 'budapest',
 'bunnehaven',
 'burshane',
 'cagliari',
 'cairn',
 'calpam',
 'calysta',
 'canaport',
 'canbriam',
 'carr',
 'cascade',
 'castor',
 'catan',
 'cheshire',
 'choay',
 'chocolate',
 'cholburi',
 'chujin',
 'chuo',
 'church',
 'ciloger',
 'cipolletti',
 'cleam',
 'clearspa

# METHOD 2

In [4]:
import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load data
df_train = pd.read_excel("TrainingSet.xlsx")
df_capabilities = pd.read_excel("List of capabilities.xlsx")


# Load Spacy Model for advanced preprocessing
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Load a pre-trained Sentence Transformer model (BERT-based)
model = SentenceTransformer('all-MiniLM-L6-v2')

def preprocess_text(doc):
    # Add or remove custom stop words
    custom_stop_words = ['example', 'business', 'company']  # Customize this list
    return [token.lemma_.lower() for token in doc if token.is_alpha and not token.is_stop and token.text.lower() not in custom_stop_words]

# Process text data
df_train['tokens'] = df_train['Target Business Description'].apply(nlp).apply(preprocess_text)
df_capabilities['tokens'] = df_capabilities['Technologies/Skills'].astype(str).apply(nlp).apply(preprocess_text)

# Create embeddings for each set of tokens
capability_embeddings = np.array([model.encode(' '.join(tokens)) for tokens in df_capabilities['tokens'] if tokens])
train_embeddings = [model.encode(' '.join(tokens)) for tokens in df_train['tokens'] if tokens]

# Identifying new capabilities
new_capabilities = set()
similarity_threshold = 0.5  # Adjust as necessary

for idx, emb in enumerate(train_embeddings):
    sims = cosine_similarity([emb], capability_embeddings)
    if all(sim < similarity_threshold for sim in sims[0]):
        # Add all unique tokens from the description as potential capabilities
        new_capabilities.update(df_train.iloc[idx]['tokens'])

print("New capabilities to consider adding:", new_capabilities)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

New capabilities to consider adding: {'alpha', 'seismic', 'industrial', 'lac', 'portfolio', 'paso', 'metabolite', 'preparation', 'rome', 'rare', 'chain', 'thousand', 'guallarauco', 'republique', 'pallet', 'kkd', 'glue', 'hive', 'found', 'bank', 'atlas', 'danske', 'tranzen', 'drill', 'aarhus', 'cristina', 'investment', 'gerber', 'austrian', 'inorganic', 'corporation', 'dress', 'dd', 'polystyrol', 'morrovalle', 'manali', 'ldpe', 'oil', 'clichy', 'labeling', 'liability', 'scudder', 'domestic', 'tesoro', 'prncipe', 'chujin', 'lloydminster', 'szoftverfejleszto', 'sagaz', 'olaya', 'exist', 'sensing', 'moscow', 'counter', 'preservation', 'salvo', 'brooklyn', 'alabama', 'ingenieros', 'slit', 'fukui', 'cekomastik', 'kigali', 'pta', 'united', 'tube', 'polska', 'liquefied', 'sensitive', 'ornamental', 'polybutylene', 'vacuum', 'tuk', 'everyday', 'temperature', 'agroindustria', 'weather', 'government', 'physics', 'onda', 'combine', 'alta', 'special', 'ecoplac', 'way', 'zhejiang', 'pakistan', 'reven

# METHOD 3

In [1]:
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np

# Load data
df_train = pd.read_excel("TrainingSet.xlsx")
df_capabilities = pd.read_excel("List of capabilities.xlsx")

# Load Spacy Model with NER
nlp = spacy.load('en_core_web_sm')

# Load a pre-trained Sentence Transformer model (BERT-based)
model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_phrases(text):
    doc = nlp(text)
    # Extract N-grams and named entities
    phrases = set([token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha])
    phrases.update([" ".join(token.lemma_.lower() for token in entity) for entity in doc.ents])  # Named entities
    n_grams = [doc[i:i+3].text.lower() for i in range(len(doc)-2)]  # Trigrams
    n_grams.extend([doc[i:i+2].text.lower() for i in range(len(doc)-1)])  # Bigrams
    phrases.update(n_grams)
    return list(phrases)

# Apply phrase extraction
df_train['phrases'] = df_train['Target Business Description'].apply(extract_phrases)
df_capabilities['phrases'] = df_capabilities['Technologies/Skills'].astype(str).apply(extract_phrases)

# Flatten and vectorize the capabilities
flat_capabilities = [item for sublist in df_capabilities['phrases'] for item in sublist]
capability_embeddings = np.array([model.encode(phrase) for phrase in flat_capabilities if phrase.strip()])

# Identify new capabilities
new_capabilities = set()
similarity_threshold = 0.1  # Adjust based on your results

for phrases in df_train['phrases']:
    for phrase in phrases:
        if phrase.strip():
            phrase_emb = model.encode(phrase)
            sims = cosine_similarity([phrase_emb], capability_embeddings)
            if all(sim < similarity_threshold for sim in sims[0]):
                new_capabilities.add(phrase)

print("New capabilities to consider adding:", new_capabilities)


  from tqdm.autonotebook import tqdm, trange


New capabilities to consider adding: {'dakota. the', 'global portfolio', 'mfc', 'brasilia and', 'dakota within', 'principal creditos hipotecarios', 'in kitimat', 'roncador field of', 'august 2011', 'and peroxides', 'gaz', 'in angola', 'pallonji infrastructure capital', '220,000 barrel', 'americas inc', '(wilga park', 'in athens,', 'resinas sl is', 'enerji sanayi ve', 'located in bogota', 'fibers brand names', 'and two breweries', ', home', 'led sl is', 'omv ag', 'caarapo ltda', 'hindustan', 'portugal, provides', 'company produces acrylonitrile', 'to 1,200 mm', 'blocks included block', 'located in ijmuiden', 'in saskatchewan', 'by equinor', 'in libya', 'renouvelables sarl is', 'bv will', 'venture between mitsubishi', 'asia) pcl', 'valdesolar hive sl', 'in palo', 'portfolio includes such', ', 1954', 'of equinor brasil', 'pune,', 'gerber products', 'founded on march', 'brasiliano distribuidora sa', 'michigan, us', 'located in accra', 'pcl, located', 'bac thor', 'publisher. it', '15th conv

In [2]:
new_capabilities

{'dakota. the',
 'global portfolio',
 'mfc',
 'brasilia and',
 'dakota within',
 'principal creditos hipotecarios',
 'in kitimat',
 'roncador field of',
 'august 2011',
 'and peroxides',
 'gaz',
 'in angola',
 'pallonji infrastructure capital',
 '220,000 barrel',
 'americas inc',
 '(wilga park',
 'in athens,',
 'resinas sl is',
 'enerji sanayi ve',
 'located in bogota',
 'fibers brand names',
 'and two breweries',
 ', home',
 'led sl is',
 'omv ag',
 'caarapo ltda',
 'hindustan',
 'portugal, provides',
 'company produces acrylonitrile',
 'to 1,200 mm',
 'blocks included block',
 'located in ijmuiden',
 'in saskatchewan',
 'by equinor',
 'in libya',
 'renouvelables sarl is',
 'bv will',
 'venture between mitsubishi',
 'asia) pcl',
 'valdesolar hive sl',
 'in palo',
 'portfolio includes such',
 ', 1954',
 'of equinor brasil',
 'pune,',
 'gerber products',
 'founded on march',
 'brasiliano distribuidora sa',
 'michigan, us',
 'located in accra',
 'pcl, located',
 'bac thor',
 'publisher. 

# METHOD 4

In [3]:
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm  # Import tqdm for progress bars

# Load data
df_train = pd.read_excel("TrainingSet.xlsx")
df_capabilities = pd.read_excel("List of capabilities.xlsx")

# Load Spacy Model with NER
nlp = spacy.load('en_core_web_sm')

# Load a pre-trained Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_phrases(text):
    doc = nlp(text)
    # Generate meaningful n-grams and specific named entities
    phrases = set()
    for ent in doc.ents:  # Use named entities
        if ent.label_ in ['ORG', 'PRODUCT', 'GPE', 'LOC']:  # Filter for specific entity types
            phrases.add(ent.text.lower())

    # Generate n-grams filtering out stop words and punctuation
    n_grams = [" ".join([token.lemma_ for token in doc[i:i+3] if token.is_alpha and not token.is_stop]) for i in range(len(doc)-2)]
    n_grams.extend([" ".join([token.lemma_ for token in doc[i:i+2] if token.is_alpha and not token.is_stop]) for i in range(len(doc)-1)])
    phrases.update(filter(None, n_grams))  # Filter empty strings

    return list(phrases)

# Process text data
df_train['phrases'] = df_train['Target Business Description'].apply(extract_phrases)
df_capabilities['phrases'] = df_capabilities['Technologies/Skills'].astype(str).apply(extract_phrases)

# Flatten and vectorize capabilities
flat_capabilities = [item for sublist in df_capabilities['phrases'] for item in sublist]
capability_embeddings = np.array([model.encode(phrase) for phrase in tqdm(flat_capabilities, desc='Encoding Capabilities') if phrase.strip()])

# Identify new capabilities
new_capabilities = set()
similarity_threshold = 0.2  # Lower threshold to be more selective

for phrases in tqdm(df_train['phrases'], desc='Comparing Phrases'):
    for phrase in phrases:
        if phrase.strip():
            phrase_emb = model.encode(phrase)
            sims = cosine_similarity([phrase_emb], capability_embeddings)
            if all(sim < similarity_threshold for sim in sims[0]):
                new_capabilities.add(phrase)

print("New capabilities to consider adding:", new_capabilities)


Encoding Capabilities: 100%|██████████| 692/692 [00:11<00:00, 62.13it/s]
Comparing Phrases: 100%|██████████| 2532/2532 [24:59<00:00,  1.69it/s]

New capabilities to consider adding: {'Chipmunk', 'Montpellier', 'Kentucky Mississippi', 'Midwestern United States', 'Kentucky New', 'Holding Germany II', 'health education', 'Minnetonka', 'Offloading Florence', 'provide online information', 'China National', 'locate Peru', 'Cruz del', 'Dynachisso Thai', 'Poland locate', 'block Kosmos', 'Light Gold', 'XIX Neptun', 'Zierbena Bizkaia', 'Budapest', 'locate France', 'toreador hungary ltd', 'locate Sofia', 'Minnetonka Minnesota', 'southern United States', 'brand include San', 'locate Feldkirch', 'treasury banking', 'locate Louisville', 'West Australia', 'Cruz De La', 'Settimo Torinese', 'Nagaoka Karyoku', 'Doha Qatar', 'select asset', 'border Norway', 'locate Mississauga', 'Bagnolet France', 'Montney northeast', 'Romans France', 'Brazil Azerbaijan', 'locate Frankfurt', 'Mendoza Argentina', 'Papua Indonesia', 'Associati Societa di', 'Block Romania', 'locate Centennial', 'comprise Vibo', 'Holding BVBA', 'Seri Begawan', 'Midwest', 'des Mille',




In [4]:
new_capabilities

{'Chipmunk',
 'Montpellier',
 'Kentucky Mississippi',
 'Midwestern United States',
 'Kentucky New',
 'Holding Germany II',
 'health education',
 'Minnetonka',
 'Offloading Florence',
 'provide online information',
 'China National',
 'locate Peru',
 'Cruz del',
 'Dynachisso Thai',
 'Poland locate',
 'block Kosmos',
 'Light Gold',
 'XIX Neptun',
 'Zierbena Bizkaia',
 'Budapest',
 'locate France',
 'toreador hungary ltd',
 'locate Sofia',
 'Minnetonka Minnesota',
 'southern United States',
 'brand include San',
 'locate Feldkirch',
 'treasury banking',
 'locate Louisville',
 'West Australia',
 'Cruz De La',
 'Settimo Torinese',
 'Nagaoka Karyoku',
 'Doha Qatar',
 'select asset',
 'border Norway',
 'locate Mississauga',
 'Bagnolet France',
 'Montney northeast',
 'Romans France',
 'Brazil Azerbaijan',
 'locate Frankfurt',
 'Mendoza Argentina',
 'Papua Indonesia',
 'Associati Societa di',
 'Block Romania',
 'locate Centennial',
 'comprise Vibo',
 'Holding BVBA',
 'Seri Begawan',
 'Midwest',

# METHOD 5

In [5]:
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm

# Load data
df_train = pd.read_excel("TrainingSet.xlsx")
df_capabilities = pd.read_excel("List of capabilities.xlsx")

# Load Spacy Model with NER
nlp = spacy.load('en_core_web_sm')

# Load a pre-trained Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_phrases(text):
    doc = nlp(text)
    phrases = set()
    for ent in doc.ents:  # Focus on entities typically indicative of capabilities
        if ent.label_ in ['PRODUCT', 'ORG']:
            phrases.add(ent.text.lower())

    # Generate n-grams and filter non-capability words
    allowed_pos_types = {'NOUN', 'PROPN', 'ADJ'}
    n_grams = [" ".join([token.lemma_ for token in doc[i:i+3] if token.pos_ in allowed_pos_types and not token.is_stop]) for i in range(len(doc)-2)]
    n_grams.extend([" ".join([token.lemma_ for token in doc[i:i+2] if token.pos_ in allowed_pos_types and not token.is_stop]) for i in range(len(doc)-1)])
    phrases.update(filter(None, n_grams))  # Filter empty strings
    return list(phrases)

df_train['phrases'] = df_train['Target Business Description'].apply(extract_phrases)
df_capabilities['phrases'] = df_capabilities['Technologies/Skills'].astype(str).apply(extract_phrases)

flat_capabilities = [item for sublist in df_capabilities['phrases'] for item in sublist]
capability_embeddings = np.array([model.encode(phrase) for phrase in tqdm(flat_capabilities, desc='Encoding Capabilities') if phrase.strip()])

new_capabilities = set()
similarity_threshold = 0.3  # Lower threshold to be more selective

for phrases in tqdm(df_train['phrases'], desc='Comparing Phrases'):
    for phrase in phrases:
        if phrase.strip():
            phrase_emb = model.encode(phrase)
            sims = cosine_similarity([phrase_emb], capability_embeddings)
            if all(sim < similarity_threshold for sim in sims[0]):
                new_capabilities.add(phrase)

print("New capabilities to consider adding:", new_capabilities)


Encoding Capabilities: 100%|██████████| 685/685 [00:09<00:00, 71.20it/s]
Comparing Phrases: 100%|██████████| 2532/2532 [18:42<00:00,  2.26it/s]

New capabilities to consider adding: {'Chipmunk', 'Kentucky New', 'Kentucky Mississippi', 'Midwestern United States', 'Holding Germany II', 'health education', 'Minnetonka', 'Offloading Florence', 'China National', 'Sur field', 'Cruz del', 'Dynachisso Thai', 'block Kosmos', 'XIX Neptun', 'Zierbena Bizkaia', 'Budapest', 'toreador hungary ltd', 'southern United States', 'treasury banking', 'West Australia', 'Cruz De La', 'Settimo Torinese', 'Nagaoka Karyoku', 'Doha Qatar', 'border Norway', 'Bagnolet France', 'Montney northeast', 'Romans France', 'Brazil Azerbaijan', 'Mendoza Argentina', 'Papua Indonesia', 'Associati Societa di', 'Block Romania', 'the fuerte sur', 'Holding BVBA', 'Seri Begawan', 'Midwest', 'des Mille', 'Ibereolica Renovables', 'Chernichny block', 'Uttar Pradesh', 'Toreador Hungary Ltd', 'Participacoes Sociais SA', 'portfolio Isemaren', 'Netherlands Norway', 'Aparecida Taboado', 'Vadodara Gujarat', 'Reading UK', 'burglary insurance', 'Anglo Siberian', 'Beringen', '12/3/257




# METHOD 6

In [1]:
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm

# Load Spacy Model with technical terms awareness (if available)
nlp = spacy.load('en_core_web_sm')

# Load a pre-trained Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Custom function to filter out terms
def filter_terms(phrase_list):
    technical_terms = {'rigs', 'drilling', 'logging', 'simulation', 'seismic', 'surveys', 'management', 'engineering', 'production'}
    filtered = [phrase for phrase in phrase_list if any(term in phrase for term in technical_terms)]
    return filtered

# Function to extract phrases based on observed patterns in the capabilities list
def extract_phrases(text):
    doc = nlp(text)
    phrases = set()
    # Extract phrases based on part-of-speech tagging and named entities
    for chunk in doc.noun_chunks:
        if len(chunk.text.split()) <= 3:  # Limiting chunk length to filter out longer non-relevant phrases
            phrases.add(chunk.text.lower())

    # Filtering out non-technical terms
    phrases = filter_terms(list(phrases))
    return phrases

# Load data
df_train = pd.read_excel("TrainingSet.xlsx")
df_capabilities = pd.read_excel("List of capabilities.xlsx")

# Extract phrases from existing capabilities
df_capabilities['phrases'] = df_capabilities['Technologies/Skills'].astype(str).apply(extract_phrases)

# Flatten all capabilities into a single list and encode them
flat_capabilities = [item for sublist in df_capabilities['phrases'] for item in sublist]
capability_embeddings = np.array([model.encode(phrase) for phrase in tqdm(flat_capabilities, desc='Encoding Capabilities') if phrase.strip()])

# Process the training set descriptions
df_train['phrases'] = df_train['Target Business Description'].apply(extract_phrases)

# Identify new capabilities
new_capabilities = set()
similarity_threshold = 0.8  # Adjust based on sensitivity needs

for phrases in tqdm(df_train['phrases'], desc='Comparing Phrases'):
    for phrase in phrases:
        if phrase.strip():
            phrase_emb = model.encode(phrase)
            sims = cosine_similarity([phrase_emb], capability_embeddings)
            if all(sim < similarity_threshold for sim in sims[0]):
                new_capabilities.add(phrase)

print("New capabilities to consider adding:", new_capabilities)

  from tqdm.autonotebook import tqdm, trange
Encoding Capabilities: 100%|██████████| 58/58 [00:01<00:00, 30.36it/s]
Comparing Phrases: 100%|██████████| 2532/2532 [00:11<00:00, 211.08it/s]

New capabilities to consider adding: {'power production', 'chlorine production facility', 'oil drilling equipment', 'well drilling services', 'petroleum production', '2017 forecast production', 'marine seismic data', 'facility management service', 'exploratory drilling', 'early production facilities', 'management', 'primary production', 'retail gasoline;petroleum production', '1994and first production', 'production licence pl093', 'productions', 'long-term management', 'production romania ltd', 'integrated petrochemical production', 'investment management services', 'production optimization', 'drilling programs', 'pipeline engineering company', 'retail petroleum production', 'a combined production', 'engineering services', 'construction management', 'increasing production', 'a daily production', 'electricity production facilities', 'the management', 'production engineering', 'bioengineering', 'order management', 'two production mines', 'genetic engineering software', 'engineering co lt




In [2]:
len(new_capabilities)

75

In [4]:
# save new capabilities to a excel file
df_new_capabilities = pd.DataFrame(list(new_capabilities), columns=['New Capabilities'])
df_new_capabilities.to_excel('/Users/aaryanshah/Oncampus-Job/NLP_Gal/iterations/iteration4.xlsx', index=False)