In [None]:
#'''
# **************************************************************************************************************** #
#*****************************************  IDB - AUG Data Analytics  ******************************************** #
# **************************************************************************************************************** #
#
#-- Notebook Number: 04.1 - Clustering Analysis English - biased (workpaper)
#-- Title: Digital Transformation Advisory
#-- Audit Segment: 
#-- Continuous Auditing: Yes
#-- System(s): joblib file
#-- Description:  
#                - Clustering analysis on Loans and TCs documents in English
#                
#                
#                
#
#-- @authors:  Emiliano Colina <emilianoco@iadb.org>
#-- Version:  0.8
#-- Last Update: 02/09/2021
#-- Last Revision Date: 10/20/2020 - Emiliano Colina <emilianoco@iadb.org> 
#                                    

# **************************************************************************************************************** #
#'''

In [None]:
# Widen the notebook container to 90% of the page width.
# `IPython.display` is the public import location for `display`/`HTML`;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [None]:
import os
import joblib
import re, numpy as np, pandas as pd
from pprint import pprint

import pickle

In [None]:
# Set working directory.
# The project root defaults to the original author's local folder, but can be overridden
# with the DT_MAIN_DIR environment variable so the notebook also runs on other machines.
main_dir = os.environ.get("DT_MAIN_DIR", "C:\\Users\\emilianoco\\Desktop\\2020")
data_dir = "/Digital_Transformation"


os.chdir(main_dir + data_dir) # working directory set
print('Working folder set to: ' + os.getcwd()) # working directory check

In [None]:
data_dir

### Load the Data

#### English TCs and Loans containing texts lemmatized:

In [None]:
# load English documents: TCs and Loans
# NOTE(review): joblib.load unpickles the file content; only load files from a trusted source.
df_base = joblib.load('./output/nlp_df_result_tokens_terms_2021-01-22_english_v1.2.joblib.bz2')

In [None]:
df_base.head()

In [None]:
# One lemmatized token list per operation document, in df_base row order.
data_lemmatized = df_base['alt2_data_lemmatized'].tolist()

In [None]:
#pprint(df_base[df_base.OPERATION_NUMBER == 'NI-T1268']['extracted'])
#df_base['extracted'][470]
df_base['alt2_terms'][470]

#### Specialized documents:

In [None]:
# load Specialized documents 
# NOTE(review): joblib.load unpickles the file content; only load files from a trusted source.
df_specialized = joblib.load('./output/nlp_spec_docs_2021-02-08_english_v12_final.joblib.bz2')

In [None]:
df_specialized

In [None]:
# update 02/11: after several runs, the "dig_paper" is removed from the list of specialized documents
# NOTE(review): this drops whatever row is currently last, in place; re-running the cell
# without reloading df_specialized removes one more row each time.
df_specialized.drop(df_specialized.tail(1).index,inplace=True) # drop last row

In [None]:
# Additional clean-up since some tokens and terms were flagged to be removed when doing the clustering analysis:
flagged_terms = ['-PRON-', '_', 'aadt', 'aaf', 'aaps', 'aastaraamatu', 'ababa', 'abac', 'abbreviation', 'abc', 'abd', \
                 'abovementioned', 'aforementioned', 'abraham', 'abrams', 'abs', 'zzz', 'ºc', 'õppetunnid', 'ˇthi', 'μs', '_recommendation', \
                 '→_insufficient', '→_insufficient', '→_weak', '→_weak', 'resolution_→', 'resolution_→', '→_weak_strategic_integral_management', '→_weak_strategic_integral_management', '→_lack', '→_lack', 'sector_→', 'sector_→', 'sector_→', 'private_sector_→', 'private_sector_→', 'private_sector_→', '→_weak_strategic_integral', '→_weak_strategic_integral', '→_lack_of_specialist', '→_lack_of_specialist', '→_weak_strategic', '→_weak_strategic', \
                 'framework_©_customer', 'framework_©_customer', 'framework_©', 'framework_©', 'framework_©', 'digital_transformation_framework_©_customer_insight', 'digital_transformation_framework_©_customer_insight', 'transformation_framework_©_customer_insight', 'transformation_framework_©_customer_insight', 'digital_transformation_framework_©', 'digital_transformation_framework_©', 'digital_transformation_framework_©', 'framework_©_customer_insight', 'framework_©_customer_insight', 'use_digital_transformation_framework_©', 'use_digital_transformation_framework_©', 'transformation_framework_©', 'transformation_framework_©', 'transformation_framework_©', '©_customer_insight', '©_customer_insight', 'digital_transformation_framework_©_customer', 'digital_transformation_framework_©_customer', 'transformation_framework_©_customer_insight_customer', 'transformation_framework_©_customer_insight_customer', 'framework_©_customer_insight_customer', 'framework_©_customer_insight_customer', 'transformation_framework_©_customer', 'transformation_framework_©_customer', '©_customer', '©_customer', '©_customer_insight_customer', '©_customer_insight_customer', 'use_digital_transformation_framework_©_customer', 'use_digital_transformation_framework_©_customer', \
                 'μs', 'offset_of_μs']


# Remove unwanted vocabulary from every specialized document in a single pass.
# Two sources of terms are dropped:
#   * flagged_terms - the list defined just above;
#   * to_remove     - additional terms identified later in the analysis.
# `to_remove` has no definition in any earlier visible cell, so a fresh
# "Restart & Run All" would stop here with a NameError. Fall back to an empty list
# (with a visible warning) so the notebook still runs top to bottom.
try:
    extra_terms = to_remove
except NameError:
    print("Warning: 'to_remove' is not defined yet; only flagged_terms are filtered out.")
    extra_terms = []

# A set gives constant-time membership tests; duplicates in the source lists are irrelevant.
# (assumes the terms and tokens are strings - TODO confirm)
terms_to_drop = set(flagged_terms) | set(extra_terms)

# Token order inside each document is preserved; only the unwanted terms are left out.
df_specialized['alt2_data_lemmatized'] = df_specialized['alt2_data_lemmatized'].apply(
    lambda tokens: [word for word in tokens if word not in terms_to_drop]
)
    

In [None]:
# One cleaned token list per specialized document, in df_specialized row order.
specialized_docs = df_specialized['alt2_data_lemmatized'].tolist()

In [None]:
#####

In [None]:
# Innovation
innovacion_list = ['3-d_printing', '3d_print', '3d_printing', '4ir', '4ri', '5_g', '5g', 'adoption_artificial_intelligence_solution', 'ai', 'analytical_product_generation', 'analytical_tool', 'applied_blockchain', 'artificial_intelligence', 'artificial_intelligence_adoption', 'artificial_intelligence_development', 'artificial_intelligence_solution', 'artificial_intelligence_technology', 'augmented_reality', 'autonomous_car', 'autonomous_vehicle', 'autonomous_vehicles', 'big_data', 'big_tool_development', 'bigdata', 'bim', 'biotechnology_company', 'block_chain', 'blockchain', 'blockchain_pilot', 'blockchain_pilot_project', 'blockchain_technology', 'bot', 'building_information_modeling', 'business_digitization_process', 'business_intelligence_tool', 'captures_satellite_information', 'chatbot', 'cloud_computing', 'cloud_digital_signature', 'cloud_scalability', 'communication_network_infrastructure', 'computational_science', 'connected_vehicles', 'critical_network_infrastructure', 'cryptocurrency', 'data_analytical_tool', 'data_governance', 'data_government', 'data_science', 'development_innovation', 'development_satellite_technology', 'digital_affidavit', 'digital_agenda', 'digital_broker', 'digital_business', 'digital_challenge', 'digital_commerce', 'digital_company', 'digital_company_development_support', 'digital_control', 'digital_economy', 'digital_economy_regulation', 'digital_ecosystem', 'digital_employment', 'digital_entrepreneurship', 'digital_environment', 'digital_governance', 'digital_government_reform', 'digital_hub', 'digital_identification', 'digital_identity', 'digital_identity_development', 'digital_identity_system_implementation', 'digital_infrastructure', 'digital_innovation_hub', 'digital_inspection_process', 'digital_medical_record', 'digital_procedure', 'digital_reform', 'digital_reform_public_administration', 'digital_sale', 'digital_signature', 'digital_signature_implementation', 'digital_single_window', 'digital_strategy', 
'digital_transformation', 'digital_transformation_process', 'digital_transformation_roadmaps', 'digital_transformation_strategy', 'digital_visibility', 'disruptive_technology', 'drone', 'drone_airspace', 'drone_incorporation', 'drone_integration', 'drone_regulation', 'e_-_government', 'e_-_learning', 'e-commerce', 'e-commerce_development', 'e-government', 'e-learning', 'education_digital_transformation_process', 'electronic_government', 'electronic_medical_record', 'electronic_signature', 'emerging_digital_technology', 'emerging_technology', 'fintech', 'geospatial_data_base', 'ia', 'innovation', 'innovative_public_procurement', 'internet_thing', 'internet_things', 'internet_thing', 'internet_things', 'iot', 'machine_learning', 'micro_computer', 'nanotechnology', 'network_infrastructure', 'open_government_data', 'outcome-driven_innovation', 'predictive_model', 'predictive_model_development', 'print_3d', 'quantum_computing', 'robot_process_automation', 'robotic_process_automation', 'rpa', 'satelite', 'satellite_control', 'satellite_data', 'satellite_information_user', 'satellite_monitoring', 'satellite_origin_information', 'satellite_technology', 'satellites', 'sector_digital_transformation_support', 'smart_citie', 'smart_citie_implementation', 'smart_cities', 'smart_city', 'smart_contract', 'smart_contract_applied', 'smart_system', 'solar_energy_system', 'spatial_data_infrastructure', 'support_business_digital_transformation', 'technological_innovation', 'use_geospatial_data', 'virtual_reality']

# Sort the innovation terms alphabetically (duplicates are kept) and display the result.
innovacion_list = sorted(innovacion_list)
innovacion_list

In [None]:
sistemas_list = ['it_infrastructure', 'it_solution', 'it_support_system_strengthening', 'it_tool_implementation', 'informatic_team', 'web_application', 'web_platform', 'app_development', 'app_development', 'application_design', 'automated_information_system', 'automated_monitoring_system', 'cell_phone_data', 'communication_network', 'communication_protocol', 'communication_software', 'computer_application', 'computer_application', 'computer_application', 'computer_equipment', 'computer_program', 'computer_program_development', 'computer_support', 'computer_system_automation', 'computer_system_implementation', 'computer_system_modernization', 'computer_tool', 'computerized_process', 'computing_solution_capacity', 'core_application', 'data_volume', 'database', 'data_center', 'device_app', 'digital_application', 'digital_platform_design', 'digital_transformation_design', 'electronic_monitoring_system', 'electronic_system', 'hardware_structure', 'health_information_system', 'health_information_system', 'human_resource_information_system', 'human_resource_software', 'implementation_management_computer_system', 'implementation_monitoring_system', 'industry_control_system', 'information_management_platform', 'information_system', 'information_system_design', 'information_system_development', 'information_system_security', 'information_system_support', 'information_technology', 'information_technology_system', 'integrated_information_system', 'interoperable_digital_service', 'inventory_information_system', 'massive_data_display', 'micro_computer', 'mission_critical_site', 'mobile_app', 'mobile_application', 'mobile_device_application', 'mobile_internet_access_service', 'mobile_phone_data', 'mobile_solution_software', 'mobile_telecommunication', 'monitoring_center', 'monitoring_system_design', 'network_connected', 'network_connection', 'network_connectivity', 'network_infrastructure', 'network_resilience', 'network_security', 'network_solution', 
'off-the-shelf_computer_system', 'required_hardware', 'safe_zone', 'scada', 'secure_information_system', 'software_design', 'software_development', 'software_package', 'software_platform', 'software_type', 'software_type_intervention', 'strengthening_monitoring_system', 'telecommunication_service', 'transversal_computer_system', 'virtual_private_network', 'virtual_site', 'web_platform_development']
# De-duplicate the systems terms. Sorting the set gives a deterministic order
# (a bare list(set(...)) depends on string hashing and can change between kernel sessions)
# and keeps this list consistent with the sorted innovation list.
sistemas_list = sorted(set(sistemas_list))

In [None]:
# Two-row frame holding the curated term lists: row 0 = innovation terms, row 1 = systems terms.
# Built in one step instead of enlarging an empty DataFrame cell by cell.
df_innovation = pd.DataFrame({
    'Short_Name': ['innovation', 'sistemas'],
    'data_lemmatized': [innovacion_list, sistemas_list],
})
df_innovation

# ***********************************************************************************************
<br>
<br>
<br>

# Initial analysis

In [None]:
# Sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Vectorization

In [None]:
# Data is already tokenized in a custom way, so a dummy function is built in order to pass along what it receives
# Source: http://www.davidsbatista.net/blog/2018/02/28/TfidfVectorizer/

def dummy_fun(doc):
    """Pass-through used as tokenizer/preprocessor: return the document unchanged.

    The documents handed to the vectorizers are already tokenized, so this function
    simply gives back the very object it receives.
    """
    passthrough = doc
    return passthrough

### CountVectorizer (without specialized documents)

In [None]:
# Count-based vectorizer for the pre-tokenized documents: tokenizer and preprocessor are the
# pass-through function, so every document is consumed as the token list it already is.
vec_count = CountVectorizer(analyzer='word', \
                      max_df=0.60, min_df=3, # cut terms that appear in more than 60% of the documents or in fewer than 3 documents \
                      tokenizer=dummy_fun, \
                      preprocessor=dummy_fun, \
                      token_pattern=None, \
                      max_features=None) 

In [None]:
# Fit the count vectorizer on the operation documents and expose the counts as a DataFrame
# (one row per document, one column per retained term).
matrix_countvectorizer = vec_count.fit_transform(data_lemmatized)
# scikit-learn >= 1.0 provides get_feature_names_out(); get_feature_names() was removed in 1.2.
if hasattr(vec_count, 'get_feature_names_out'):
    count_terms = list(vec_count.get_feature_names_out())
else:
    count_terms = vec_count.get_feature_names()
matriz_count_vect = pd.DataFrame(matrix_countvectorizer.toarray(), columns=count_terms)

In [None]:
matriz_count_vect.shape

In [None]:
matriz_count_vect.head()

In [None]:
matriz_count_vect.columns[-100:]

In [None]:
# Operations whose document has a non-zero count for the term 'cybersecurity'.
cyber_rows = matriz_count_vect.index[matriz_count_vect['cybersecurity'] != 0]
df_base.iloc[cyber_rows]

In [None]:
# Print every retained vocabulary term that contains 'cyber'.
for cyber_term in (name for name in matriz_count_vect.columns if 'cyber' in name):
    print(cyber_term)

In [None]:
# Print every retained vocabulary term that contains 'software'.
for software_term in (name for name in matriz_count_vect.columns if 'software' in name):
    print(software_term)

In [None]:
# Print every retained vocabulary term that contains 'hydro'.
for hydro_term in (name for name in matriz_count_vect.columns if 'hydro' in name):
    print(hydro_term)

Resultado: sin agregar la "base de conocimiento", mediante CountVectorizer, el contenido de interés se pierde al realizar el filtrado.
<br>
<br>

## **************************************************  **************************************************  **************************************************
<br>
<br>

### TfidfVectorizer

In [None]:
# TF-IDF vectorizer parameters. Documents arrive as token lists, so tokenizing and
# preprocessing are delegated to the pass-through function.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    use_idf=True,
    max_df=0.6,
    min_df=3,
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
)

<br>
<br>

### Including 'knowledge base' by adding specialized documents:

In [None]:
df_innovation

In [None]:
# 02/11:
# Full corpus = operation documents
#             + the specialized documents, included twice
#             + the innovation term list, included three times
#             + the systems term list, included once.
data_full = (
    data_lemmatized
    + specialized_docs * 2
    + [df_innovation.data_lemmatized[0]] * 3
    + [df_innovation.data_lemmatized[1]]
)


In [None]:
len(data_full)

<br>
<br>

### CountVectorizer (including specialized documents)

In [None]:
# Count-based vectorizer for the full corpus; same settings as the run without specialized
# documents (pass-through tokenizer/preprocessor on pre-tokenized input).
vec_count = CountVectorizer(analyzer='word', \
                      max_df=0.60, min_df=3, # cut terms that appear in more than 60% of the documents or in fewer than 3 documents \
                      tokenizer=dummy_fun, \
                      preprocessor=dummy_fun, \
                      token_pattern=None) 

In [None]:
# Fit the count vectorizer on the full corpus and expose the counts as a DataFrame
# (one row per document, one column per retained term).
matrix_countvectorizer_full = vec_count.fit_transform(data_full)
# scikit-learn >= 1.0 provides get_feature_names_out(); get_feature_names() was removed in 1.2.
if hasattr(vec_count, 'get_feature_names_out'):
    count_terms_full = list(vec_count.get_feature_names_out())
else:
    count_terms_full = vec_count.get_feature_names()
matriz_count_vect_full = pd.DataFrame(matrix_countvectorizer_full.toarray(), columns=count_terms_full)

In [None]:
print(matriz_count_vect_full.shape)
matriz_count_vect_full.head()

In [None]:
matriz_count_vect_full.columns[-100:]

In [None]:
# Print every term of the full-corpus vocabulary that contains 'covid'.
for covid_term in filter(lambda name: 'covid' in name, matriz_count_vect_full.columns):
    print(covid_term)

In [None]:
# Print every term of the full-corpus vocabulary that contains 'software'.
for software_term in filter(lambda name: 'software' in name, matriz_count_vect_full.columns):
    print(software_term)

In [None]:
# Print every term of the full-corpus vocabulary that contains a hyphen.
for hyphenated_term in filter(lambda name: '-' in name, matriz_count_vect_full.columns):
    print(hyphenated_term)

Resultado: Con la "base de conocimiento" incluida, el contenido de interés es enriquecido y así emerge más claramente.
<br>
<br>

### TfidfVectorizer (including specialized documents)

In [None]:
# TF-IDF vectorizer parameters.
# Terms present in more than 60% of the documents or in fewer than 3 documents are cut.
# (The encoding='latin-1' and stop_words=final_stop_words options remain disabled.)
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    use_idf=True,
    min_df=3,
    max_df=0.60,
)

### Doc-Term Matrix

In [None]:
tfidf_matrix_full = tfidf_vectorizer.fit_transform(data_full) #fit the vectorizer to data_full
# scikit-learn >= 1.0 provides get_feature_names_out(); get_feature_names() was removed in 1.2.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_terms = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_terms = tfidf_vectorizer.get_feature_names()
idf_df_full = pd.DataFrame(tfidf_matrix_full.toarray(), columns=tfidf_terms)
idf_df_full  # Doc-Term Matrix as dataframe

In [None]:
idf_df_full.sort_values(by='cybersecurity', ascending=False)['cybersecurity'].head(30)

## **************************************************  **************************************************  **************************************************
<br>
<br>

# Clustering

### Optimal cluster number

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from sklearn.cluster import KMeans

In [None]:
# X is the TF-IDF matrix of the full corpus; X_array holds the same values as a dense numpy array.
X = tfidf_matrix_full
X_array = X.toarray()

In [None]:
X.shape

##### current run: "salted" with specialized documents included multiple times

In [None]:
%%time
from sklearn import metrics
from scipy.spatial.distance import cdist

# Elbow method: for each candidate k, fit k-means on the dense matrix and record the mean
# Euclidean distance from every document to its nearest centroid (the "distortion").
K = range(10, 40)
n_docs = X.shape[0]
distortions = []
for k in K:
    print('Processing with k = ', k)
    k_means = KMeans(n_clusters=k, random_state=100).fit(X_array)
    dist_to_centroids = cdist(X_array, k_means.cluster_centers_, 'euclidean')
    nearest = np.min(dist_to_centroids, axis=1)
    distortions.append(sum(nearest) / n_docs)



In [None]:
# Elbow plot: distortion curve (blue) plus a straight reference line (red) joining the
# first and the last evaluated k. The figure is also saved as a PNG in the working folder.
X_line = [K[0], K[-1]]
Y_line = [distortions[0], distortions[-1]]

fig, ax = plt.subplots()
ax.plot(K, distortions, 'b-')
ax.plot(X_line, Y_line, 'r')
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method - optimal k')
ax.grid(True)
fig.savefig('clusters_elbow_distortion_feb09_60xciento_3min_English_final.png')
plt.show()

In [None]:
print()

### 20 Clusters (2021/02/11)

In [None]:
# We look at the 20 clusters generated by k-means:
k = 20
kmeans = KMeans(n_clusters=k, random_state=100)  # same random_state as in the elbow runs
y_fit = kmeans.fit(X)  # fit() returns the estimator, so y_fit refers to the same object as kmeans
y_pred = kmeans.predict(X)  # cluster index predicted for every row of X

clusters = kmeans.labels_.tolist()  # labels assigned during fitting, as a plain Python list

In [None]:
# squared distance from every document to each cluster center
X_dist = np.square(kmeans.transform(X))

In [None]:
## Save the KMeans model:
# A context manager guarantees the file handle is flushed and closed once the dump finishes
# (the bare open(...) passed inline was never closed explicitly).
with open("./output/clustering_kmeans_model_20clusters_2021-02-11_60xciento_3min_ENGLISH.pkl", "wb") as model_file:
    pickle.dump(kmeans, model_file)

#### 20 Clusters: Terms per cluster

In [None]:
# Print the 30 highest-weighted terms of every cluster centroid.
print("Top terms per cluster:")
# argsort is ascending; reversing each row orders the term indices by decreasing centroid weight
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
# scikit-learn >= 1.0 provides get_feature_names_out(); get_feature_names() was removed in 1.2.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = list(tfidf_vectorizer.get_feature_names_out())
else:
    terms = tfidf_vectorizer.get_feature_names()
# iterate over the fitted number of clusters instead of a hard-coded 20
for i in range(kmeans.n_clusters):
    top_30_words = [terms[ind] for ind in order_centroids[i, :30]]
    print("Cluster {}: {}".format(i, ' '.join(top_30_words)))
    print()
    print()

#### df specialized docs

In [None]:
'''
# initial run 02/09:
# adding innovation and systems related terms to keep:
data_full = data_lemmatized + specialized_docs + specialized_docs + \
            [df_innovation.data_lemmatized[0]] + [df_innovation.data_lemmatized[0]] + [df_innovation.data_lemmatized[0]] + \
            [df_innovation.data_lemmatized[1]] + [df_innovation.data_lemmatized[1]] + [df_innovation.data_lemmatized[1]]
'''

In [None]:
# Label frame for the specialized documents: df_specialized stacked twice, Short_Name
# renamed to OPERATION_NUMBER, the text/token columns listed below dropped, and a constant
# 'doc_type' column placed first.
unused_columns = ['extracted_cleaned', 'alt2_terms', 'alt2_tokens', 'alt2_data_lemmatized']
df_spec_aux = (
    pd.concat([df_specialized] * 2, ignore_index=True)
    .rename(columns={'Short_Name': 'OPERATION_NUMBER'})
    .drop(columns=unused_columns)
)
df_spec_aux.insert(loc=0, column='doc_type', value='specialized')
df_spec_aux

In [None]:
# adding innovation and a list of terms to keep (01/21/2021):
#data_full = data_lemmatized + specialized_docs + specialized_docs + [specialized_docs[1]] + [specialized_docs[2]] + \
#    [specialized_docs[4]] + [specialized_docs[7]] + \
#    [specialized_docs[1]] + [specialized_docs[4]] + [specialized_docs[7]] + \
#    [df_innovation.data_lemmatized[0]] + [df_innovation.data_lemmatized[0]] + [df_innovation.data_lemmatized[0]] + \
#    [df_innovation.data_lemmatized[1]] 

In [None]:
# innovation
# data_full holds the specialized documents twice, followed by three copies of the
# innovation term list. Derive the first innovation row from that layout instead of
# hard-coding 36, so the labels stay aligned if the number of specialized documents changes.
first_innovation_row = 2 * len(df_specialized)
for i in range(first_innovation_row, first_innovation_row + 3):
    df_spec_aux.at[i, 'doc_type'] = 'specialized'
    df_spec_aux.at[i, 'OPERATION_NUMBER'] = 'innovation'
df_spec_aux

In [None]:
# terms to keep
# The single 'sistemas' row comes after the two copies of the specialized documents and
# the three innovation rows; derive its position instead of hard-coding 39.
sistemas_row = 2 * len(df_specialized) + 3
for i in range(sistemas_row, sistemas_row + 1):
    df_spec_aux.at[i, 'doc_type'] = 'specialized'
    df_spec_aux.at[i, 'OPERATION_NUMBER'] = 'sistemas'
df_spec_aux

In [None]:
df_spec_aux.shape

In [None]:
# Stack the operation identifiers on top of the specialized-document labels, renumbering
# the rows from zero.
base_labels = df_base[['doc_type', 'OPERATION_NUMBER']]
df_all = pd.concat([base_labels, df_spec_aux], ignore_index=True)
df_all.head()

In [None]:
#############

In [None]:
# One row per document: document type, operation identifier and the cluster label
# assigned by the fitted k-means model.
results_full_20_clusters = pd.DataFrame(
    {
        'doc_type': df_all['doc_type'],
        'operation': df_all['OPERATION_NUMBER'],
        'category': kmeans.labels_,
    }
)
results_full_20_clusters

In [None]:
results_full_20_clusters.head(-40)

In [None]:
results_full_20_clusters.tail(40)

In [None]:
results_full_20_clusters.head(-40).category.value_counts()

In [None]:
# Leave out the last 40 rows first, then build the mask on that same subset. The mask is
# then aligned with the frame being filtered and pandas does not have to reindex a
# boolean key built on the full frame.
operations_20_clusters = results_full_20_clusters.head(-40)
operations_20_clusters[operations_20_clusters.category == 18]

In [None]:
##############################

In [None]:
# Top100 terms in Cluster 18 - Cyber
[terms[ind] for ind in order_centroids[18, :100]]

#### Merge operations with the specialized_docs and their labels

In [None]:
len(data_lemmatized)

In [None]:
# Model with 20 clusters selected (02/12): document type, operation identifier and
# assigned cluster for every document.
results_full = (
    df_all[['doc_type', 'OPERATION_NUMBER']]
    .rename(columns={'OPERATION_NUMBER': 'operation'})
    .assign(category=kmeans.labels_)
)
results_full


In [None]:
results_full.category.value_counts()

#### Merge clustering results with all squared distances

In [None]:
## concatenate results full using X_dist - 20 clusters (squared distance to centroid)
# X_dist is the array returned by kmeans.transform(X), squared: one row per document and
# one column per cluster. It can be wrapped directly; the zip/column_stack round trip only
# rebuilt the same array. Column names d0, d1, ... are generated from its width.
distance_columns = ['d{}'.format(j) for j in range(X_dist.shape[1])]
result_clustering = pd.concat([results_full, pd.DataFrame(X_dist, columns=distance_columns)], axis=1, sort=False)
result_clustering

## **************************************************  **************************************************  **************************************************
<br>
<br>

## Cosine Similarity

In [None]:
# Compute Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity(idf_df_full, idf_df_full))

In [None]:
%%time
# Pairwise cosine similarity between all rows of the TF-IDF doc-term frame (numpy 2D array).
similarity = cosine_similarity(idf_df_full)
print(similarity.shape)

# Wrap the matrix in a DataFrame; column labels are taken from idf_df_full's index.
similarity_df = pd.DataFrame(similarity, columns=idf_df_full.index)
similarity_df

<br>
<br>
Evaluation - selected operations vs specialized documents:

In [None]:
df_innovation

In [None]:
result_clustering.tail(40)

In [None]:
#similarity_df[[1086, 1087, 1040, 1041, 1042, 1043, 1044, 1045, 1046]].head(-29)
#df_spec_aux

In [None]:
df_innovation

In [None]:
# Columns of similarity_df follow the order of data_full:
#   operations | specialized docs | specialized docs (2nd copy) | innovation x3 | sistemas
# Derive the column positions from that layout instead of hard-coding 826..865.
n_operations = len(data_lemmatized)
n_specialized = len(df_specialized)
n_reference_rows = 2 * n_specialized + 4   # trailing non-operation rows (40 in the original run)
innovation_col = n_operations + 2 * n_specialized
sistemas_col = innovation_col + 3

# position in similarity_df -> readable name of the reference document
column_names = {n_operations + i: df_specialized.Short_Name[i] for i in range(n_specialized)}
column_names[innovation_col] = df_innovation.Short_Name[0]
column_names[sistemas_col] = df_innovation.Short_Name[1]

# keep only the operation rows and the selected reference-document columns
cosine_evaluation = similarity_df[list(column_names)].head(-n_reference_rows).copy()
cosine_evaluation.rename(columns=column_names, inplace=True)
cosine_evaluation

#### ~ ~ ~
<b> * SU-L1055</b>

In [None]:
similarity_df.iloc[df_base[df_base.OPERATION_NUMBER == 'SU-L1055'].index.values.astype(int)]

In [None]:
# NOTE(review): 611 is presumably the row position of SU-L1055 in df_base - confirm. The other
# operations in this section are looked up by OPERATION_NUMBER rather than a fixed position.
cosine_evaluation[611:612]

#### ~ ~ ~
<b> * UR-L1152</b>

In [None]:
cosine_evaluation.iloc[df_base[df_base.OPERATION_NUMBER == 'UR-L1152'].index.values.astype(int)]

#### ~ ~ ~
<b> * CH-L1142</b>

In [None]:
cosine_evaluation.iloc[df_base[df_base.OPERATION_NUMBER == 'CH-L1142'].index.values.astype(int)]

#### ~ ~ ~
<b> * RG-T3024</b>

In [None]:
cosine_evaluation.iloc[df_base[df_base.OPERATION_NUMBER == 'RG-T3024'].index.values.astype(int)]

#### ~ ~ ~
<b> * CO-T1496</b>

In [None]:
cosine_evaluation.iloc[df_base[df_base.OPERATION_NUMBER == 'CO-T1496'].index.values.astype(int)]

#### ~ ~ ~
<br>

In [None]:
cosine_evaluation.describe()

In [None]:
# Prepare results: cluster labels (all rows except the last 40) side by side with the
# similarity columns in cosine_evaluation.
operations_results = results_full.iloc[:-40]
cos_test = pd.concat([operations_results, cosine_evaluation], axis=1)
cos_test

In [None]:
cos_test['category'].value_counts()

In [None]:
# Cluster Ciberseguridad
cos_test[cos_test['category'] == 18]

In [None]:
# 
cos_test[cos_test['operation'] == 'RG-T3024']

In [None]:
# Cluster Digital
cos_test[cos_test['category'] == 19]

In [None]:
### end

## **************************************************  **************************************************  **************************************************
<br>
<br>

In [None]:
idf_df_full.head()

In [None]:
print(matriz_count_vect_full.shape)
matriz_count_vect_full.head()

## **************************************************  **************************************************  **************************************************
<br>
<br>

### Term-Doc Matrix

In [None]:
# Dense transpose of the TF-IDF matrix: terms as rows, documents as columns.
term_doc_matrix = tfidf_matrix_full.todense().T

In [None]:
# Term-document frame: one row per vocabulary term, one column per document.
# scikit-learn >= 1.0 provides get_feature_names_out(); get_feature_names() was removed in 1.2.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    term_labels = list(tfidf_vectorizer.get_feature_names_out())
else:
    term_labels = tfidf_vectorizer.get_feature_names()
df_term = pd.DataFrame(term_doc_matrix,
                  columns=idf_df_full.index.to_list(),
                  index=term_labels)
df_term

In [None]:
### Takes forever to run the following cell. A dimension reduction might be needed

In [None]:
# Document-to-document cosine similarity, computed from the TF-IDF doc-term frame.
similarity = cosine_similarity(idf_df_full)
print(similarity.shape)

# DataFrame view of the matrix; columns are labelled with idf_df_full's index.
doc_labels = idf_df_full.index
similarity_df = pd.DataFrame(similarity, columns=doc_labels)
similarity_df

In [None]:
idf_df_full

In [None]:
%%time
# Compute the term-to-term similarity matrix (a numpy 2D array).
# The rows of df_term are the columns of tfidf_matrix_full, so the sparse transpose is
# passed to cosine_similarity directly. This keeps the normalisation and the matrix product
# on sparse data instead of a dense DataFrame; the returned matrix is still dense.
similarity_term = cosine_similarity(tfidf_matrix_full.T)
print(similarity_term.shape)


In [None]:
%%time
# Label both axes of the term similarity matrix with the vocabulary terms.
term_index = df_term.index
similarity_term_df = pd.DataFrame(similarity_term, index=term_index, columns=term_index)

similarity_term_df

In [None]:
similarity_term_df.head(10)

In [None]:
# end

## **************************************************  **************************************************  **************************************************
<br>
<br>

### Terms selection

In [None]:
# Digital - Cluster 19
# Cybersecurity - Cluster 18
# Innovation - Cluster 15

In [None]:
# Top terms in Cluster 'Cybersecurity' : 18
# NOTE(review): the variable name says 400, but the slice below keeps the first 1000 entries
# of the cluster's centroid ranking - confirm which size is intended.
top_400_words_ciber = [terms[ind] for ind in order_centroids[18, :1000]]
#print(top_400_words_ciber)

In [None]:
top_400_words_ciber

In [None]:
# Terms kept after manually cleaning up the previous list (top terms of the 'Cybersecurity' cluster).
# FIX: a comma was missing between 'contingency' and 'contingency_plan'; Python silently joined the
# two adjacent literals into the single bogus term 'contingencycontingency_plan'. They are now two
# separate entries. (words_to_remove still lists the bogus term; that entry simply no longer matches.)
lista_final = ['cybersecurity', 'security', 'cyber', 'attack', 'device', 'threat', 'source', 'network', 'asset', 'action', 'control', 'incident', 'privacy', 'controls', 'information_system', 'scada', 'software', 'cybercrime', 'information_share', 'cyberspace', 'authentication', 'inventory', 'user', 'estonia', 'protocol', 'malicious', 'internet', 'cryptographic', 'server', 'national_cybersecurity', 'communication', 'vulnerability', 'cybersecurity_risk', 'scada_system', 'computer', 'medical_device', 'access_control', 'control_system', 'cybersecurity_strategy', 'capability', 'identity', 'risk_management', 'critical_infrastructure', 'digital', 'security_controls', 'firewall', 'information_security', 'configuration', 'defence', 'defense', 'application', 'cloud', 'response', 'encryption', 'national_cybersecurity_strategy', 'confidentiality', 'hardware', 'cyber_incident', 'code', 'awareness', 'packet', 'gps', 'password', 'security_measure', 'intelligence', 'asset_inventory', 'unauthorized', 'firmware', 'detection', 'environment', 'control_network', 'ip', 'media', 'disclosure', 'incident_response', 'ict', 'privacy_controls', 'trust', 'supply_chain', 'system_component', 'grid', 'resilience', 'failure', 'attack_surface', 'computing', 'risk_assessment', 'law_enforcement', 'national_security', 'attacker', 'estonian', 'cyberthreat', 'architecture', 'availability', 'wireless', 'controller', 'machine', 'industrial_control', 'technical_security_controls', 'property', 'controls_information_system', 'controls_information', 'cyber_threat', 'authorization', 'industrial_control_system', 'technical_security', 'cloud_computing', 'pii', 'identifiable_information', 'patch', 'denial', 'service_provider', 'message', 'command', 'available', 'plc', 'pmu', 'supplier', 'investigation', 'cybersecurity_policy', 'cyberattack', 'enforcement', 'cybersecurity_activity', 'identification', 'sensor', 'legislation', 'interface', 'malware', 'algorithm', 'distribution', 'supply', 'cert', 'criminal', 
'security_policy', 'intrusion', 'clinical_information_system', 'online', 'version', 'transmission', 'share', 'person', 'clinical_information', 'manual', 'biometric', 'threat_intelligence', 'control_enhancement', 'prevention', 'corporate_network', 'automation', 'damage', 'cyber_defence', 'cloud_service', 'data_protection', 'mobile', 'power_grid', 'public_key', 'tactic', 'threat_actor', 'storage', 'nato', 'network_traffic', 'safety', 'system_security', 'likelihood', 'spoofing', 'csirt', 'defense', 'information_protection', 'router', 'penetration', 'advanced', 'cybersecurity_requirement', 'signal', 'patient', 'attack_scenario', 'crime', 'jam', 'functionality', 'security_requirement', 'assurance', 'clock', 'test', 'compromise', 'community', 'redundancy', 'ml', 'dmz', 'processing', 'information_technology', 'client', 'criticality', 'personal_data', 'risk_management_process', 'adversary', 'card', 'management_process', 'automatic', 'exposure', 'client_device', 'manipulation', 'attack_vector', 'platform', 'team', 'insight', 'communication_network', 'lack', 'exploit', 'timing', 'system_failure', 'prosecution', 'boundary', 'personal_information', 'issue', 'host', 'audit', 'medium', 'minister', 'record', 'notification', 'core', 'federal', 'denial_service', 'systematic', 'signature', 'content', 'item', 'remote_care', 'virtual', 'port', 'health_information', 'lexicon', 'passive', 'people', 'education', 'cryptographic_key', 'repudiation', 'phase', 'stakeholder', 'loss', 'actor', 'technological', 'security_service', 'accidental', 'theft', 'weakness', 'expert', 'hostile', 'insurance', 'type_procurement', 'barrier', 'web', 'possibility', 'clinical', 'identification_system', 'life_cycle', 'account', 'cyber_resilience', 'credential', 'partner', 'verification', 'collaboration', 'complexity', 'identifier', 'right', 'legacy', 'provision', 'classification', 'appropriate_activity', 'social_media', 'computation', 'nation', 'scan', 'operating', 'table', 'crisis', 'government_institution', 
'dos', 'service_model', 'international_law', 'agent', 'compliance', 'complex', 'remote_access', 'anti', 'cyber_security', 'figure', 'outcome', 'station', 'smart', 'approach', 'secret', 'care_system', 'security_program', 'accountability', 'directive', 'office', 'member', 'means', 'process_control', 'insider', 'token', 'cryptography', 'cybersecurity_challenge', 'degree', 'email', 'certification', 'future', 'task', 'overview', 'confidence', 'guideline', 'network_attack', 'breach', 'wide', 'slave', 'programme', 'mode', 'procedural', 'senior', 'maturity', 'logical', 'specific_recommendation', 'frequency', 'accuracy', 'formal', 'intent', 'documentation', 'tolerance', 'source_code', 'computer_system', 'developer', 'acceptable', 'modern', 'rb', 'physical_access', 'course', 'engineering', 'check', 'commercial', 'problem', 'channel', 'care', 'coordination', 'range', 'physical_inspection', 'expertise', 'sectoral', 'vpn', 'administration', 'basis', 'rule', 'security_control', 'historian', 'intellectual', 'malicious_code', 'mind', 'evidence', 'actuator', 'penetration_testing', 'query', 'relay', 'network_security', 'greater', 'enhancement', 'database', 'separate', 'military', 'civil_liberty', 'glossary', 'customer', 'detail', 'nature', 'hand', 'thing', 'rfp', 'traffic_analysis', 'life', 'impact_assessment', 'mac', 'additional', 'importance', 'illicit', 'reliable', 'regulation', 'jurisdiction', 'access_controls', 'subject', 'detailed', 'perimeter', 'radio', 'installation', 'occurrence', 'major', 'planning', 'significant', 'deployment', 'scope', 'specialist', 'differential_privacy', 'security_risk', 'risk_tolerance', 'open_source', 'gsoc', 'governance', 'network_access', 'recovery', 'switch', 'unauthorized_access', 'gateway', 'public_sector', 'emergency', 'virtual_machine', 'backdoor', 'cyber_risk', 'backup', 'cyberdefence', 'internet_user', 'acquisition', 'authenticity', 'cybersecurity_incident', 'laptop', 'use_cyberspace', 'response_plan', 'methodology', 'reliability', 'bit', 
'security_personnel', 'reputation', 'security_standard', 'defensive', 'exploitable', 'continuous', 'forensic', 'critical_infrastructure_protection', 'sensitive_data', 'cybersecurity_education', 'patching', 'incident_response_plan', 'cybersecurity_field', 'eavesdropping', 'physical_security', 'cybersecurity_culture', 'sensitive_information', 'antivirus', 'infrastructure_protection', 'professional_training', 'power_sector', 'supply_chain_risk', 'hmi', 'content_identification', 'online_media', 'service_user', 'chain_risk', 'modbus', 'aspect_cybersecurity', 'tcp', 'intellectual_property', 'contingency', 'contingency_plan', 'iot', 'networking', 'advanced_persistent', 'audit_record', 'physical_asset', 'decryption', 'asset_management', 'risk_management_practice', 'http', 'industrial_protocol', 'security_feature', 'network_segmentation', 'digital_signature', 'persistent_threat', 'advanced_persistent_threat', 'software_development', 'cybersecurity_community', 'authenticator', 'ip_address', 'cryptographic_algorithm', 'disk', 'factor_authentication', 'cipher', 'certification_authority', 'rbac', 'intrusion_detection', 'information_system_security', 'version_software', 'data_historian', 'virus', 'nist_special_publication', 'operating_system', 'configuration_file', 'malicious_activity', 'digital_society', 'human_resource', 'segmentation', 'organizational_asset', 'persistent', 'risk_management_program', 'configuration_management', 'topology', 'ciphertext', 'system_development', 'response_team', 'script', 'iac', 'absence', 'interconnection', 'exploitation', 'management_system', 'security_officer', 'investigative', 'cybersecurity_information', 'security_function', 'node', 'management_practice', 'security_strategy', 'communication_protocol', 'information_security_program', 'confidential', 'business_function', 'security_plan', 'boundary_protection', 'organizational_risk', 'adverse_effect', 'apt', 'government_management', 'exchange_information', 'private_key', 'website', 
'security_domain', 'business_continuity', 'privacy_policy', 'segregation', 'process_level', 'critical_system', 'cybersecurity_training', 'ransomware', 'network_device', 'digital_agenda', 'network_communication', 'system_development_life_cycle', 'cybersecurity_industry', 'development_life_cycle', 'cryptographic_module', 'external_network', 'emergency_response', 'global_economy', 'cybersecurity_event', 'system_resource', 'authentication_protocol', 'identity_verification', 'power_station', 'real_time', 'communication_technology', 'exfiltration']


In [None]:
words_to_remove = ['source', 'action', 'control', 'controls', 'inventory', 'user', 'estonia', 'protocol', 'internet', 'capability', 'configuration', 'response', 'packet', 'unauthorized', 'detection', 'environment', 'media', 'disclosure', 'privacy_controls', 'trust', 'failure', 'estonian', 'availability', 'controller', 'technical_security_controls', 'property', 'controls_information_system', 'controls_information', 'authorization', 'identifiable_information', 'patch', 'denial', 'message', 'available', 'supplier', 'investigation', 'enforcement', 'identification', 'legislation', 'interface', 'distribution', 'supply', 'criminal', 'online', 'version', 'transmission', 'share', 'person', 'manual', 'prevention', 'damage', 'tactic', 'storage', 'nato', 'safety', 'likelihood', 'penetration', 'advanced', 'signal', 'patient', 'jam', 'functionality', 'assurance', 'clock', 'test', 'compromise', 'community', 'redundancy', 'processing', 'client', 'criticality', 'adversary', 'card', 'automatic', 'manipulation', 'platform', 'team', 'insight', 'lack', 'timing', 'prosecution', 'boundary', 'issue', 'host', 'medium', 'minister', 'record', 'notification', 'core', 'federal', 'systematic', 'signature', 'content', 'item', 'remote_care', 'virtual', 'port', 'health_information', 'lexicon', \
                   'passive', 'people', 'education', 'phase', 'stakeholder', 'loss', 'actor', 'technological', 'accidental', 'theft', 'weakness', 'expert', 'hostile', 'insurance', 'type_procurement', 'barrier', 'possibility', 'clinical', 'account', 'credential', 'partner', 'verification', 'collaboration', 'complexity', 'identifier', 'right', 'provision', 'classification', 'appropriate_activity', 'nation', 'operating', 'table', 'crisis', 'government_institution', 'dos', 'international_law', 'agent', 'complex', 'anti', 'figure', 'outcome', 'station', 'smart', 'approach', 'secret', 'care_system', 'accountability', 'directive', 'office', 'member', 'means', 'process_control', 'insider', 'token', 'degree', 'certification', 'future', 'task', 'overview', 'confidence', 'guideline', 'wide', 'slave', 'programme', 'mode', 'procedural', 'senior', 'maturity', 'logical', 'specific_recommendation', 'frequency', 'accuracy', 'formal', 'intent', 'documentation', 'tolerance', 'acceptable', 'modern', 'rb', 'physical_access', 'course', 'engineering', 'check', 'commercial', 'problem', 'channel', 'care', 'coordination', 'range', 'physical_inspection', 'expertise', 'sectoral', 'administration', 'basis', 'rule', 'intellectual', 'mind', 'evidence', 'actuator', 'query', 'relay', 'greater', \
                   'enhancement', 'separate', 'military', 'civil_liberty', 'glossary', 'customer', 'detail', 'nature', 'hand', 'thing', 'life', 'impact_assessment', 'mac', 'additional', 'importance', 'illicit', 'reliable', 'regulation', 'jurisdiction', 'subject', 'detailed', 'perimeter', 'radio', 'installation', 'occurrence', 'major', 'planning', 'significant', 'deployment', 'scope', 'specialist', 'differential_privacy', 'gsoc', 'governance', 'recovery', 'switch', 'gateway', 'public_sector', 'emergency', 'backup', 'internet_user', 'acquisition', 'authenticity', 'response_plan', 'methodology', 'reliability', 'bit', 'reputation', 'security_standard', 'defensive', 'exploitable', 'continuous', 'antivirus', 'professional_training', 'power_sector', 'content_identification', 'service_user', 'contingencycontingency_plan', 'advanced_persistent', 'disk', 'factor_authentication', 'rbac', 'virus', 'nist_special_publication', 'human_resource', 'segmentation', 'organizational_asset', 'persistent', 'risk_management_program', 'topology', 'script', 'iac', 'absence', 'interconnection', 'investigative', 'node', 'management_practice', 'confidential', 'business_function', 'boundary_protection', 'organizational_risk', 'adverse_effect', 'government_management', 'segregation', 'process_level', \
                   'development_life_cycle', 'cryptographic_module', 'external_network', 'global_economy', 'power_station']
                  

words_to_remove = list(set(words_to_remove))

In [None]:
# Candidate terms = curated cluster terms minus the discarded ones; the two SCADA terms are then
# appended explicitly.
words_to_test = []
for candidate in lista_final:
    if candidate in words_to_remove:
        continue
    words_to_test.append(candidate)
words_to_test.extend(['scada', 'scada_system'])
sorted(words_to_test)

In [None]:
innovacion_list

In [None]:
sistemas_list

In [None]:
# Merge the terms selected so far with the systems and innovation term lists, drop duplicates
# and keep the result in alphabetical order.
merged_terms = set(words_to_test + sistemas_list + innovacion_list)
words_to_test = sorted(merged_terms)
words_to_test[:30]

In [None]:
# Clean the words_to_test list: keep only the candidate terms that exist as columns of idf_df_full,
# preserving the order of words_to_test.
# A set lookup replaces the former word-by-column nested loop (same result for unique column
# labels, without comparing every word against every column).
available_columns = set(idf_df_full.columns)
words_to_test_curated = [word for word in words_to_test if word in available_columns]

len(words_to_test_curated)

In [None]:
#idf_df_full[idf_df_full['scada'] > 0]['scada']
idf_df_full[idf_df_full['machine_learning'] > 0]['machine_learning']

In [None]:
print(words_to_test_curated)

In [None]:
len(words_to_test_curated)

In [None]:
# 01/21
#words_limpieza = ['activo_organización', 'alcance_auditoría', 'alteración', 'ambiente', 'analysis', 'based_on', 'capacidad_prevención', 'caracterización', 'componente_crítico', 'configuration', 'consenso', 'control_técnico_seguridad_calidad', 'criterio_auditoría', 'crítico', 'cumplimiento_procedimiento', 'datar_portability', 'declaración_requisito', 'deficiencia', 'difusión', 'ecosistema', 'ejercicio', 'enlace', 'entrega_servicio', 'estrategia_gestión', 'infrastructure_capabilitie', 'institución_público', 'jornada', 'manage', 'mando',  'motivación', 'nación_unido', 'necesidad_empresarial', 'nivel_servicio', 'objetivo_negocio', 'obligación', 'obtención', 'origen', 'pago', 'perímetro', 'platform_capabilitie', 'posición', 'propagación', 'práctica_seguridad', 'query', 'receptor', 'requisito_empresarial', 'resource_are_controlled', 'smart', 'structured', 'talento', 'violación', 'vista']
#words_to_test_curated = [word for word in words_to_test_curated if word not in words_limpieza]
#len(words_to_test_curated)

In [None]:
print((words_to_test_curated))

In [None]:
# df test: get the columns from the idf_df_full Doc-Term Matrix dataframe
df_test = idf_df_full[words_to_test_curated].copy()
print(df_test.columns)
df_test

### Ajuste c/ponderacion (01/18)

In [None]:
# read xlsx with words: 
df_ponderacion = pd.read_excel('./input/Ponderación Términos Cyber-EN.xlsx', sheet_name = 'Sheet1')
df_ponderacion.head()

In [None]:
def _weighted_terms(column_name):
    """Return the terms listed in one column of df_ponderacion as stripped strings.

    Missing cells are dropped before the conversion to text: the columns of the sheet need not
    have the same length, and converting the padding with `str` produced literal 'nan' entries
    that inflated the list lengths. Cells that are empty after stripping are dropped as well.
    """
    column_terms = df_ponderacion[column_name].dropna().astype(str).str.strip()
    return column_terms[column_terms != ''].to_list()


# One term list per weighting group of the spreadsheet.
pond_innovacion = _weighted_terms('Innovation')
pond_cyber01 = _weighted_terms('Cyber Group1')
pond_cyber02 = _weighted_terms('Cyber Group2')
pond_cyber03 = _weighted_terms('Cyber Group3')

In [None]:
len(pond_innovacion) + len(pond_cyber01) + len(pond_cyber02)+ len(pond_cyber03)
len(set(pond_innovacion + pond_cyber01 + pond_cyber02 + pond_cyber03))

In [None]:
pond_cyber03

In [None]:
df_test.columns

In [None]:
df_test.shape

In [None]:
'machine_learning'in df_test.columns

In [None]:
# Weighted term matching, one pass per document (row of df_test).
#
# For every term whose TF-IDF score is above 0.00001 the score is shifted by +1, the term is
# recorded as a match and the shifted score is added to either the innovation or the cyber total:
#   - term in pond_innovacion -> innovation total, full value
#   - term in pond_cyber01    -> cyber total, weighted 10%
#   - term in pond_cyber02    -> cyber total, weighted 40%
#   - term in pond_cyber03    -> cyber total, weighted 100%
#   - term in none of them    -> cyber total, weighted 50%, and reported for review
#
# Changes with respect to the previous version of this cell:
#   * the scores are read from idf_df_full (df_test was created as a copy of these columns), so
#     re-running the cell no longer stacks the "+1" shift on already shifted values;
#   * only the term columns are scanned, so the helper columns added below are never mistaken
#     for terms on a re-run;
#   * rows are addressed by position and results are aligned on df_test.index, instead of using
#     row labels as positions;
#   * 'total' sums every term column instead of a hard-coded first/last column slice.
term_columns = list(words_to_test_curated)
base_scores = idf_df_full.loc[df_test.index, term_columns].to_numpy(dtype=float)
adjusted_scores = base_scores.copy()

# Classify each term once; sets make the membership checks cheap.
innovation_terms = set(pond_innovacion)
cyber_groups = [(set(pond_cyber01), 0.1), (set(pond_cyber02), 0.4), (set(pond_cyber03), 1.0)]

term_rules = []  # one (kind, weight) pair per entry of term_columns
for term in term_columns:
    if term in innovation_terms:
        term_rules.append(('innovation', None))
        continue
    for group_terms, group_weight in cyber_groups:
        if term in group_terms:
            term_rules.append(('cyber', group_weight))
            break
    else:
        # not present in any weighting list: 50% weight, flagged for review
        term_rules.append(('unlisted', 0.5))

matches = []
matches_inno = []
to_review = []
count_findings = []
cyber_totals = []
innovation_totals = []

for position, index in enumerate(df_test.index):
    matching_col = []       # every matched term of this document
    matching_col_inno = []  # matched terms that belong to the innovation list
    cyb_sum = 0
    inno_sum = 0

    for j, term in enumerate(term_columns):
        if not base_scores[position, j] > 0.00001:
            continue

        # adjustment applied before weighting: shift the score of every matched term by +1
        value = base_scores[position, j] + 1
        adjusted_scores[position, j] = value
        matching_col.append(term)  # store the matched term

        kind, weight = term_rules[j]
        if kind == 'innovation':
            inno_sum = inno_sum + value
            matching_col_inno.append(term)
        else:
            cyb_sum = cyb_sum + weight * value
            if kind == 'unlisted':
                print('Not found in ponderations lists:', term)
                to_review.append(term)

    print(matching_col, str(index), 'cyber_pond:', str(cyb_sum), 'inno_pond:', str(inno_sum))

    matches.append(matching_col)
    matches_inno.append(matching_col_inno)
    count_findings.append(len(matching_col))
    cyber_totals.append(cyb_sum)
    innovation_totals.append(inno_sum)

# Write the shifted scores and the per-document results back into df_test.
df_test[term_columns] = adjusted_scores
df_test['count_findings'] = pd.Series(count_findings, index=df_test.index, dtype=float)
df_test['pond_cyber'] = pd.Series(cyber_totals, index=df_test.index, dtype=float)
df_test['pond_innovation'] = pd.Series(innovation_totals, index=df_test.index, dtype=float)

# Total sum per row over the (shifted) term scores:
df_test['total'] = df_test[term_columns].sum(axis=1)

df_test['matches_cyber'] = pd.Series(matches, index=df_test.index)
df_test['matches_innovation'] = pd.Series(matches_inno, index=df_test.index)

print('terminos para revisar porque no estan en ninguna lista:')
set(to_review)

In [None]:
len(set(to_review))

In [None]:
df_test.tail(15)

In [None]:
#########

In [None]:
# Digital - Cluster 19
# Cybersecurity - Cluster 18
# Innovation - 

df_test['d_cluster_digital'] = X_dist[:,19]
df_test['d_cluster_ciber'] = X_dist[:,18]
#df_test['d_cluster_innovation'] = X_dist[:,17]



In [None]:
cos_test

In [None]:
df_test.matches_cyber[865]

In [None]:
df_test

In [None]:
df_test.matches_innovation[862]

In [None]:
df_test_aux = df_test[df_test.index.isin(cos_test.index)]
df_test_aux 

In [None]:
df_test_aux.columns

In [None]:
df_test_aux = pd.concat([cos_test,df_test_aux[['d_cluster_digital', 'd_cluster_ciber',  \
                                               'matches_cyber', 'pond_cyber', 'count_findings', 'total', 'matches_innovation', 'pond_innovation']]], axis=1)
df_test_aux

In [None]:
df_specialized

In [None]:
cos_test.columns

In [None]:
# Gather, side by side, every result that feeds the analysis:
#   - the clustering results without their last 40 rows
#     (NOTE(review): presumably the specialized reference documents - confirm),
#   - the similarity columns taken from cos_test,
#   - the cluster-distance and term-matching columns taken from df_test_aux.
specialized_doc_columns = [
    'fwrk_crit_infra_nist', 'report_bid_oas', 'guide_800_53_nist', 'lexicon_nist',
    'guide_power_enisa', 'guide_ics_mgmt_inventory_incibe', 'guide_hospitals_enisa',
    'report_ics_enisa', 'nat_strat_spain', 'nat_strat_estonia', 'guide_800_82_nist',
    'lexicon_ics2', 'lexicon_fsb', 'report_info_sharing_wef', 'dig_business',
    'dig_workgrp_1', 'dig_govmnt_assesmnt', 'dig_workgrp_2', 'innovation', 'sistemas',
]
term_match_columns = [
    'd_cluster_digital', 'd_cluster_ciber', 'matches_cyber', 'pond_cyber',
    'count_findings', 'total', 'matches_innovation', 'pond_innovation',
]
result_clustering_total = pd.concat(
    [
        result_clustering.head(-40),
        cos_test[specialized_doc_columns],
        df_test_aux[term_match_columns],
    ],
    axis=1,
)

In [None]:
result_clustering_total

In [None]:
cos_test.columns

In [None]:
# words_cluster_cyber_in_docs
df_test_aux_2 = pd.concat([cos_test[['doc_type', 'operation', 'category']],df_test[df_test.index.isin(cos_test.index)]], axis=1)
df_test_aux_2

# **************************************************************************************************************** #
<br>
<br>
<br>

### Store results

In [None]:
# Excel workbook that collects every result table, each one on its own sheet.
output_file_name = './output/clustering_results_english_2021-02-16' + '.xlsx' # file name

sheets_to_export = (
    ('clustering_results', result_clustering_total),
    ('words_cluster_cyber_in_docs', df_test_aux_2),
    ('similarity_specialized_docs', cos_test),
    ('similarity_all_docs', similarity_df),
)

with pd.ExcelWriter(output_file_name) as writer:
    for sheet_label, sheet_frame in sheets_to_export:
        sheet_frame.to_excel(writer, sheet_name=sheet_label)
   

In [None]:
# Store every result table as a bz2-compressed joblib file (compression level 3) under ./output/.
# Each `df_*` name below holds the base file name (a string), not a dataframe; '.bz2' is appended
# when the file is written.
df_result_clustering = 'result_clustering_english_2021-02-16.joblib'
joblib.dump(result_clustering_total, './output/' + df_result_clustering + '.bz2', compress=('bz2', 3))  # table exported as sheet 'clustering_results'

df_cos_test = 'similarity_specialized_docs_english_2021-02-16.joblib'
joblib.dump(cos_test, './output/' + df_cos_test + '.bz2', compress=('bz2', 3))   # table exported as sheet 'similarity_specialized_docs'

df_similarity_df = 'similarity_all_docs_english_2021-02-16.joblib'
joblib.dump(similarity_df, './output/' + df_similarity_df + '.bz2', compress=('bz2', 3))   # table exported as sheet 'similarity_all_docs'

df_words_cluster_cyber = 'words_cluster_cyber_in_docs_english_2021-02-16.joblib'
joblib.dump(df_test_aux_2, './output/' + df_words_cluster_cyber + '.bz2', compress=('bz2', 3))   # table exported as sheet 'words_cluster_cyber_in_docs'


In [None]:
df_base.columns

In [None]:
df_base[['doc_type', 'language', 'OPERATION_NUMBER', 'DOCUMENT_REFERENCE', 'extracted']].to_excel('./output/operaciones_english_2021-02-16.xlsx')

In [None]:
#df_similarity_term_df = 'similarity_terms_spanish_2021-01-18.joblib'
#joblib.dump(similarity_term_df, './output/' + df_similarity_term_df + '.bz2', compress=('bz2', 3))   # similiarity_term_df

In [None]:
#####

### **************************************************************************************************************** #
<br>

# **************************************************************************************************************** #
<br>
<br>
<br>

## Euclidean Distance

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
# Euclidean distance between centroids:
euclidean_distances(kmeans.cluster_centers_)

In [None]:
# distance to cluster center
X_dist_base = kmeans.transform(X)

In [None]:
print(X_dist_base[0])
print()
print(X_dist_base[1])

In [None]:
euclidean_distances([X_dist_base[0]], [X_dist_base[1]])

In [None]:
euclidean_distances([X_dist_base[0]], [X_dist_base[3]])

In [None]:
euclidean_distances([X_dist_base[1066]], [X_dist_base[0]])

## **************************************************  **************************************************  **************************************************
<br>
<br>

## Topic Modeling

In [None]:
import gensim
import gensim.corpora as corpora
#from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel

In [None]:
df_cluster_lemmas = pd.concat([result_clustering.head(-40), df_base[['alt2_data_lemmatized']]], axis=1)
df_cluster_lemmas.head()

In [None]:
# find optimal number of topics, w.r.t. topic coherence

def compute_coherence_values(id2word, corpus, texts,  
                             k_start_val=2, k_end_val=18, step=2):
    """
    Fit one LDA model per candidate number of topics and score each with c_v coherence.

    The candidate numbers of topics are range(k_start_val, k_end_val, step), so
    k_end_val itself is not evaluated.

    Parameters:
    ----------
    id2word : Gensim dictionary (token id -> word mapping) used to build the models
    corpus : Gensim bag-of-words corpus
    texts : List of tokenised input texts, used by the coherence measure
    k_start_val : first number of topics to try
    k_end_val : upper bound (exclusive) of the number of topics
    step : gap between one number of topics and the next

    Returns:
    -------
    model_list : List of fitted LDA models, in the order they were trained
    coherence_values : c_v coherence of each model, in the same order
    """
    # Training settings shared by every model; only the number of topics varies.
    lda_settings = dict(
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=200,
        alpha='auto',
        minimum_probability=0,
        per_word_topics=True,
    )

    scored_models = []
    for topic_count in range(k_start_val, k_end_val, step):
        print("\t *Building an lda model for number of topics = ", topic_count)
        lda = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                              id2word=id2word,
                                              num_topics=topic_count,
                                              **lda_settings)
        scorer = CoherenceModel(model=lda, texts=texts,
                                dictionary=id2word, coherence='c_v')
        scored_models.append((lda, scorer.get_coherence()))

    model_list = [lda for lda, _ in scored_models]
    coherence_values = [score for _, score in scored_models]
    return model_list, coherence_values

## **************************************************  **************************************************  **************************************************
<br>
<br>

#### Topic Modeling on every cluster:

In [None]:
pprint(df_cluster_lemmas.head())

In [None]:
def gensim_dict(data, low_filter=2, high_filter=0.60):
    '''
    Build a filtered Gensim dictionary and the matching bag-of-words corpus.

    data: list of tokenised documents (one list of tokens per document)
    low_filter: passed to filter_extremes as no_below
    high_filter: passed to filter_extremes as no_above

    Returns the tuple (id2word, corpus): the dictionary after filtering and the
    bag-of-words representation of every document in data.

    @ author: emilianoco
    Version:
        - v0.1 - (11/19/2020)
    '''
    # Dictionary over the whole set of documents.
    vocabulary = corpora.Dictionary(data)
    print('\t *Number of unique words in initital documents:', len(vocabulary))

    # Drop the rarest and the most common words according to the two thresholds.
    vocabulary.filter_extremes(no_below=low_filter, no_above=high_filter)
    print('\t *Number of unique words after removing common words:', len(vocabulary))

    # Bag-of-words vector of each document, built with the filtered dictionary.
    bow_corpus = []
    for tokens in data:
        bow_corpus.append(vocabulary.doc2bow(tokens))

    print('\t *Number of documents: %d' % len(bow_corpus))

    return vocabulary, bow_corpus

In [None]:
# Per-cluster summary table: first 20 rows of the 'cluster_topics' sheet, with every column cast
# to object so that single cells can later hold lists.
lda_summary_path = './output/LDA_analisis.xlsx'
df_topic_modeling = (
    pd.read_excel(lda_summary_path, sheet_name='cluster_topics')
    .head(20)
    .astype(object)
)
df_topic_modeling

In [None]:
# Extend the cluster/lemma table with the (still empty) columns that will receive the
# dominant-topic results of every document.
topic_result_columns = ['Dominant_Topic', 'Perc_Contrib', 'Topic_Keywords', '2nd_Topic',
       '2nd_Contrib']
empty_topic_columns = pd.DataFrame(columns=topic_result_columns)
df_cluster_topics = pd.concat([df_cluster_lemmas, empty_topic_columns])
df_cluster_topics.head()

In [None]:
# Cast each topic-result column to object dtype, one column at a time.
for topic_column in ('Dominant_Topic', 'Perc_Contrib', 'Topic_Keywords', '2nd_Topic', '2nd_Contrib'):
    df_cluster_topics[topic_column] = df_cluster_topics[topic_column].astype(object)
df_cluster_topics

In [None]:
%%time
# Numbers of topics explored for every cluster: LDA models are fitted for 1, 2, 3 and 4 topics
# (`limit` is exclusive). The same range labels the x axis of the coherence plot, so both always agree.
start = 1
limit = 5
step = 1

# Columns copied from the dominant-topic table of a cluster into df_cluster_topics.
dominant_topic_columns = ['Dominant_Topic', 'Perc_Contrib', 'Topic_Keywords', '2nd_Topic', '2nd_Contrib']

# NOTE: this loop calls find_dominant_topics, which is defined in a cell further down the notebook;
# that definition has to be executed before this cell.
for i in range(20):
    print('Processing cluster:', i)
    
    # operations assigned to the selected cluster: their lemmas and their original index labels
    cluster_rows = df_cluster_lemmas[df_cluster_lemmas['category'] == i]
    clusterx_data_lemmatized = cluster_rows['alt2_data_lemmatized'].tolist()
    original_index = list(cluster_rows.index)
    
    # create gensim dictionary and bag-of-words corpus for the cluster:
    id2wordx, corpusx = gensim_dict(data=clusterx_data_lemmatized, low_filter=2, high_filter=0.60 )
    
    print()
    # run the topic modeling algorithm for every candidate number of topics:
    model_listx, coherence_values = compute_coherence_values(id2word=id2wordx, 
            corpus=corpusx, texts=clusterx_data_lemmatized, k_start_val=start, 
            k_end_val=limit, step=step)
    
    # store list of coherence values:
    df_topic_modeling.at[i, 'coherence_values'] = coherence_values
    
    # find the maximum coherence value and the position of the model that reached it:
    max_value = max(coherence_values)
    max_index = coherence_values.index(max_value)
    
    # store max_index:
    df_topic_modeling.at[i, 'max_coherence_index'] = max_index
    
    # save selected model to disk (max_index+1 is its number of topics, since the search starts at 1):
    file_name = "./output/cluster_" + str(i) + "_lda_" + str(max_index+1) + ".topic"
    model_listx[max_index].save(file_name)
    
    # store name of saved model:
    df_topic_modeling.at[i, 'model_selected'] = file_name
    
    # store top30 words per each topic:
    df_topic_modeling.at[i, 'topics'] = model_listx[max_index].print_topics(num_words=30)

    print()
    # generate graph: coherence score against number of topics
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    # the labels must be a sequence: a bare string is iterated character by character and the
    # legend entry would read just "c"
    plt.legend(("coherence_values",), loc='best')
    plt.title("Topic Modeling for Cluster: " + str(i) )
    graph_name = "./output/final_Topic_Modeling-Coherence-ENGLISH_cluster_" + str(i) + ".png"
    plt.savefig(graph_name)
    
    # store the name of the saved graph:
    df_topic_modeling.at[i, 'graph'] = graph_name
    
    # show graph, then release the figure so figures do not pile up across the 20 iterations:
    plt.show()
    plt.close()
    print()
   
    # calculate dominant topics and contribution:
    if max_index > 0: # the selected model has more than 1 topic
        dominant_topics_df_x = find_dominant_topics(ldamodel=model_listx[max_index], corpus=corpusx)
    
        # adjust topic number to int:
        dominant_topics_df_x['Dominant_Topic'] = dominant_topics_df_x['Dominant_Topic'].astype(int)
        dominant_topics_df_x['2nd_Topic'] = dominant_topics_df_x['2nd_Topic'].astype(int)
        
        # copy the results into the original dataframe; the n-th row of the result table belongs to
        # the n-th document of the cluster. Values are read column-wise (not from an iterrows()
        # row) so the integer topic numbers keep their type.
        for index in dominant_topics_df_x.index:
            for topic_column in dominant_topic_columns:
                df_cluster_topics.at[original_index[index], topic_column] = dominant_topics_df_x[topic_column][index]

    else: # the selected model has 1 topic only
        print('The cluster', i, 'has only one topic!')
    
    

    print('#####')
    print()
    

In [None]:
corpus = ''

In [None]:
def find_dominant_topics(ldamodel=None, corpus=None):
    """
    Return the two most probable topics of every document in a corpus.

    Parameters:
    ----------
    ldamodel : fitted topic model; must provide get_document_topics(bow), returning
               (topic_id, probability) pairs, and show_topic(topic_id), returning
               (word, weight) pairs
    corpus : iterable of bag-of-words documents; None (the default) is treated as empty.
             The default no longer depends on a notebook-level `corpus` variable.

    Returns:
    -------
    DataFrame with one row per document, in corpus order, and the columns
    'Dominant_Topic', 'Perc_Contrib', 'Topic_Keywords', '2nd_Topic', '2nd_Contrib'.
    The contributions are percentages rounded to 2 decimals; 'Topic_Keywords' lists the
    words of the dominant topic. An empty corpus yields an empty frame with these columns.

    Raises:
    ------
    IndexError : if the model reports fewer than two topics for a document
                 (use find_dominant_topics_one_topic for single-topic models).
    """
    column_names = ['Dominant_Topic', 'Perc_Contrib', 'Topic_Keywords', '2nd_Topic', '2nd_Contrib']

    # Rows are collected in a list and the frame is built once: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame on every call.
    rows = []
    for bow in (corpus if corpus is not None else []):
        # topics of the document, most probable first
        topic_computation = sorted(ldamodel.get_document_topics(bow), key=lambda x: x[1], reverse=True)[:3]
        top_topic, top_prob = topic_computation[0]
        second_topic, second_prob = topic_computation[1]
        topic_keywords = ", ".join([word for word, prop in ldamodel.show_topic(top_topic)])
        # float(): keep plain Python floats so the rounded percentages stay exact to 2 decimals
        rows.append([top_topic, round(float(top_prob)*100, 2),
                     topic_keywords,
                     second_topic, round(float(second_prob)*100, 2)])

    sent_topics_df = pd.DataFrame(rows, columns=column_names)
    return(sent_topics_df)

In [None]:
def find_dominant_topics_one_topic(ldamodel=None, corpus=corpus):
    """Return only the strongest topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel : object
        Topic model exposing ``get_document_topics(bow)`` (yielding
        ``(topic_id, probability)`` pairs) and ``show_topic(topic_id)``
        (yielding ``(word, weight)`` pairs); both are called below.
    corpus : sequence
        Documents in the representation accepted by ``get_document_topics``.
        Must support ``len()`` and integer indexing.

    Returns
    -------
    pandas.DataFrame
        One row per document, in corpus order, with the columns
        'Dominant_Topic', 'Perc_Contrib' and 'Topic_Keywords'. The contribution
        is a percentage rounded to 2 decimals. An empty corpus yields an empty
        frame that still has these columns.

    Raises
    ------
    IndexError
        If the model reports no topic at all for a document.
    """
    columns = ['Dominant_Topic', 'Perc_Contrib', 'Topic_Keywords']

    # Collect plain rows and build the frame once: DataFrame.append copied the
    # whole frame on every iteration and no longer exists in pandas >= 2.0.
    rows = []

    for i in range(len(corpus)):
        topic_computation = sorted(ldamodel.get_document_topics(corpus[i]), key=lambda x: x[1], reverse=True)

        first_topic = topic_computation[0]
        topic_keywords = ", ".join([word for word, prop in ldamodel.show_topic(first_topic[0])])

        rows.append([first_topic[0], round(first_topic[1]*100, 2), topic_keywords])

    return pd.DataFrame(rows, columns=columns)

In [None]:
%%time
for i in range(20):
    print('Processing cluster:', i)
    
    # get the operations' lemmas for the selected cluster:
    clusterx_data_lemmatized = df_cluster_lemmas[df_cluster_lemmas['category'] == i]['alt2_data_lemmatized'].tolist()
    
    # get the list of original indexes for the selected cluster:
    original_index = list(df_cluster_lemmas[df_cluster_lemmas['category'] == i].index)
    
    # create gensim dictionary: 
    id2wordx, corpusx = gensim_dict(data=clusterx_data_lemmatized, low_filter=2, high_filter=0.60 )
    
    print()
    # run topic modeling algorithm from 1 to 4 topics:
    model_listx, coherence_values = compute_coherence_values(id2word=id2wordx, 
            corpus=corpusx, texts=clusterx_data_lemmatized, k_start_val=1, 
            k_end_val=5, step=1)
    
    # store list of coherence values:
    df_topic_modeling.at[i, 'coherence_values'] = coherence_values
    
    # find the maximun coherence value:
    max_value = max(coherence_values)
    max_index = coherence_values.index(max_value)
    
    # store max_index:
    df_topic_modeling.at[i, 'max_coherence_index'] = max_index
    
    # save selected model to disk:
    file_name = "./output/final_ENGLISH_cluster_" + str(i) + "_lda_" + str(max_index+1) + ".topic"
    model_listx[max_index].save(file_name)
    
    # store name of saved model:
    df_topic_modeling.at[i, 'model_selected'] = file_name
    
    # store top30 words per each topic:
    df_topic_modeling.at[i, 'topics'] = model_listx[max_index].print_topics(num_words=30)

    print()
    # generate graph:
    limit=5; start=1; step=1;
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.title("Topic Modeling for Cluster: " + str(i) )
    graph_name = "./output/final_ENGLISH_Topic_Modeling-Coherence-cluster_" + str(i) + ".png"
    plt.savefig(graph_name)
    
    # save graph to disk:
    df_topic_modeling.at[i, 'graph'] = graph_name
    
    # show graph:
    plt.show()
    print()
   
    # calculate dominant topics and contribution:
    if max_index > 0: # the cluster has more than 1 topic
        dominant_topics_df_x = find_dominant_topics(ldamodel=model_listx[max_index], corpus=corpusx)
        dominant_topics_df_x
    
        # adjust topic number to int:
        dominant_topics_df_x['Dominant_Topic'] = dominant_topics_df_x['Dominant_Topic'].astype(int)
        dominant_topics_df_x['2nd_Topic'] = dominant_topics_df_x['2nd_Topic'].astype(int)
        
        # append results to the original dataframe:
        for index, row in dominant_topics_df_x.iterrows():
            df_cluster_topics.at[original_index[index], 'Dominant_Topic'] = dominant_topics_df_x['Dominant_Topic'][index]
            df_cluster_topics.at[original_index[index], 'Perc_Contrib'] = dominant_topics_df_x['Perc_Contrib'][index]
            df_cluster_topics.at[original_index[index], 'Topic_Keywords'] = dominant_topics_df_x['Topic_Keywords'][index]
            df_cluster_topics.at[original_index[index], '2nd_Topic'] = dominant_topics_df_x['2nd_Topic'][index]
            df_cluster_topics.at[original_index[index], '2nd_Contrib'] = dominant_topics_df_x['2nd_Contrib'][index]

    else: # the cluster has 1 topic only
        print('The cluster', i, 'has only one topic!')
        dominant_topics_df_x = find_dominant_topics_one_topic(ldamodel=model_listx[max_index], corpus=corpusx)
        dominant_topics_df_x
    
        # adjust topic number to int:
        dominant_topics_df_x['Dominant_Topic'] = dominant_topics_df_x['Dominant_Topic'].astype(int)
        
        
        # append results to the original dataframe:
        for index, row in dominant_topics_df_x.iterrows():
            df_cluster_topics.at[original_index[index], 'Dominant_Topic'] = dominant_topics_df_x['Dominant_Topic'][index]
            df_cluster_topics.at[original_index[index], 'Perc_Contrib'] = dominant_topics_df_x['Perc_Contrib'][index]
            df_cluster_topics.at[original_index[index], 'Topic_Keywords'] = dominant_topics_df_x['Topic_Keywords'][index]
            df_cluster_topics.at[original_index[index], '2nd_Topic'] = 'na'
            df_cluster_topics.at[original_index[index], '2nd_Contrib'] = 'na'
    
    

    print('#####')
    print()
    

In [None]:
%%time
for i in [19]:
    print('Processing cluster:', i)
    
    # get the operations' lemmas for the selected cluster:
    clusterx_data_lemmatized = df_cluster_lemmas[df_cluster_lemmas['category'] == i]['alt2_data_lemmatized'].tolist()
    
    # get the list of original indexes for the selected cluster:
    original_index = list(df_cluster_lemmas[df_cluster_lemmas['category'] == i].index)
    
    # create gensim dictionary: 
    id2wordx, corpusx = gensim_dict(data=clusterx_data_lemmatized, low_filter=2, high_filter=0.60 )
    
    print()
    # run topic modeling algorithm from 1 to 4 topics:
    model_listx, coherence_values = compute_coherence_values(id2word=id2wordx, 
            corpus=corpusx, texts=clusterx_data_lemmatized, k_start_val=1, 
            k_end_val=5, step=1)
    
    # store list of coherence values:
    df_topic_modeling.at[i, 'coherence_values'] = coherence_values
    
    # find the maximun coherence value:
    max_value = max(coherence_values)
    max_index = coherence_values.index(max_value)
    
    # store max_index:
    df_topic_modeling.at[i, 'max_coherence_index'] = max_index
    
    # save selected model to disk:
    file_name = "./output/final_ENGLISH_cluster_" + str(i) + "_lda_" + str(max_index+1) + ".topic"
    model_listx[max_index].save(file_name)
    
    # store name of saved model:
    df_topic_modeling.at[i, 'model_selected'] = file_name
    
    # store top30 words per each topic:
    df_topic_modeling.at[i, 'topics'] = model_listx[max_index].print_topics(num_words=30)

    print()
    # generate graph:
    limit=5; start=1; step=1;
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.title("Topic Modeling for Cluster: " + str(i) )
    graph_name = "./output/final_ENGLISH_Topic_Modeling-Coherence-cluster_" + str(i) + ".png"
    plt.savefig(graph_name)
    
    # save graph to disk:
    df_topic_modeling.at[i, 'graph'] = graph_name
    
    # show graph:
    plt.show()
    print()
   
    # calculate dominant topics and contribution:
    if max_index > 0: # the cluster has more than 1 topic
        dominant_topics_df_x = find_dominant_topics(ldamodel=model_listx[max_index], corpus=corpusx)
        dominant_topics_df_x
    
        # adjust topic number to int:
        dominant_topics_df_x['Dominant_Topic'] = dominant_topics_df_x['Dominant_Topic'].astype(int)
        dominant_topics_df_x['2nd_Topic'] = dominant_topics_df_x['2nd_Topic'].astype(int)
        
        # append results to the original dataframe:
        for index, row in dominant_topics_df_x.iterrows():
            df_cluster_topics.at[original_index[index], 'Dominant_Topic'] = dominant_topics_df_x['Dominant_Topic'][index]
            df_cluster_topics.at[original_index[index], 'Perc_Contrib'] = dominant_topics_df_x['Perc_Contrib'][index]
            df_cluster_topics.at[original_index[index], 'Topic_Keywords'] = dominant_topics_df_x['Topic_Keywords'][index]
            df_cluster_topics.at[original_index[index], '2nd_Topic'] = dominant_topics_df_x['2nd_Topic'][index]
            df_cluster_topics.at[original_index[index], '2nd_Contrib'] = dominant_topics_df_x['2nd_Contrib'][index]

    else: # the cluster has 1 topic only
        print('The cluster', i, 'has only one topic!')
        dominant_topics_df_x = find_dominant_topics_one_topic(ldamodel=model_listx[max_index], corpus=corpusx)
        dominant_topics_df_x
    
        # adjust topic number to int:
        dominant_topics_df_x['Dominant_Topic'] = dominant_topics_df_x['Dominant_Topic'].astype(int)
        
        
        # append results to the original dataframe:
        for index, row in dominant_topics_df_x.iterrows():
            df_cluster_topics.at[original_index[index], 'Dominant_Topic'] = dominant_topics_df_x['Dominant_Topic'][index]
            df_cluster_topics.at[original_index[index], 'Perc_Contrib'] = dominant_topics_df_x['Perc_Contrib'][index]
            df_cluster_topics.at[original_index[index], 'Topic_Keywords'] = dominant_topics_df_x['Topic_Keywords'][index]
            df_cluster_topics.at[original_index[index], '2nd_Topic'] = 'na'
            df_cluster_topics.at[original_index[index], '2nd_Contrib'] = 'na'
    
    

    print('#####')
    print()
    

In [None]:
%%time

# Cluster Cyber, w/only 2 documents

for i in [18]:
    print('Processing cluster:', i)
    
    # get the operations' lemmas for the selected cluster:
    clusterx_data_lemmatized = df_cluster_lemmas[df_cluster_lemmas['category'] == i]['alt2_data_lemmatized'].tolist()
    
    # get the list of original indexes for the selected cluster:
    original_index = list(df_cluster_lemmas[df_cluster_lemmas['category'] == i].index)
    
    # create gensim dictionary: 
    id2wordx, corpusx = gensim_dict(data=clusterx_data_lemmatized, low_filter=1, high_filter=0.999 )
    
    print()
    # run topic modeling algorithm from 1 to 4 topics:
    model_listx, coherence_values = compute_coherence_values(id2word=id2wordx, 
            corpus=corpusx, texts=clusterx_data_lemmatized, k_start_val=1, 
            k_end_val=5, step=1)
    
    # store list of coherence values:
    df_topic_modeling.at[i, 'coherence_values'] = coherence_values
    
    # find the maximun coherence value:
    max_value = max(coherence_values)
    max_index = coherence_values.index(max_value)
    
    # store max_index:
    df_topic_modeling.at[i, 'max_coherence_index'] = max_index
    
    # save selected model to disk:
    file_name = "./output/final_ENGLISH_cluster_" + str(i) + "_lda_" + str(max_index+1) + ".topic"
    model_listx[max_index].save(file_name)
    
    # store name of saved model:
    df_topic_modeling.at[i, 'model_selected'] = file_name
    
    # store top30 words per each topic:
    df_topic_modeling.at[i, 'topics'] = model_listx[max_index].print_topics(num_words=30)

    print()
    # generate graph:
    limit=5; start=1; step=1;
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.title("Topic Modeling for Cluster: " + str(i) )
    graph_name = "./output/final_ENGLISH_Topic_Modeling-Coherence-cluster_" + str(i) + ".png"
    plt.savefig(graph_name)
    
    # save graph to disk:
    df_topic_modeling.at[i, 'graph'] = graph_name
    
    # show graph:
    plt.show()
    print()
   
    # calculate dominant topics and contribution:
    if max_index > 0: # the cluster has more than 1 topic
#        dominant_topics_df_x = find_dominant_topics(ldamodel=model_listx[max_index], corpus=corpusx)
#        dominant_topics_df_x
#    
#        # adjust topic number to int:
#        dominant_topics_df_x['Dominant_Topic'] = dominant_topics_df_x['Dominant_Topic'].astype(int)
#        dominant_topics_df_x['2nd_Topic'] = dominant_topics_df_x['2nd_Topic'].astype(int)
#        
#        # append results to the original dataframe:
#        for index, row in dominant_topics_df_x.iterrows():
#            df_cluster_topics.at[original_index[index], 'Dominant_Topic'] = dominant_topics_df_x['Dominant_Topic'][index]
#            df_cluster_topics.at[original_index[index], 'Perc_Contrib'] = dominant_topics_df_x['Perc_Contrib'][index]
#            df_cluster_topics.at[original_index[index], 'Topic_Keywords'] = dominant_topics_df_x['Topic_Keywords'][index]
#            df_cluster_topics.at[original_index[index], '2nd_Topic'] = dominant_topics_df_x['2nd_Topic'][index]
#            df_cluster_topics.at[original_index[index], '2nd_Contrib'] = dominant_topics_df_x['2nd_Contrib'][index]
#
    #else: # the cluster has 1 topic only
        print('The cluster', i, 'has only one topic!')
        dominant_topics_df_x = find_dominant_topics_one_topic(ldamodel=model_listx[max_index], corpus=corpusx)
        dominant_topics_df_x
    
        # adjust topic number to int:
        dominant_topics_df_x['Dominant_Topic'] = dominant_topics_df_x['Dominant_Topic'].astype(int)
        
        
        # append results to the original dataframe:
        for index, row in dominant_topics_df_x.iterrows():
            df_cluster_topics.at[original_index[index], 'Dominant_Topic'] = dominant_topics_df_x['Dominant_Topic'][index]
            df_cluster_topics.at[original_index[index], 'Perc_Contrib'] = dominant_topics_df_x['Perc_Contrib'][index]
            df_cluster_topics.at[original_index[index], 'Topic_Keywords'] = dominant_topics_df_x['Topic_Keywords'][index]
            df_cluster_topics.at[original_index[index], '2nd_Topic'] = 'na'
            df_cluster_topics.at[original_index[index], '2nd_Contrib'] = 'na'
    
    

    print('#####')
    print()

In [None]:
# Display the per-cluster modeling decisions filled in by the loops above
# (coherence values, index of the best model, saved model path, topics, graph path).
df_topic_modeling

In [None]:
# Store 'category' as integers (presumably the cluster id, as in df_cluster_lemmas - TODO confirm).
df_cluster_topics['category'] = df_cluster_topics['category'].astype(int)

In [None]:
# Preview the first rows, including the topic columns written by the loops above.
df_cluster_topics.head()

In [None]:
# Export both result tables to a single Excel workbook, one sheet per table.
output_file_name = './output/clustering_topic_modeling_english_2021-02-16' + '.xlsx' # file name

# sheet name -> table, written in this order:
sheets = {
    'cluster_topics_new': df_cluster_topics,
    'topic_modeling_decisions_new': df_topic_modeling,
}

with pd.ExcelWriter(output_file_name) as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name)
    

In [None]:
# Persist both result tables as bz2-compressed joblib files under ./output/.
df_df_cluster_topics = 'df_cluster_topics_english_2021-02-16.joblib'            # clustering_results
df_df_topic_modeling = 'df_topic_modeling_decisions_english_2021-02-16.joblib'  # topic model decisions

# same compression settings for both dumps: bz2, level 3
bz2_level_3 = ('bz2', 3)

joblib.dump(df_cluster_topics, './output/' + df_df_cluster_topics + '.bz2', compress=bz2_level_3)
joblib.dump(df_topic_modeling, './output/' + df_df_topic_modeling + '.bz2', compress=bz2_level_3)

In [None]:
######## FIN - latest version - 02/16/2021 #########

In [None]:
#'''
# **************************************************************************************************************** #
# ********************************************  Version Control  ************************************************* #
# **************************************************************************************************************** #
  
#   Version:            Date:                User:                   Change:                                       

#   - 0.8            02/16/2021         Emiliano Colina      - Latest version English     
#   - 0.6            01/16/2021         Emiliano Colina      - Latest version - Spanish forked    
#                                                        

#
# **************************************************************************************************************** #
#'''