<a id = "2"></a><br>
# <font color="green"><u> II. Importing/Loading & checking the data:</u></font>

In [None]:
%%capture
!pip install bertopic
!pip install minisom

In [None]:
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True) 

In [None]:
import pandas as pd                                        #Data processing, CSV files I/O (e.g. pd.read_csv)
import numpy as np                                         #Linear Algebra: Matrices ...
import matplotlib.pyplot as plt                            #Data Visualisation
import seaborn as sns    
from bertopic import BERTopic

from tqdm import tqdm
# I discovered that it is possible to download models built specifically for preprocessing scientific texts.
# The spaCy docs list such a project: https://spacy.io/universe/project/scispacy
# Download the en_core_sci_lg model to preprocess the abstracts
from IPython.utils import io
with io.capture_output() as captured:
    !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz

In [None]:
#Import NLP libraries and the spaCy package to preprocess the abstract text
import spacy
from spacy.lang.en.stop_words import STOP_WORDS #import commen list of stopword
import en_core_sci_lg  # import downlaoded model
import string
from minisom import MiniSom  
from sklearn.cluster import SpectralClustering 
import scipy.cluster.hierarchy as sch
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the DBLP v10 research-papers CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/research-papers-dataset/dblp-v10.csv")
df.head()

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

<a id = "3"></a><br>
# <font color="green"><u> III. Data Cleaning:</u></font>

In [None]:
# Drop papers without an abstract; the later steps all work on that text.
# `subset` is given as a list because a bare string is only accepted by newer
# pandas releases, and the result is reassigned instead of using inplace=True.
df = df.dropna(subset=["abstract"])

In [None]:
# Re-check the per-column missing-value counts after dropping rows without an abstract.
df.isna().sum()

<a id = "4"></a><br>
 # <font color="green"><u> IV. NLP data preprocessing:</u></font>

In [None]:
# Load the "en_core_sci_lg" pipeline that parses the abstracts.
parser = en_core_sci_lg.load()
# Maximum text length (in characters) the parser is configured to accept.
parser.max_length = 7_000_000

def spacy_tokenizer(sentence):
    """Lemmatize, lowercase and clean one text; return it as a single string.

    The text is parsed with the module-level ``parser``. Every token is
    replaced by its lowercased, stripped lemma (or by its lowercased surface
    form when the lemma is the ``-PRON-`` placeholder). Tokens found in the
    module-level ``stopwords`` or ``punctuations`` are dropped and the
    remaining tokens are joined with single spaces.

    Args:
        sentence: Raw text to preprocess (here: a paper abstract).

    Returns:
        The cleaned tokens joined by single spaces.
    """
    kept = []
    for token in parser(sentence):
        if token.lemma_ != "-PRON-":
            normalized = token.lemma_.lower().strip()
        else:
            normalized = token.lower_
        # `punctuations` is a string, so the second test is a substring test;
        # it therefore also discards an empty string left over by `.strip()`.
        if normalized not in stopwords and normalized not in punctuations:
            kept.append(normalized)
    return " ".join(kept)

In [None]:
# Punctuation characters and stop words that spacy_tokenizer removes.
punctuations = string.punctuation
stopwords = [*STOP_WORDS]
stopwords[:10]  # peek at a few entries

In [None]:
# The DataFrame still contains a huge amount of data. To process it faster,
# reduce it to a reproducible random sample of at most 10,000 rows; analysing
# all of the data is outside the scope of this notebook.
# min() keeps sample() from raising when fewer than 10,000 rows are left.
df = df.sample(n=min(10000, len(df)), random_state=42)

In [None]:
# Enable tqdm's pandas integration, then run spacy_tokenizer over every
# abstract with a progress bar and store the result in "processed_text".
tqdm.pandas()
df["processed_text"] = df["abstract"].progress_apply(spacy_tokenizer)

<a id = "5"></a><br>
# <font color="green"><u> V. Model Training: BERTopic:</u></font>

To train our BERTopic model, we make a few adjustments to the default parameters while ensuring originality.

Firstly, we select the embedding model "paraphrase-MiniLM-L6-v2" as our preferred choice. This particular embedding model, which can be accessed from the provided link, strikes a balance between performance and speed, making it an excellent option for sentence transformation.

Furthermore, we set the minimum topic size to 50. This parameter determines the smallest allowable size for each topic. By imposing this restriction, we aim to limit the number of generated topics. For instance, if the minimum were set to 10, a significantly larger number of topics would be created, but they might be of lesser significance. In order to prioritize substantial topics, we opt for a minimum size of 50.

In [None]:
# Fit BERTopic on the preprocessed abstracts, then show how many rows the
# topic-info table has.
topic_model = BERTopic(
    embedding_model="paraphrase-MiniLM-L6-v2",
    min_topic_size=50,
    verbose=True,
)
topics, _ = topic_model.fit_transform(df["processed_text"].to_numpy())
len(topic_model.get_topic_info())

<a id = "6"></a><br>
> # <font color="green"><u> A. Topic Representation:</u></font>

In [None]:
# Show the first 10 rows of the topic-info table.
topic_model.get_topic_info().head(10)

In [None]:
# Bar charts of the terms of the top 9 topics (figure height 700).
topic_model.visualize_barchart(top_n_topics=9, height=700)

In [None]:
# Plot the ranks of the terms across the fitted topics.
topic_model.visualize_term_rank()

<a id = "7"></a><br>
> # <font color="green"><u> B. Topic Relationships:</u></font>

In [None]:
# Visualize the topics, restricted to the top 21 topics.
topic_model.visualize_topics(top_n_topics=21)

In [None]:
# Visualize the hierarchy of the top 21 topics (figure width 800).
topic_model.visualize_hierarchy(top_n_topics=21, width=800)

In [None]:
# Heatmap for the top 21 topics, grouped into 5 clusters.
topic_model.visualize_heatmap(n_clusters=5, top_n_topics=21)

<a id = "8"></a><br>
> # <font color="green"><u> C. Topics over Time:</u></font>

In [None]:
# year = df.year.astype(np.int64).tolist()
# year = list(set(year))
# topics_over_time = topic_model.topics_over_time(df.abstract, topics,year )

In [None]:
# topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20, width=900, height=500)

<a id = "9"></a><br>
# <font color="green"><u> VI. Clustering using scikit-learn: Uncovering Patterns in Data:</u></font>

<a id = "10"></a><br>
># <font color="green"><u> A. Vectorization of the abstracts and dimensionality reduction with PCA:</u></font>
 

In [None]:
# define vec function
def vectorize(text, maxx_features):
    """Return the TF-IDF matrix of ``text``.

    Args:
        text: The documents to vectorize.
        maxx_features: Value passed to ``TfidfVectorizer`` as ``max_features``.

    Returns:
        The result of ``TfidfVectorizer.fit_transform`` on ``text``.
    """
    return TfidfVectorizer(max_features=maxx_features).fit_transform(text)

In [None]:
# TF-IDF-vectorize every preprocessed abstract. The feature cap of 2 ** 12 is
# an arbitrary choice and a candidate hyperparameter for later tuning.
text = df["processed_text"].values
X = vectorize(text, maxx_features=2 ** 12)
X.shape

In [None]:
# Reduce the dense TF-IDF matrix with PCA, keeping 95% of the variance.
pca = PCA(random_state=42, n_components=0.95)
X_reduced = pca.fit_transform(X.toarray())
X_reduced.shape

<a id = "11"></a><br>
># <font color="green"><u> B. Hierarchical clustering:</u></font>

>> ## Dendrogram
A dendrogram is a diagram representing a tree. This diagrammatic representation is frequently used in different contexts: in hierarchical clustering, it illustrates the arrangement of the clusters produced by the corresponding analyses.

In [None]:
# Ward-linkage hierarchical clustering of the PCA-reduced abstracts, drawn as
# a dendrogram.
plt.figure(1, figsize=(16, 8))
dendrogram = sch.dendrogram(sch.linkage(X_reduced, method="ward"))

plt.title('Dendrogram')
plt.xlabel('Abstracts')  # the clustered samples are paper abstracts, not customers
plt.ylabel('Euclidean distances')
plt.show()

<a id = "12"></a><br>
># <font color="green"><u> C. Spectral clustering:</u></font>

In [None]:
# Reduce the TF-IDF matrix to two principal components (columns P1 and P2).
# random_state is fixed, as for the 95%-variance PCA, so that the projection
# is reproducible when PCA uses its randomized solver.
pca = PCA(n_components=2, random_state=42)
X_principal = pd.DataFrame(pca.fit_transform(X.toarray()), columns=['P1', 'P2'])

In [None]:
# Build a spectral-clustering model with two clusters and an RBF affinity.
# random_state is fixed so that repeated runs use the same initialization.
spectral_model_rbf = SpectralClustering(n_clusters=2, affinity='rbf', random_state=42)

# Fit the model and store the predicted cluster label of every sample.
labels_rbf = spectral_model_rbf.fit_predict(X_principal)

In [None]:
# Scatter plot of the two principal components, coloured by the labels already
# stored in `labels_rbf`. Reusing them avoids fitting a second
# SpectralClustering model just to colour the points.
plt.scatter(X_principal['P1'], X_principal['P2'], c=labels_rbf, cmap=plt.cm.winter)
plt.show()

<a id = "13"></a><br>
># <font color="green"><u> D. Self-organizing maps (SOM):</u></font>

In [None]:
# Convert the two-component DataFrame to a NumPy array for the SOM code below.
# np.asarray also accepts an array, so re-running this cell does not fail the
# way calling `.to_numpy()` on an already-converted array would.
X_principal = np.asarray(X_principal)

In [None]:
# Train a self-organizing map with a linear 1x5 topography, one randomly
# chosen sample per iteration, and record both error measures after each update.
som_shape = (1, 5)

# The SOM and the sample picker are both seeded so the run is reproducible.
som = MiniSom(som_shape[0], som_shape[1], X_principal.shape[1],
              sigma=0.5, learning_rate=0.5, random_seed=42)
rng = np.random.RandomState(42)

max_iter = 1000
q_error = []
t_error = []

for i in range(max_iter):
    rand_i = rng.randint(len(X_principal))
    som.update(X_principal[rand_i], som.winner(X_principal[rand_i]), i, max_iter)
    # Both errors are evaluated on the full data set after every single update.
    q_error.append(som.quantization_error(X_principal))
    t_error.append(som.topographic_error(X_principal))

plt.plot(np.arange(max_iter), q_error, label='quantization error')
plt.plot(np.arange(max_iter), t_error, label='topographic error')
plt.ylabel('Error')  # the axis carries both curves, not only the quantization error
plt.xlabel('Iteration index')
plt.legend()
plt.show()

In [None]:
# Each SOM neuron represents one cluster: look up the winning neuron of
# every sample.
winner_coordinates = np.array([som.winner(x) for x in X_principal]).T

# np.ravel_multi_index converts the two-dimensional grid coordinates into a
# one-dimensional cluster index.
cluster_index = np.ravel_multi_index(winner_coordinates, som_shape)

# Plot the samples, one scatter series per cluster.
plt.figure(figsize=(10, 8))

for c in np.unique(cluster_index):
    in_cluster = cluster_index == c
    plt.scatter(X_principal[in_cluster, 0],
                X_principal[in_cluster, 1], label='cluster='+str(c), alpha=.7)

# Plot the neuron weights as centroids, one grid row at a time.
for centroid in som.get_weights():
    plt.scatter(centroid[:, 0], centroid[:, 1], marker='x',
                s=10, linewidths=20, color='k')

plt.title("Clusters of Abstracts")  # the samples are paper abstracts, not customers
plt.xlabel("P1")
plt.ylabel("P2")
plt.legend();