In [1]:
import numpy as np
import pandas as pd
from BERT import bert_embed_text
from tqdm import tqdm
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import torch_functions as torch_fns
from torchinfo import summary
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import word_wizard as ww

In [2]:
# Read the scraped articles that will be embedded and discard every row whose
# 'body' is missing, since there is no text to embed for it.
articles = pd.read_csv("../data/clean/MicrosoftANDquantum_None.csv").dropna(subset=['body'])
articles.head()

Unnamed: 0,engine,link,se_description,se_source,n3k_author,n3k_published,title,body
0,Yahoo,https://seekingalpha.com/article/4603046-micro...,Quantum computing is a disruptive technology t...,Seeking Alpha,['Aseity Research'],2023-05-11 11:03:43-04:00,Microsoft Stock: Leading The Quantum Computing...,Bartlomiej Wroblewski\n\nQuantum computing is ...
1,Yahoo,https://www.morningstar.com/news/pr-newswire/2...,Quantum Computing Inc. initiates commercializa...,Morningstar,[],2023-05-11 15:01:00-05:00,Quantum Computing Inc Announces First Quarter ...,Quantum Computing Inc. initiates commercializa...
2,Bing,https://phys.org/news/2023-05-google-quantum-a...,Our intuition tells us that it should be impos...,Phys.org,['Google Quantum Ai'],,Google Quantum AI braids non-Abelian anyons fo...,This article has been reviewed according to Sc...
3,Yahoo,https://www.msn.com/en-us/money/savingandinves...,Technology giant Microsoft (NASDAQ:MSFT) is ta...,MSN News,[],,Microsoft (NASDAQ:MSFT) Strategizes to Curb Co...,Technology giant Microsoft (NASDAQ:MSFT) is ta...
4,Yahoo,https://www.techtarget.com/searchstorage/tip/A...,"In 2017, Microsoft surprised attendees of its ...",SearchSecurity.com,['Published'],,A primer on quantum computing storage and memo...,"In 2017, Microsoft surprised attendees of its ..."


In [3]:
# Inspect the row at integer position 169 (double brackets keep the result a DataFrame).
# In the output below this row carries index label 170 - presumably because the dropna()
# above removed an earlier row without resetting the index.
articles.iloc[[169]]

Unnamed: 0,engine,link,se_description,se_source,n3k_author,n3k_published,title,body
170,Google,https://cloudblogs.microsoft.com/quantum/2022/...,"Get the inside, first-hand account of Microsof...",Microsoft Cloud Blogs,['Microsoft Azure Quantum Team'],2022-12-05 00:00:00,Microsoft Quantum Innovator Series: The path t...,This embed requires accepting cookies from the...


In [4]:
# Build two WordWizard instances over the same articles frame; they differ only in the `lean` flag.
# NOTE(review): what `lean` switches is defined in word_wizard, not here - confirm there.
lean_wizard = ww.WordWizard(df=articles, lean=True)
bulky_wizard = ww.WordWizard(df=articles, lean=False)

In [5]:
# Embed only the 'body' column with the lean wizard. The preview further down shows the
# result carrying a `body_embedded` column next to the original text.
lean = lean_wizard.create_embeddings(columns=['body'])

Creating embeddings for column: ['body']: 100%|██████████| 1/1 [00:26<00:00, 26.66s/it]


In [6]:
# Embed both 'body' and 'title' with the non-lean wizard. The preview further down shows
# `body_embedded` and `title_embedded` columns in the result.
bulky = bulky_wizard.create_embeddings(columns=['body', 'title'])

Creating embeddings for column: ['body', 'title']: 100%|██████████| 2/2 [01:31<00:00, 45.92s/it]


In [8]:
# Preview the lean result: article text side by side with its body embedding.
lean[["title", "body", "body_embedded"]].head()

Unnamed: 0,title,body,body_embedded
0,Microsoft Stock: Leading The Quantum Computing...,Bartlomiej Wroblewski\n\nQuantum computing is ...,"[0.4255876, -0.14641741, 0.09089464, -0.096109..."
1,Quantum Computing Inc Announces First Quarter ...,Quantum Computing Inc. initiates commercializa...,"[0.40670347, -0.07659096, 0.02964771, -0.00834..."
2,Google Quantum AI braids non-Abelian anyons fo...,This article has been reviewed according to Sc...,"[0.5094838, -0.29704052, 0.02276769, -0.083231..."
3,Microsoft (NASDAQ:MSFT) Strategizes to Curb Co...,Technology giant Microsoft (NASDAQ:MSFT) is ta...,"[0.2006045, -0.106501766, 0.0603969, 0.0034779..."
4,A primer on quantum computing storage and memo...,"In 2017, Microsoft surprised attendees of its ...","[0.47235262, -0.03634747, 0.08287287, -0.00015..."


In [9]:
# Preview the non-lean result: article text side by side with its body and title embeddings.
bulky[["title", "body", "body_embedded", "title_embedded"]].head()

Unnamed: 0,title,body,body_embedded,title_embedded
0,Microsoft Stock: Leading The Quantum Computing...,Bartlomiej Wroblewski\n\nQuantum computing is ...,"[0.15203002, 0.3710277, -0.070250034, 0.227758...","[-0.37368658, 0.3039311, 0.0544124, 0.13620754..."
1,Quantum Computing Inc Announces First Quarter ...,Quantum Computing Inc. initiates commercializa...,"[0.07547751, 0.08461023, 0.039843753, 0.019642...","[-0.12075205, 0.17395379, 0.06403588, 0.012568..."
2,Google Quantum AI braids non-Abelian anyons fo...,This article has been reviewed according to Sc...,"[0.21349677, 0.37277052, -0.13767469, 0.155252...","[0.31152043, 0.11549268, -0.2891103, 0.1591472..."
3,Microsoft (NASDAQ:MSFT) Strategizes to Curb Co...,Technology giant Microsoft (NASDAQ:MSFT) is ta...,"[0.07998328, 0.29852116, 0.12375433, 0.0142973...","[-0.4091746, 0.33247614, 0.017602623, -0.06984..."
4,A primer on quantum computing storage and memo...,"In 2017, Microsoft surprised attendees of its ...","[0.25351882, 0.24780278, -0.101479046, 0.18780...","[0.02133733, -0.07428073, -0.05858498, 0.19360..."


In [None]:
# Device-agnostic setup: prefer an NVIDIA GPU (CUDA), then Apple's Metal backend (MPS),
# and fall back to the CPU when neither is available.
if torch.cuda.is_available():
    backend_name = "cuda"
elif torch.backends.mps.is_available():
    backend_name = "mps"
else:
    backend_name = "cpu"

device = torch.device(backend_name)
device

In [None]:
# Quick look at the first three article rows.
articles.head(3)

In [None]:
# Index the articles by their URL.
# The previous key list also contained an empty string, which is not a column name and
# makes set_index raise KeyError. The result is assigned back (instead of inplace=True)
# so the rebinding of `articles` is explicit.
# NOTE(review): if a second index level was intended, add its real column name here.
articles = articles.set_index("link")

### Uncomment the cell below when running the notebook for the first time (it computes the BERT body embeddings and caches them to `../data/misc/quant_embed.gzip`)

In [None]:
#An alternative would be to try summarizing by paragraph (we shall do that soon)

# articles['body_embedding'] = None
# for i in tqdm(range(len(articles))):
#     try:
#         articles.at[i,'body_embedding'] = bert_embed_text(articles.at[i,'body']).tolist()
#     except:
#         articles.at[i,'body_embedding'] = []

# articles.to_parquet('../data/misc/quant_embed.gzip',compression='gzip', index=False)

In [None]:
bert_embed_text = pd.read_parquet('../data/misc/quant_embed.gzip')
sentence_embeddings_df = pd.DataFrame(bert_embed_text['body_embedding'].tolist(), index=bert_embed_text.index).dropna().values
sentence_embeddings = MinMaxScaler().fit_transform(np.array(sentence_embeddings_df))

In [None]:
bert_embed_text

In [None]:
# Turn the scaled embedding matrix into a float32 tensor for training.
data = torch.from_numpy(sentence_embeddings).float()

In [None]:
# Sanity check on the training tensor: shape, dtype and value range.
data.shape, data.dtype, data.max(), data.min()

In [None]:
# Hyperparameters for the autoencoder and its training run
NUM_EPOCHS, BATCH_SIZE = 200, 16
INPUT_SHAPE = OUTPUT_SHAPE = 768  # the model reconstructs its input, so both widths are equal
# NOTE(review): the original author reported a failure "when just doing []" - presumably
# an empty hidden-layer list; confirm against torch_functions.Autoencoder.
HIDDEN_UNITS = [256, 16]
DROPOUT = 0.2
LEARNING_RATE = 2e-4  # Adam's default would be 1e-3

In [None]:
# Wrap the embedding tensor in a DataLoader: shuffled mini-batches of BATCH_SIZE rows,
# loaded in the main process (num_workers=0).
from os import cpu_count  # imported but not used by this cell
dataloader = DataLoader(
    dataset=data,
    batch_size=BATCH_SIZE,
    num_workers=0,
    shuffle=True,
)

In [None]:
# Shape of one mini-batch and the number of batches per epoch.
next(iter(dataloader)).shape, len(dataloader)

In [None]:
# Instantiate model
# The four hyperparameters are passed positionally.
# NOTE(review): the order (input width, hidden sizes, output width, dropout) is assumed
# from the constant names - confirm against the signature of torch_functions.Autoencoder.
model = torch_fns.Autoencoder(INPUT_SHAPE, HIDDEN_UNITS, OUTPUT_SHAPE, DROPOUT)

In [None]:
# Print a summary using torchinfo
# The summary is generated for an input of one mini-batch (BATCH_SIZE x INPUT_SHAPE) and
# lists, per layer, the input/output sizes, parameter count and whether it is trainable.
summary(model=model, 
        input_size=(BATCH_SIZE, INPUT_SHAPE),
        # col_names=["input_size"],
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"]
) 

In [None]:
# Reconstruction objective (mean squared error), optimised with Adam at LEARNING_RATE.
loss_fn = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Delegate the training loop to torch_functions.py and keep its return value for plotting.
results = torch_fns.train(
    model=model, dataloader=dataloader,
    loss_fn=loss_fn, optimizer=optimizer,
    epochs=NUM_EPOCHS, device=device,
)


In [None]:
# Plot the training history returned by torch_fns.train.
torch_fns.plot_loss_curves(results)

In [None]:
# Encode and reconstruct the whole dataset with the trained autoencoder.
model = model.to(device)
# Put the model in evaluation mode: inference_mode() only disables autograd, it does not
# switch off dropout. The model is built with a DROPOUT rate, so presumably it contains
# dropout layers that would otherwise add noise to the encodings.
model.eval()

with torch.inference_mode():
    # Use a separate name for the on-device copy so `data` stays a CPU tensor and this
    # cell (and the DataLoader cell) can be re-run safely.
    inputs = data.to(device)
    encoded = model.encode(inputs)
    decoded = model.decode(encoded)
    error = loss_fn(decoded, inputs).item()
    # Bring the results back to the host as NumPy arrays for plotting and clustering.
    enc = encoded.cpu().numpy()
    dec = decoded.cpu().numpy()

print(f'Root mean squared error: {np.sqrt(error):.4f}')

In [None]:
# Side-by-side scatter of the first two coordinates of the original, encoded and
# decoded data.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 6))

panels = (
    (ax1, 'Original', sentence_embeddings),
    (ax2, 'Encoded', enc),
    (ax3, 'Decoded', dec),
)
for axis, panel_title, points in panels:
    axis.scatter(points[:, 0], points[:, 1])
    axis.set_title(panel_title)

plt.show()

In [None]:
# Compare the shape of the input data with the shape of its encoding.
data.shape, enc.shape

In [None]:
# Remove outliers from encoded data using iqr using numpy
def remove_outliers_iqr(data, iqr_range=1.5):
    """Drop outlying samples using Tukey's IQR fences, computed per feature.

    For a 2-D (or higher) array the first axis is treated as the sample axis: the
    quartiles are computed separately for every feature, and a sample is kept only
    when all of its features lie inside
    ``[q1 - iqr_range * iqr, q3 + iqr_range * iqr]``. The result keeps the feature
    dimensions of the input, so a 2-D input gives a 2-D output. A 1-D array is
    filtered element by element.

    The fences are inclusive, so a feature with zero spread (q1 == q3) does not
    cause every sample to be discarded.

    Args:
        data: array-like of shape (n_samples,) or (n_samples, n_features, ...).
        iqr_range: multiplier applied to the inter-quartile range to place the fences.

    Returns:
        numpy.ndarray containing only the retained samples. Rows are removed, so the
        result no longer lines up one-to-one with the rows of the input.
    """
    data = np.asarray(data)
    if data.size == 0:
        # Nothing to compute percentiles on; hand the empty array back unchanged.
        return data

    # axis=0 gives one pair of quartiles per feature instead of one global pair.
    q1, q3 = np.percentile(data, [25, 75], axis=0)
    iqr = q3 - q1
    lower_bound = q1 - (iqr_range * iqr)
    upper_bound = q3 + (iqr_range * iqr)

    within_fences = (data >= lower_bound) & (data <= upper_bound)
    if data.ndim > 1:
        # Collapse the per-feature mask to one boolean per sample, so indexing removes
        # whole rows rather than flattening the array.
        within_fences = within_fences.reshape(data.shape[0], -1).all(axis=1)
    return data[within_fences]

# Overwrites `enc` with the filtered result, so running this cell again filters the
# already-filtered data. The trailing expression displays the new shape.
enc = remove_outliers_iqr(enc)
enc.shape

In [None]:
# Perform clustering on the encoded embeddings.
# init / n_init / random_state are pinned to the same values the silhouette-search cell
# uses, so the cluster assignment (and therefore the plot) is reproducible across runs.
n_clusters = 4  # Number of clusters
kmeans = KMeans(n_clusters=n_clusters, init="k-means++", n_init="auto", random_state=42)
cluster_labels = kmeans.fit_predict(enc)

# Calculate cluster means
cluster_centers = kmeans.cluster_centers_

# Create scatter plot of the first two encoded dimensions, one colour per cluster
fig, ax = plt.subplots(figsize=(8, 6))
for cluster_label in range(n_clusters):
    cluster_points = enc[cluster_labels == cluster_label]
    ax.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {cluster_label + 1}')

# Overlay the cluster means and add the legend
ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1], marker='o', s=150, c='black', edgecolors='white', label='Cluster Mean')
ax.legend()

# Label and show the plot
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.set_title('Clustering of Dimensionality-Reduced Embeddings')
plt.show()


In [None]:
## Dynamic retrieval of the number of clusters
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

best_clusters = 0  # best cluster number which you will get
# Silhouette scores lie in [-1, 1]; starting below that range guarantees that the first
# candidate is accepted, so best_clusters can never stay at the invalid value 0.
best_silhouette = -1.0
# Loop-local names (k, candidate_labels) are used so this search does not overwrite the
# `n_clusters` / `cluster_labels` variables of the previous clustering cell.
for k in range(2, 10):
    candidate = KMeans(n_clusters=k, init="k-means++", n_init="auto", random_state=42)
    # Fit and score on the same data (`enc`); the previously referenced name `iqr` is
    # not defined anywhere at notebook level and raised NameError.
    candidate_labels = candidate.fit_predict(enc)
    # random_state makes the score reproducible when sample_size actually subsamples.
    silhouette_avg = silhouette_score(enc, candidate_labels, sample_size=3000, random_state=42)
    if silhouette_avg > best_silhouette:
        best_silhouette = silhouette_avg
        best_clusters = k
print("No of Clusters:", best_clusters)

## K-Means implementation with the selected number of clusters
num_clusters = best_clusters
km = KMeans(n_clusters=num_clusters, init="k-means++", n_init="auto", random_state=42)
km.fit(enc)
clusters = km.labels_.tolist()

In [None]:
# Scatter the first two encoded dimensions, coloured by the final K-Means assignment.
first_dim, second_dim = enc[:, 0], enc[:, 1]
plt.scatter(first_dim, second_dim, c=clusters, s=50, cmap='viridis')