In [None]:
# Install your required packages here
!pip install pandas numpy

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive in Colab so that files under /content/drive can be read.
# NOTE(review): `storage` is not referenced in any of the visible cells - confirm
# whether this import is still needed.
from google.cloud import storage
from google.colab import drive
drive.mount('/content/drive')

# Custom implementation

Reading the top 10 words per topic into the dataframe for the custom implementation with 10 iterations. 

In [None]:
# Load the top words per topic for the custom implementation (10 iterations).
# index_col=0 makes the first CSV column the row index instead of a data column.
df10 = pd.read_csv('/content/drive/MyDrive/IR-Assignment-2/data/analysis_topics10.csv', index_col=0)
# Plain-text dump of the whole table for visual inspection.
print(df10)

Reading the top 10 words per topic into the dataframe for the custom implementation with 50 iterations. 

In [None]:
# Load the top words per topic for the custom implementation (50 iterations).
# index_col=0 makes the first CSV column the row index instead of a data column.
df50 = pd.read_csv('/content/drive/MyDrive/IR-Assignment-2/data/analysis_topics50.csv', index_col=0)
# Plain-text dump of the whole table for visual inspection.
print(df50)

For each topic, finding the most similar topic in the other run, together with the number of words the two topics share in their top 10.

In [None]:
# For every topic of the 10-iteration custom run, find the topic of the
# 50-iteration custom run that shares the most top words with it.
#
# The overlap is the number of distinct words present in both columns
# (np.intersect1d de-duplicates each input first), so a word that is repeated
# inside a single column can neither inflate nor hide a match.
NUM_TOPICS = 20  # the loops below read columns "Topic 0" .. "Topic 19" from both frames

# Entry i is [index of the best-matching df50 topic, number of shared words].
topicEquality = []

for i in range(NUM_TOPICS):
  # The left-hand column does not depend on j, so look it up once per topic.
  words_i = df10["Topic " + str(i)].to_numpy()
  overlaps = [
      np.intersect1d(words_i, df50["Topic " + str(j)].to_numpy()).size
      for j in range(NUM_TOPICS)
  ]
  best = max(overlaps)
  # list.index returns the first maximum, so ties go to the lowest topic number.
  topicEquality.append([overlaps.index(best), best])

print("First topic: 10 iterations custom, Second topic: 50 iterations custom\n")
for i, (match, shared) in enumerate(topicEquality):
  print('Topic {} is the most similar to topic {}, {} words in common'.format(i, match, shared))

# Library implementation

Reading the top 10 words per topic into the dataframe for the library implementation with 10 iterations. 

In [None]:
# Load the top words per topic for the library implementation (10 iterations).
# index_col=0 makes the first CSV column the row index instead of a data column.
df10lib = pd.read_csv('/content/drive/MyDrive/IR-Assignment-2/data/analysis_topics_lib10.csv', index_col=0)
# Plain-text dump of the whole table for visual inspection.
print(df10lib)

Reading the top 10 words per topic into the dataframe for the library implementation with 2 iterations. 

In [None]:
# Load the top words per topic for the library implementation (2 iterations).
# index_col=0 makes the first CSV column the row index instead of a data column.
df2lib = pd.read_csv('/content/drive/MyDrive/IR-Assignment-2/data/analysis_topics_lib2.csv', index_col=0)
# Plain-text dump of the whole table for visual inspection.
print(df2lib)

For each topic, finding the most similar topic in the other run, together with the number of words the two topics share in their top 10.

In [None]:
# For every topic of the 10-iteration library run, find the topic of the
# 2-iteration library run that shares the most top words with it.
#
# The overlap is the number of distinct words present in both columns
# (np.intersect1d de-duplicates each input first), so a word that is repeated
# inside a single column can neither inflate nor hide a match.
NUM_TOPICS = 20  # the loops below read columns "Topic 0" .. "Topic 19" from both frames

# Entry i is [index of the best-matching df2lib topic, number of shared words].
topicEquality = []

for i in range(NUM_TOPICS):
  # The left-hand column does not depend on j, so look it up once per topic.
  words_i = df10lib["Topic " + str(i)].to_numpy()
  overlaps = [
      np.intersect1d(words_i, df2lib["Topic " + str(j)].to_numpy()).size
      for j in range(NUM_TOPICS)
  ]
  best = max(overlaps)
  # list.index returns the first maximum, so ties go to the lowest topic number.
  topicEquality.append([overlaps.index(best), best])

print("First topic: 10 iterations library, Second topic: 2 iterations library\n")
for i, (match, shared) in enumerate(topicEquality):
  print('Topic {} is the most similar to topic {}, {} words in common'.format(i, match, shared))

# Custom - Library comparison

Comparing how similar the topics are between the custom and library implementations when the same number of iterations is used.

In [None]:
# For every topic of the 10-iteration library run, find the topic of the
# 10-iteration custom run that shares the most top words with it.
#
# The overlap is the number of distinct words present in both columns
# (np.intersect1d de-duplicates each input first), so a word that is repeated
# inside a single column can neither inflate nor hide a match.
NUM_TOPICS = 20  # the loops below read columns "Topic 0" .. "Topic 19" from both frames

# Entry i is [index of the best-matching df10 topic, number of shared words].
topicEquality = []

for i in range(NUM_TOPICS):
  # The left-hand column does not depend on j, so look it up once per topic.
  words_i = df10lib["Topic " + str(i)].to_numpy()
  overlaps = [
      np.intersect1d(words_i, df10["Topic " + str(j)].to_numpy()).size
      for j in range(NUM_TOPICS)
  ]
  best = max(overlaps)
  # list.index returns the first maximum, so ties go to the lowest topic number.
  topicEquality.append([overlaps.index(best), best])

print("First topic: 10 iterations library, Second topic: 10 iterations custom\n")
for i, (match, shared) in enumerate(topicEquality):
  print('Topic {} is the most similar to topic {}, {} words in common'.format(i, match, shared))

The previous results seemed okay. Now we check whether the custom implementation needs more iterations than the library implementation to reach the same, or a more similar, topic distribution.

In [None]:
# For every topic of the 10-iteration library run, find the topic of the
# 50-iteration custom run that shares the most top words with it.
#
# The overlap is the number of distinct words present in both columns
# (np.intersect1d de-duplicates each input first), so a word that is repeated
# inside a single column can neither inflate nor hide a match.
NUM_TOPICS = 20  # the loops below read columns "Topic 0" .. "Topic 19" from both frames

# Entry i is [index of the best-matching df50 topic, number of shared words].
topicEquality = []

for i in range(NUM_TOPICS):
  # The left-hand column does not depend on j, so look it up once per topic.
  words_i = df10lib["Topic " + str(i)].to_numpy()
  overlaps = [
      np.intersect1d(words_i, df50["Topic " + str(j)].to_numpy()).size
      for j in range(NUM_TOPICS)
  ]
  best = max(overlaps)
  # list.index returns the first maximum, so ties go to the lowest topic number.
  topicEquality.append([overlaps.index(best), best])

print("First topic: 10 iterations library, Second topic: 50 iterations custom\n")
for i, (match, shared) in enumerate(topicEquality):
  print('Topic {} is the most similar to topic {}, {} words in common'.format(i, match, shared))

Checking whether the library implementation with 2 iterations approaches the results of the custom implementation with 10 iterations.

In [None]:
# For every topic of the 2-iteration library run, find the topic of the
# 10-iteration custom run that shares the most top words with it.
#
# The overlap is the number of distinct words present in both columns
# (np.intersect1d de-duplicates each input first), so a word that is repeated
# inside a single column can neither inflate nor hide a match.
NUM_TOPICS = 20  # the loops below read columns "Topic 0" .. "Topic 19" from both frames

# Entry i is [index of the best-matching df10 topic, number of shared words].
topicEquality = []

for i in range(NUM_TOPICS):
  # The left-hand column does not depend on j, so look it up once per topic.
  words_i = df2lib["Topic " + str(i)].to_numpy()
  overlaps = [
      np.intersect1d(words_i, df10["Topic " + str(j)].to_numpy()).size
      for j in range(NUM_TOPICS)
  ]
  best = max(overlaps)
  # list.index returns the first maximum, so ties go to the lowest topic number.
  topicEquality.append([overlaps.index(best), best])

print("First topic: 2 iterations library, Second topic: 10 iterations custom\n")
for i, (match, shared) in enumerate(topicEquality):
  print('Topic {} is the most similar to topic {}, {} words in common'.format(i, match, shared))

The number of words in common is distributed fairly evenly in the results above.

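Let's also compare the library implementation with 2 iterations against the custom implementation with 50 iterations.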

In [None]:
# For every topic of the 2-iteration library run, find the topic of the
# 50-iteration custom run that shares the most top words with it.
#
# The overlap is the number of distinct words present in both columns
# (np.intersect1d de-duplicates each input first), so a word that is repeated
# inside a single column can neither inflate nor hide a match.
NUM_TOPICS = 20  # the loops below read columns "Topic 0" .. "Topic 19" from both frames

# Entry i is [index of the best-matching df50 topic, number of shared words].
topicEquality = []

for i in range(NUM_TOPICS):
  # The left-hand column does not depend on j, so look it up once per topic.
  words_i = df2lib["Topic " + str(i)].to_numpy()
  overlaps = [
      np.intersect1d(words_i, df50["Topic " + str(j)].to_numpy()).size
      for j in range(NUM_TOPICS)
  ]
  best = max(overlaps)
  # list.index returns the first maximum, so ties go to the lowest topic number.
  topicEquality.append([overlaps.index(best), best])

print("First topic: 2 iterations library, Second topic: 50 iterations custom\n")
for i, (match, shared) in enumerate(topicEquality):
  print('Topic {} is the most similar to topic {}, {} words in common'.format(i, match, shared))