In [1]:
from G11_code.data_collection import *
from G11_code.helper_functions import *
from G11_code.indexing import *
from G11_code.clustering import *
from G11_code.evaluation import evaluation
import seaborn as sns
from tqdm import tqdm

ModuleNotFoundError: No module named 'torch'

# Data Preparation

In [None]:
# Locate the BBC News Summary dataset relative to the notebook's working directory.
# NOTE(review): `os` is never imported explicitly in this notebook; it is presumably
# re-exported by one of the G11_code star imports — confirm.
article_path = os.path.join("..", "BBC News Summary", "BBC News Summary", "News Articles")
summary_path = os.path.join("..", "BBC News Summary", "BBC News Summary", "Summaries")
print("Article path:", article_path)
print("Summary path:", summary_path)
# The "_"-prefixed results hold the raw per-category data; the cleaned variants
# (without the leading underscore) are derived from them further down.
_article_file_paths_by_cat, _articles_by_cat, _summary_file_paths_by_cat, _summaries_by_cat, category_names = read_files(article_path, summary_path)

In [None]:
# Derive the per-category summary sentence indices from the raw articles and summaries.
# The second result, faulty_summary_ids, is what the next cell passes to remove_entries.
# NOTE(review): what makes a summary "faulty" is decided inside get_summary_sentence_indices — confirm there.
_summary_sentence_indices_by_cat, faulty_summary_ids = get_summary_sentence_indices(_articles_by_cat, _summaries_by_cat)

In [None]:
def _without_faulty(raw_by_cat):
    """Return (per-category data with the faulty ids removed, the same data flattened)."""
    cleaned_by_cat = remove_entries(raw_by_cat, faulty_summary_ids)
    return cleaned_by_cat, flatten(cleaned_by_cat)


# Apply the same clean-up to every raw collection so that all of them stay aligned.
articles_by_cat, articles = _without_faulty(_articles_by_cat)
article_file_paths_by_cat, article_file_paths = _without_faulty(_article_file_paths_by_cat)
summaries_by_cat, summaries = _without_faulty(_summaries_by_cat)
summary_file_paths_by_cat, summary_file_paths = _without_faulty(_summary_file_paths_by_cat)
summary_sentence_indices_by_cat, summary_sentence_indices = _without_faulty(_summary_sentence_indices_by_cat)

In [None]:
# Map every article file path to its position in the flattened article list.
# Keys are normalized exactly like the lookups below, so differences in separators or
# redundant "."/".." segments between the two sides cannot cause silent misses.
dict_path_to_articleID = {os.path.normpath(path): i for i, path in enumerate(article_file_paths)}


def map_path_to_articleID(path):
    """Return the article id registered for `path`, or None if the path is unknown.

    The path is normalized with os.path.normpath before the lookup.
    """
    return dict_path_to_articleID.get(os.path.normpath(path))

# Options

In [None]:
# Example lookup: resolve one article file to its id in the flattened article list.
# Note that `d` is reassigned to 0 in a later cell before it is used.
path = os.path.join(article_path, 'tech', '199.txt')
d = map_path_to_articleID(path)
# Selects how the index `I` is obtained in the `match compute_index` cell:
#   0 -> InvertedIndex(0,0), 1 -> indexing(None, index_path=...), 2 -> indexing(articles)
compute_index = 1

# Experiments

## Unsupervised

In [None]:
match compute_index:
    case 0 :
        I = InvertedIndex(0,0)
    case 1:
        index_path = './index/Index.pkl'
        I = indexing(None, index_path = index_path)
    case 2:
        I = indexing(articles)

In [None]:
# Sanity check of the index: print its string rendering for document id 600.
print(I.doc_to_string(600))

In [None]:
# Print NLTK's help text for every Penn Treebank POS tag (the regex '.*' matches all tags).
nltk.help.upenn_tagset('.*')

In [None]:
# Load the precomputed sentence and document embeddings from their pickle files.
# NOTE(review): pickle_load presumably wraps pickle.load — only load files from a trusted source.
sentence_embeddings_path = os.path.join('./embeddings', 'sentence_embeddings.pkl')
sentence_embeddings_by_cat = pickle_load(sentence_embeddings_path)
document_embeddings_path = os.path.join('./embeddings', 'document_embeddings.pkl')
document_embeddings_by_cat = pickle_load(document_embeddings_path)

In [None]:
np.random.seed(42)
d = 0    # id of the document inspected in the following cells
b = 0.5  # parameters of the similarity -> dissimilarity conversions below
k = 1
# Dissimilarity matrix of document d based on the stored sentence embeddings.
dM = bert_compute_dissimilarity_matrix(d, file_path=sentence_embeddings_path)

# Candidate similarity -> dissimilarity conversions.
# k and b are bound as default arguments so that each function keeps the values chosen
# here even though later cells rebind the global names `b` and `k`.
sim2diss1 = lambda S, k=k, b=b: np.exp(-k * (S + b))  # was `k(S+b)`, which called the int k
sim2diss2 = lambda S, b=b: (2 / np.pi) * np.arccos((1 - b) * S + b)
sim2diss3 = lambda S, k=k, b=b: b * (1 - np.log(1 + k * S) / np.log(1 + k))

# TF-IDF based dissimilarity matrix of the same document, using the arccos conversion.
dM2 = tf_idf_compute_dissimilarity_matrix(d, I, conversion_function=sim2diss2)

In [None]:
# Quartiles of the dissimilarities; .compressed() is used to obtain a flat array of
# the entries (dM2 is presumably a masked array — confirm).
Q1, Q2, Q3 = np.percentile(dM2.compressed(), [25, 50, 75])
IQR = Q3 - Q1
# NOTE(review): `ul` is centred on the mean rather than on Q3 and is not used below —
# the displayed selection thresholds on Q3. Confirm which cutoff was intended.
ul = np.mean(dM2) + 1.5 * IQR
# Indices of the entries at or above the third quartile.
np.where(dM2 >= Q3)

In [None]:
# Kernel density estimate of the embedding-based dissimilarities (dM).
sns.kdeplot(dM.compressed())

In [None]:
# Kernel density estimate of the TF-IDF based dissimilarities (dM2), for comparison with dM.
sns.kdeplot(dM2.compressed())

In [None]:
# Agglomerative clustering with complete linkage on the TF-IDF dissimilarities;
# kmax is set to half the number of rows of dM2. The second element of the
# returned pair is discarded here.
n_clust, (labels,_) = sentence_clustering(dM2, algorithm='agglomerative', linkage='complete', kmax=len(dM2)//2)
# Show the cluster count, the number of labels and the labels themselves.
n_clust, len(labels), labels 

In [None]:
# Cluster the TF-IDF dissimilarities with sentence_clustering's default algorithm.
# kmax is derived from dM2 — the matrix actually being clustered — instead of dM,
# matching the agglomerative call on the same matrix.
n_clust, (labels, indices) = sentence_clustering(dM2, kmax=len(dM2)//2)
# Show the cluster count, the number of labels, the labels and the returned indices.
n_clust, len(labels), labels, indices

In [None]:
# Closure demo: make_adder(y) returns a function that adds y to its argument.
# Descriptive names are used so that the global `b` (the conversion parameter set
# earlier in the notebook) is not silently replaced by a function.
make_adder = lambda y: lambda x: x + y
add_one = make_adder(1)
add_one(8)

In [None]:
# Wrap a short multi-sentence sample text in a TextBlob for experimentation.
a = TextBlob('Hello, I\'m Inigo Montoya. You killed my wife. Prepare to die. You are dead.')

In [None]:
from collections import Counter

# Tokens are runs of word characters and hyphens. Inside a character class "|" is a
# literal pipe (not alternation), so it has been dropped from the pattern.
tokenizer = RegexpTokenizer(r'[\w-]+')
sent = "I'm appaled that they would refuse to permit us to obtain the state-of-the-art refuse permit"
tokens = tokenizer.tokenize(sent)

# Peek at the first token, lower-cased.
if tokens:
    print(tokens[0].lower())

# Count every (term, POS tag) pair, then group the tag counts by term so that words
# used with several parts of speech (e.g. "refuse", "permit") show all of their tags.
tag_counts = Counter(nltk.pos_tag(tokens))
tags_by_term = defaultdict(list)
for (term, pos), count in tag_counts.items():
    tags_by_term[term].append((pos, count))
tags_by_term

In [None]:
# Convert the flat label vector into a cluster count and a collection of clusters.
n_cl, clusters = transform_labels(labels)
# Pair each cluster with the corresponding entry of `indices` and list the pairs
# from the largest cluster to the smallest.
sorted(zip(clusters, indices), key = lambda tup: len(tup[0]), reverse=True)

In [None]:
# Display the clusters obtained from transform_labels.
clusters

In [None]:
# Per-sample silhouette values; metric='precomputed' means dM2 is used directly as the
# pairwise distance matrix.
# NOTE(review): silhouette_samples is not imported explicitly in this notebook; it
# presumably comes in through one of the star imports — confirm.
silh = silhouette_samples(dM2, labels, metric='precomputed')
# For every cluster: (silhouette values of its members, the member with the highest value).
silh_cl = [(silh[clust], clust[np.argmax(silh[clust])]) for clust in clusters]
silh_cl

In [None]:
from sklearn.metrics import silhouette_score
# Overall silhouette score of the current labelling, with dM2 as precomputed distances.
silhouette_score(dM2, labels, metric='precomputed')

In [None]:
# Run summarization on the TF-IDF dissimilarities and current labels, with its default options.
summarization(dM2, labels)

In [None]:
# Rebuild the clusters from the labels (only the second result of transform_labels is kept) ...
clusters = transform_labels(labels)[1]
# ... and count how many of them have more than three members.
len([c for c in clusters if len(c) > 3])

In [None]:
def summarization_function(d, embed_method, clustering_algorithm, remove_outliers, find_subtopics, evaluate=None, linkage=None, ignore_medoid_centers=False):
    match embed_method:
        case 'tfidf':
            dM = tf_idf_compute_dissimilarity_matrix(d, I)
        case 'bert':
            dM = bert_compute_dissimilarity_matrix(d, file_path=sentence_embeddings_path)
    kmax=len(dM)//2

    n_clust, (labels, cluster_centers) = sentence_clustering(dM, algorithm=clustering_algorithm, kmax=kmax, linkage=linkage, evaluate=evaluate)

    cluster_centers = (not ignore_medoid_centers and cluster_centers) or None    
    return summarization(dM, labels, remove_outliers=remove_outliers, find_subtopics=find_subtopics, cluster_centers=cluster_centers)

In [None]:
# Experiment configuration: TF-IDF dissimilarities, average-linkage agglomerative
# clustering scored by silhouette, outlier removal on, sub-topics off, centers ignored.
embed_method, clustering_algorithm = 'tfidf', 'agglomerative'
remove_outliers, find_subtopics = True, False


def evaluate(dm, labs):
    """Score a labelling by its silhouette on the precomputed dissimilarity matrix."""
    return silhouette_score(dm, labs, metric='precomputed')


linkage, ignore_medoid_centers = 'average', True
args = (embed_method, clustering_algorithm, remove_outliers, find_subtopics, evaluate, linkage, ignore_medoid_centers)


def summarize(d):
    """Summarize document `d` with the configuration captured in `args`."""
    return summarization_function(d, *args)


all_accs = summary_compute(article_file_paths_by_cat, summarize, map_path_to_articleID)

In [None]:
# Run the evaluation routine imported from G11_code.evaluation (called without arguments).
evaluation()

In [None]:
# Re-cluster the TF-IDF dissimilarities with k-medoids; this overwrites n_clust and labels.
# NOTE(review): linkage='complete' is passed along with k-medoids — presumably ignored
# by that algorithm; confirm in sentence_clustering.
n_clust, (labels,_) = sentence_clustering(dM2, algorithm='k-medoids', linkage='complete', kmax=len(dM2)//2)

In [None]:
# Extract keywords for document 0 from the current cluster labels and the index.
# The literal 0 equals the value of `d` set earlier (d=0), which the labels were computed for.
res = keyword_extraction(0, labels, I)
res

## Finding the parameters for the sim2diss func via regression (just for fun)

In [None]:
# For every article, collect the flattened TF-IDF matrix entries with the identity as
# conversion function, i.e. the values before any similarity -> dissimilarity conversion.
ss = [tf_idf_compute_dissimilarity_matrix(d_i, I, conversion_function=lambda S: S).compressed() for d_i in tqdm(range(len(articles)))]

In [None]:
# For every article, collect the natural log of the flattened embedding-based dissimilarities.
# Note that `ds` therefore holds LOG-dissimilarities, not raw dissimilarities.
ds = [np.log(bert_compute_dissimilarity_matrix(d_i, file_path=sentence_embeddings_path).compressed()) for d_i in tqdm(range(len(articles)))]

In [None]:
# Document ids excluded from the fit.
# NOTE(review): presumably documents whose values are degenerate (empty or non-finite) — confirm.
fail = set([761, 1182, 1757])
# Per-document mean of the TF-IDF values (regressor) and of the log-dissimilarities (target).
# This rebinds the notebook-level names `a` and `b`.
a1 = np.array([np.mean(sims) for i,sims in enumerate(ss) if i not in fail])
b = np.array([np.mean(diss) for i,diss in enumerate(ds) if i not in fail])
# Column of ones for the intercept; solve b ≈ a @ [slope, intercept] by least squares.
a2 = np.ones_like(a1)
a = np.c_[a1, a2]
x = np.linalg.lstsq(a,b,rcond=None)

In [None]:
# Display the full result tuple of np.linalg.lstsq (solution, residuals, rank, singular values).
x

In [None]:
# Map the linear fit log(d) = slope*s + intercept back onto d = exp(-k*(s + b)):
# k = -slope and b = -intercept / k.
fst = -x[0][0]
snd = -x[0][1]/fst
fst,snd

In [None]:
import scipy.optimize

# Pool the per-document values of all non-excluded documents into flat training vectors.
s_train = np.array(flatten([sims for i,sims in enumerate(ss) if i not in fail]))
# `ds` stores LOG-dissimilarities, while `fun` predicts raw dissimilarities via exp(...).
# Undo the log so that prediction and target are on the same scale.
d_train = np.exp(np.array(flatten([diss for i,diss in enumerate(ds) if i not in fail])))


def fun(x, s, d):
    """Residuals of the model d ≈ exp(-x[0] * (s + x[1])) for similarities s and dissimilarities d."""
    return np.exp(-x[0]*(s+x[1])) - d


# Non-linear least-squares fit of (k, b), starting from (1.5, 1.5).
res = scipy.optimize.least_squares(fun, np.array([1.5,1.5]), args=(s_train,d_train))
res.x

## Supervised

In [None]:
from random import random
from numpy import array
from numpy import cumsum
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed

# create a sequence classification instance
def get_sequence(n_timesteps):
    """Generate one random sequence-labelling example.

    Draws `n_timesteps` uniform values in [0, 1). A step is labelled 1 once the running
    sum of the values has reached n_timesteps / 4, and 0 before that.

    Returns:
        (X, y), both reshaped to (1, n_timesteps, 1) as expected by the LSTM.
    """
    samples = array([random() for _ in range(n_timesteps)])
    threshold = n_timesteps / 4.0
    running_totals = cumsum(samples)
    labels = array([int(total >= threshold) for total in running_totals])
    return samples.reshape(1, n_timesteps, 1), labels.reshape(1, n_timesteps, 1)

# define problem properties
n_timesteps = 10
# define LSTM: one recurrent layer that emits an output per time step, followed by a
# per-step sigmoid unit for the binary label
model = Sequential()
model.add(LSTM(20, input_shape=(None, 1), return_sequences=True))
model.add(TimeDistributed(Dense(1, activation='sigmoid')))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# train LSTM
for epoch in range(1000):
    # generate new random sequence
    X, y = get_sequence(n_timesteps)
    # fit model for one epoch on this sequence
    model.fit(X, y, epochs=1, batch_size=1, verbose=2)
# evaluate LSTM
X, y = get_sequence(n_timesteps)
# Sequential.predict_classes was removed from Keras; for a sigmoid output the class is
# obtained by thresholding the predicted probability at 0.5.
yhat = (model.predict(X, verbose=0) > 0.5).astype('int32')
for i in range(n_timesteps):
    print('Expected:', y[0, i], 'Predicted', yhat[0, i])

In [None]:

from random import random
import tqdm
import numpy.random as rnd 
from numpy import array
from numpy import cumsum
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import Bidirectional

In [None]:
import tensorflow as tf
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)


In [None]:
# Besides showing the version, this import binds the name `keras`, which the
# SimpleRNN model cell further down uses (keras.Sequential, keras.layers).
import keras
keras.__version__

In [None]:
# create a sequence classification instance
def get_sequence(n_timesteps):
    """Generate one random two-feature sequence-labelling example.

    Each step holds two uniform values in [0, 1). A step is labelled 1 once the running
    sum of the FIRST feature has reached n_timesteps / 4, and 0 before that; the second
    feature does not influence the label.

    Returns:
        (X, y) with X of shape (1, n_timesteps, 2) and y of shape (1, n_timesteps, 1).
    """
    features = array([(random(), random()) for _ in range(n_timesteps)])
    threshold = n_timesteps / 4.0
    running_totals = cumsum(features, axis=0)
    targets = array([int(row[0] >= threshold) for row in running_totals])
    return features.reshape(1, n_timesteps, 2), targets.reshape(1, n_timesteps, 1)

# define LSTM: bidirectional recurrent layer over two features per step, followed by a
# per-step sigmoid unit for the binary label
model = Sequential()
model.add(Bidirectional(LSTM(20, return_sequences=True), input_shape=(None, 2)))
model.add(TimeDistributed(Dense(1, activation='sigmoid')))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print() only
# added a stray "None" line to the output.
model.summary()
# train LSTM
for epoch in range(500):
    # the sequence length changes every iteration; randint's upper bound is exclusive,
    # so lengths range from 2 to 19
    n_timesteps = rnd.randint(2, 20)
    # generate new random sequence
    X, y = get_sequence(n_timesteps)
    print(X.shape, y.shape)
    # fit model for one epoch on this sequence
    model.fit(X, y, epochs=1, batch_size=1, verbose=2)

In [None]:
# Stacked SimpleRNN sequence labeller for variable-length inputs with two features per step.
model = keras.Sequential([
    keras.layers.SimpleRNN(input_shape=(None, 2), units=50, return_sequences=True),
    keras.layers.Dropout(0.2),
    keras.layers.SimpleRNN(units=50, return_sequences=True),
    keras.layers.Dropout(0.2),
    keras.layers.SimpleRNN(units=50, return_sequences=True),
    keras.layers.Dropout(0.2),
    # No Flatten layer: with an undefined time dimension (input_shape=(None, 2)) flattening
    # leaves Dense with an unknown last dimension, which Keras rejects when building the
    # model. Dense acts on the last axis, so the layers below are applied per time step and
    # the output has shape (batch, timesteps, 1), matching the targets from get_sequence.
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.build()