<H3>PRI 2023/24: second
    project delivery</H3>

**GROUP 11**
- Francisco Martins, 99068
- Tunahan Güneş, 108108
- Sebastian Weidinger, 111612

# Necessary modules

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
from G11_code.data_collection import *
from G11_code.helper_functions import *
from G11_code.indexing import *
from G11_code.evaluation import *
from G11_code.clustering import *
from G11_code.supervised_classification import *

# Data Preparation

In [None]:
# Locate the corpus relative to the notebook: articles and reference summaries
# live in sibling folders under the same dataset root.
dataset_root = os.path.join("..", "BBC News Summary", "BBC News Summary")
article_path = os.path.join(dataset_root, "News Articles")
summary_path = os.path.join(dataset_root, "Summaries")
print("Article path:", article_path)
print("Summary path:", summary_path)

# read_files returns five values; the underscore-prefixed ones are the raw,
# unfiltered collections that later cells clean up.
(
    _article_file_paths_by_cat,
    _articles_by_cat,
    _summary_file_paths_by_cat,
    _summaries_by_cat,
    category_names,
) = read_files(article_path, summary_path)

In [None]:
# Sanity check: show element [0][0] of the raw summary-path collection returned by read_files.
_summary_file_paths_by_cat[0][0]

In [None]:
# Match summaries against their articles. The second return value lists the entries
# that later cells drop via remove_entries.
# NOTE(review): presumably each id is a (category index, article index) pair - confirm in get_summary_sentence_indices.
_summary_sentence_indices_by_cat, faulty_summary_ids = get_summary_sentence_indices(_articles_by_cat, _summaries_by_cat)

In [None]:
# Display the ids flagged by get_summary_sentence_indices.
faulty_summary_ids

[(1, 247), (1, 267), (1, 351), (3, 110), (3, 138)]

In [None]:
# Show the summary-sentence indices stored at position [0][0] of the raw (unfiltered) structure.
_summary_sentence_indices_by_cat[0][0]

[0, 2, 3, 6, 7, 10, 12, 14]

In [None]:
# Step 1: drop every entry listed in faulty_summary_ids from each raw per-category collection.
articles_by_cat = remove_entries(_articles_by_cat, faulty_summary_ids)
article_file_paths_by_cat = remove_entries(_article_file_paths_by_cat, faulty_summary_ids)
summaries_by_cat = remove_entries(_summaries_by_cat, faulty_summary_ids)
summary_file_paths_by_cat = remove_entries(_summary_file_paths_by_cat, faulty_summary_ids)
summary_sentence_indices_by_cat = remove_entries(_summary_sentence_indices_by_cat, faulty_summary_ids)

# Step 2: build the flat counterparts that the rest of the notebook indexes by position.
articles = flatten(articles_by_cat)
article_file_paths = flatten(article_file_paths_by_cat)
summaries = flatten(summaries_by_cat)
summary_file_paths = flatten(summary_file_paths_by_cat)
summary_sentence_indices = flatten(summary_sentence_indices_by_cat)

In [None]:
# Show the summary-sentence indices of the first entry of the flattened, filtered list.
summary_sentence_indices[0]

[0, 2, 3, 6, 7, 10, 12, 14]

In [None]:
# Lookup table: normalized article file path -> position in `article_file_paths`.
# Keys are normalized here because the lookup normalizes its argument; storing raw
# keys would make any non-normalized stored path unreachable.
dict_path_to_articleID = {os.path.normpath(path): i for i, path in enumerate(article_file_paths)}


def map_path_to_articleID(path):
    """Return the position of `path` in `article_file_paths`.

    The path is normalized with os.path.normpath before the lookup.

    Returns:
        The integer position, or None when the path is not a known article file.
    """
    return dict_path_to_articleID.get(os.path.normpath(path))

## Get BERT embeddings

In [None]:
# Pretrained checkpoint shared by the tokenizer and the model.
pretrained_weights = 'distilbert-base-uncased'

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

bert_tokenizer = DistilBertTokenizer.from_pretrained(pretrained_weights, do_lower_case=True)
# Module.to() moves the parameters and returns the same module object.
bert_model = DistilBertModel.from_pretrained(pretrained_weights).to(device)

# Bundle passed to the BERT-based helpers in later cells.
bert_params = (bert_tokenizer, bert_model, device)

In [None]:
# Compute and store sentence and document embeddings, unless the cache folder
# already contains files from a previous run.
path = './embeddings'
# os.listdir raises FileNotFoundError on a missing folder, so create it first
# (a fresh checkout has no cache folder yet).
os.makedirs(path, exist_ok=True)
if not os.listdir(path):
    save_embeddings(articles, bert_tokenizer, bert_model, device, path)
else:
    print(f'Files exist in folder {path}')

In [None]:
# Read the cached embeddings back from disk.
# Pickle files execute code on load: only load files produced by this notebook.
embeddings_dir = './embeddings'
sentence_embeddings_path = os.path.join(embeddings_dir, 'sentence_embeddings.pkl')
document_embeddings_path = os.path.join(embeddings_dir, 'document_embeddings.pkl')

sentence_embeddings_by_cat = pickle_load(sentence_embeddings_path)
document_embeddings_by_cat = pickle_load(document_embeddings_path)

# Options

In [None]:
# Example article for the notebook. Note that `path` rebinds the name previously
# used for the embeddings folder.
path = os.path.join(article_path, 'tech', '199.txt')
# map_path_to_articleID uses dict.get, so `d` is None when the path is not a known article file.
# NOTE(review): later loops use `d` as their loop variable and overwrite this value.
d = map_path_to_articleID(path)
# Selects the branch taken by the `match compute_index` cell (0, 1 or 2).
compute_index = 1

In [None]:
match compute_index:
    case 0 :
        I = InvertedIndex(0,0)
    case 1:
        index_path = './index/Index.pkl'
        I = indexing(None, index_path = index_path)
    case 2:
        I = indexing(articles)

# Checking the ratio: size of article/size of summary

In [None]:
# For every article: number of sentences in the article divided by the number of
# sentences selected for its summary.
ratios = np.array([
    len(nltk.sent_tokenize(article)) / len(summary_sentence_indices[i])
    for i, article in enumerate(articles)
])
plt.boxplot(ratios)
# Trailing None keeps the cell from echoing the dict returned by boxplot.
None

# Comparing sentence-similarity behavior of BERT and TF-IDF

In [None]:
# For a handful of articles, print the sentence pair with the smallest BERT dissimilarity.
# NOTE(review): the loop variable `d` rebinds the notebook-level `d` set in the Options
# cell; after this cell `d` is 910.
# NOTE(review): argmin runs over the whole matrix. If the diagonal holds zero
# self-dissimilarities, the reported pair is a sentence with itself - TODO confirm.
bert_params = (bert_tokenizer, bert_model, device)
for d in (0, 10, 600, 610, 900, 910):
    dissimilarity_matrix_bert = bert_compute_dissimilarity_matrix(d, articles, bert_params)
    tokenized_article = nltk.sent_tokenize(articles[d])
    num_sent = len(tokenized_article)
    # argmin returns a flat index; split it into (row, column) for a num_sent-wide matrix.
    index_closest_bert = np.argmin(dissimilarity_matrix_bert)
    sent1, sent2 = divmod(index_closest_bert, num_sent)
    for sentence in (tokenized_article[sent1], tokenized_article[sent2]):
        print(sentence)
    print(dissimilarity_matrix_bert[sent1, sent2])
    print('---')

In [None]:
# For the same articles, print the sentence pair with the smallest TF-IDF dissimilarity.
# NOTE(review): the loop variable `d` rebinds the notebook-level `d` set in the Options
# cell; after this cell `d` is 910.
# NOTE(review): argmin runs over the whole matrix. If the diagonal holds zero
# self-dissimilarities, the reported pair is a sentence with itself - TODO confirm.
for d in (0, 10, 600, 610, 900, 910):
    dissimilarity_matrix_tfidf = tf_idf_compute_dissimilarity_matrix(d, I)
    tokenized_article = nltk.sent_tokenize(articles[d])
    num_sent = len(tokenized_article)
    # argmin returns a flat index; split it into (row, column) for a num_sent-wide matrix.
    index_closest_tfidf = np.argmin(dissimilarity_matrix_tfidf)
    sent1, sent2 = divmod(index_closest_tfidf, num_sent)
    for sentence in (tokenized_article[sent1], tokenized_article[sent2]):
        print(sentence)
    print(dissimilarity_matrix_tfidf[sent1, sent2])
    print('---')

# Training

In [None]:
from G11_code.training import *

# Split sentence embeddings and summary-sentence indices into train and test parts.
(
    train_emb_by_cat,
    test_emb_by_cat,
    train_ind_by_cat,
    test_ind_by_cat,
) = split_by_cat(sentence_embeddings_by_cat, summary_sentence_indices_by_cat)

# get_XY yields an (X, Y) pair per split; both halves are flattened right away.
X_train, Y_train = (flatten(part) for part in get_XY(train_emb_by_cat, train_ind_by_cat))
X_test, Y_test = (flatten(part) for part in get_XY(test_emb_by_cat, test_ind_by_cat))

In [None]:
# Number of PCA components: passed to fit_PCA, training and supervised_evaluation in later cells.
n_components = 200

Number of Articles in 'tech' Category: 510
Number of Articles in 'entertainment' Category: 386
Number of Articles in 'sport' Category: 417
Number of Articles in 'business' Category: 511
Number of Articles in 'politics' Category: 401

In [None]:
# Visualize documents by category: fit one 2-D UMAP projection on all document
# embeddings, then scatter each category's documents under its own label.
categories = ['tech', 'entertainment', 'sport', 'business', 'politics']
all_doc_emb = flatten(flatten(document_embeddings_by_cat))
reducer = fit_UMAP(all_doc_emb, n_components=2)
# strict=True raises if the hard-coded label list and the data disagree in length,
# instead of silently dropping a category.
for cat_name, cat_embeddings in zip(categories, document_embeddings_by_cat, strict=True):
    samples = np.array(flatten(cat_embeddings))
    # Dedicated name: the projection is not a training matrix, and the PCA cell
    # owns the name X_train_trans.
    doc_emb_2d = transform_UMAP(reducer, samples)
    plt.scatter(doc_emb_2d[:, 0], doc_emb_2d[:, 1], label=cat_name)
# Legend entries come from the per-scatter labels set above.
plt.legend()
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.show()

In [None]:
# Fit PCA on the training sentences only, then apply the same fitted transform to both splits.
pca = fit_PCA(X_train, n_components=n_components)
X_train_trans = transform_PCA(pca, X_train)
# X_test_trans is the matrix the prediction cells below feed to the classifiers.
X_test_trans = transform_PCA(pca, X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Train the XGBoost classifier through the project's training() helper with PCA enabled.
# NOTE(review): training() receives the per-category structures, not the flattened
# X_train/Y_train - presumably it builds and reduces its own matrices; confirm.
xgb_model = training(train_emb_by_cat, train_ind_by_cat, model_name="XGBoost", use_pca=True, n_components=n_components)

              precision    recall  f1-score   support

           0       0.62      0.75      0.68      4970
           1       0.52      0.37      0.43      3609

    accuracy                           0.59      8579
   macro avg       0.57      0.56      0.55      8579
weighted avg       0.58      0.59      0.57      8579

In [None]:
# Sentence-level predictions on the PCA-reduced test matrix.
predictions = xgb_model.predict(X_test_trans)
# `accuracy` is computed but never displayed in this cell; the report below includes accuracy as well.
accuracy = accuracy_score(Y_test, predictions)
print(classification_report(Y_test, predictions))

In [None]:
# Evaluate the XGBoost model with the project's supervised_evaluation helper.
precision, recall, auc = supervised_evaluation(
    test_emb_by_cat,
    test_ind_by_cat,
    xgb_model,
    model_name="XGBoost",
    use_pca=True,
    X_train=X_train,
    n_components=n_components,
)
print(f'Classifier XGBoost - Precision: {precision} | Recall: {recall} | AUC: {auc}')

In [None]:
# NOTE(review): called without arguments here, while later cells pass d, M, p, l, o,
# x_test and I. Unless every parameter has a default this raises TypeError - confirm
# against the definition or remove this cell.
supervised_summarization()

In [None]:
# Reuse a previously saved NN when the model folder has content, otherwise train one.
nn_model_dir = './nn_model'
# os.listdir raises FileNotFoundError on a missing folder; treat "missing" the same
# as "empty" so a fresh checkout falls through to training.
if os.path.isdir(nn_model_dir) and os.listdir(nn_model_dir):
    nn_model = keras.models.load_model('./nn_model/final_model')
    nn_model.load_weights('./nn_model/final_model')
else:
    nn_model = training(train_emb_by_cat, train_ind_by_cat, model_name="NN", use_pca=True, n_components=n_components)
# Round the model outputs to hard 0/1 labels before scoring.
predictions = np.round(nn_model.predict(X_test_trans)).astype(int)
print(classification_report(Y_test, predictions))

In [None]:
# Evaluate the NN model with the project's supervised_evaluation helper.
precision, recall, auc = supervised_evaluation(
    test_emb_by_cat,
    test_ind_by_cat,
    nn_model,
    model_name="NN",
    use_pca=True,
    X_train=X_train,
    n_components=n_components,
)
print(f'Classifier NN - Precision: {precision} | Recall: {recall} | AUC: {auc}')

In [None]:
# Persist the NN (full model plus weights) the first time it is trained.
# Create the folder first: os.listdir raises FileNotFoundError when it does not exist.
os.makedirs('./nn_model', exist_ok=True)
if not os.listdir('./nn_model'):
    nn_model.save('./nn_model/final_model', save_format='tf')
    nn_model.save_weights('./nn_model/final_model')

In [None]:
# Reuse a previously saved LSTM when the model folder has content, otherwise train one.
lstm_model_dir = './lstm_model'
# os.listdir raises FileNotFoundError on a missing folder; treat "missing" the same
# as "empty" so a fresh checkout falls through to training.
if os.path.isdir(lstm_model_dir) and os.listdir(lstm_model_dir):
    # load_model needs a full saved model at this path, not just a weights checkpoint.
    lstm_model = keras.models.load_model('./lstm_model/final_model')
    lstm_model.load_weights('./lstm_model/final_model')
else:
    lstm_model = training(train_emb_by_cat, train_ind_by_cat, model_name="LSTM", use_pca=True, n_components=n_components)

In [None]:
# Persist the LSTM the first time it is trained.
# Save the full model as well as the weights: the loading cell calls
# keras.models.load_model on this path, and a weights-only checkpoint cannot be
# loaded that way. Once this cell has written anything, the folder is no longer
# empty, so a later "save if empty" cell will not run.
os.makedirs('./lstm_model', exist_ok=True)
if not os.listdir('./lstm_model'):
    lstm_model.save('./lstm_model/final_model', save_format='tf')
    lstm_model.save_weights('./lstm_model/final_model')

In [None]:
# Evaluate the LSTM model with the project's supervised_evaluation helper.
precision, recall, auc = supervised_evaluation(
    test_emb_by_cat,
    test_ind_by_cat,
    lstm_model,
    model_name="LSTM",
    use_pca=True,
    X_train=X_train,
    n_components=n_components,
)
print(f'Classifier LSTM - Precision: {precision} | Recall: {recall} | AUC: {auc}')

In [None]:
# Save the full LSTM model and its weights, but only while './lstm_model' is empty.
# NOTE(review): an earlier cell already writes into './lstm_model' when it is empty,
# so by the time this cell runs the folder is normally non-empty and this body is skipped.
if not os.listdir('./lstm_model'):
    lstm_model.save('./lstm_model/final_model', save_format='tf')
    lstm_model.save_weights('./lstm_model/final_model')

In [None]:
# Disabled code kept as a string literal (nothing in it runs): a manual
# per-article LSTM evaluation. Remove it, or restore it as real code, before sharing.
'''
X_test, y_test = get_XY(test_emb_by_cat, test_ind_by_cat)
X_test_trans = [transform_PCA(pca, x) for x in X_test]
all_predictions = list()
all_ytest = np.array(flatten(y_test))
for X, y in zip(X_test_trans, y_test): 
    #X = np.array(transform_PCA(pca, X))
    X = np.array(X)
    X = np.expand_dims(X, axis=0)
    y = np.array(y)
    predictions = lstm_model.predict(X, verbose=0)
    predictions = np.round(predictions.squeeze()).astype(int)
    all_predictions.extend(predictions)
precision = sklearn.metrics.precision_score(all_ytest, all_predictions)
recall = sklearn.metrics.recall_score(all_ytest, all_predictions)
fpr, tpr, thresholds = sklearn.metrics.roc_curve(all_ytest, all_predictions)  
auc = sklearn.metrics.auc(fpr, tpr)
   
print(precision)
print(recall)
print(auc)
print(classification_report(all_ytest, all_predictions))
'''

In [None]:
# Disabled code kept as a string literal (nothing in it runs): an inline
# definition of a stacked bidirectional LSTM model.
'''
from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed, Dense, Bidirectional, Dropout
from keras.metrics import Recall

LSTM_units = 32

model = Sequential()
model.add(Bidirectional(LSTM(units=LSTM_units, 
               input_shape=(None, n_components), 
               return_sequences=True)))
model.add(Dropout(0.1))
model.add(Bidirectional(LSTM(units=LSTM_units, return_sequences=True)))
model.add(Dropout(0.1))
model.add(Bidirectional(LSTM(units=LSTM_units, return_sequences=True)))
model.add(Dropout(0.1))
model.add(TimeDistributed(Dense(1, activation='sigmoid')))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["AUC"])
model.build(input_shape=(None, None, n_components))
print(model.summary())
'''

In [None]:
# Disabled code kept as a string literal (nothing in it runs): grouping of
# training sequences by length.
'''
from itertools import groupby
X_train, Y_train = get_XY(train_emb_by_cat, train_ind_by_cat)
X_train_trans = [transform_PCA(pca, x) for x in X_train]
Y_train = [y for _, y in sorted(zip(X_train_trans, Y_train), key=lambda x: len(x[0]))]
X_train_trans.sort(key=len)
X_train_groups = [list(g) for k, g in groupby(X_train_trans, key=len)]
Y_train_groups = [list(g) for k, g in groupby(Y_train, key=len)]
'''

In [None]:
# Disabled code kept as a string literal (nothing in it runs): a manual
# epoch loop that trains on the length groups and reports on sampled test articles.
'''
X_test, y_test = get_XY(test_emb_by_cat, test_ind_by_cat)
X_test_trans = [transform_PCA(pca, x) for x in X_test]

y_t = list()
all_predictions = list()
n = 100  # for 2 random indices
index = np.random.choice(len(X_test_trans), n, replace=False)
for k in index: 
    y_t.extend(y_test[k])

epochs = 100
for i in range(epochs):
    for X, y in zip(X_train_groups, Y_train_groups):
        X = np.array(X)
        X = X.reshape(len(X), len(X[0]), n_components)
        y = np.array(y)
        y = y.reshape(len(y), len(y[0]), 1)
        history = model.fit(X, y, epochs=1, batch_size=32, verbose=0, shuffle=True)
    avg_loss = np.mean(history.history['loss'][-100:])
    avg_auc = np.mean(history.history['auc'][-100:])

    all_predictions = list()
    for k in index: 
        x = X_test_trans[k]
        x = np.expand_dims(x, axis=0)
        predictions = model.predict(x, verbose=0)
        predictions = np.round(predictions.squeeze()).astype(int)
        all_predictions.extend(predictions)
    print(classification_report(y_t, all_predictions))
    print(f"Loss:{avg_loss} | AUC:{avg_auc}")
'''

# Main facilities

### Part I: clustering

#### Comparing the different dissimilarities

In [None]:
# TF-IDF dissimilarity matrix of article `d`, shown as a heat map.
# NOTE(review): `d` is whatever the most recently run cell assigned; the earlier
# closest-sentence loops use `d` as loop variable and leave it at 910.
dissimilarity_matrix_tfidf = tf_idf_compute_dissimilarity_matrix(d, I)
plt.matshow(dissimilarity_matrix_tfidf)
plt.colorbar()

In [None]:
# BERT dissimilarity matrix of article `d`, shown as a heat map.
# Recompute it for the current `d` rather than plotting whatever matrix an earlier
# loop happened to leave behind, so this plot always describes the same article
# as the TF-IDF plot.
dissimilarity_matrix_bert = bert_compute_dissimilarity_matrix(d, articles, bert_params)
plt.matshow(dissimilarity_matrix_bert)
plt.colorbar()

#### *A) Clustering*

In [None]:
# Cluster the sentences of article `d` twice: once from the BERT dissimilarity
# matrix and once from a TF-IDF matrix built with a custom similarity-to-dissimilarity
# conversion.
b = 0.5
k = 1
# NOTE(review): other cells call this helper as (d, articles, bert_params); here only
# `d` and file_path are passed - confirm the signature supports this form.
dM = bert_compute_dissimilarity_matrix(d, file_path=sentence_embeddings_path)


# Candidate conversions from a similarity S to a dissimilarity. They read the
# notebook-level `b` and `k` at call time.
def sim2diss1(S):
    """Exponential decay: exp(-k * (S + b))."""
    # The original wrote `k(S+b)`, which calls the integer k and raises TypeError.
    return np.exp(-k * (S + b))


def sim2diss2(S):
    """Scaled arc-cosine: (2/pi) * arccos((1 - b) * S + b)."""
    return (2 / np.pi) * np.arccos((1 - b) * S + b)


def sim2diss3(S):
    """Logarithmic: b * (1 - log(1 + k*S) / log(1 + k))."""
    return b * (1 - np.log(1 + k * S) / np.log(1 + k))


dM2 = tf_idf_compute_dissimilarity_matrix(d, I, conversion_function=sim2diss2)

# Complete-linkage agglomerative clustering, trying at most half as many clusters as sentences.
n_clust, labels = sentence_clustering(dM, algorithm='agglomerative', linkage='complete', kmax=len(dM)//2)
print(n_clust, len(labels), labels)
n_clust, labels = sentence_clustering(dM2, algorithm='agglomerative', linkage='complete', kmax=len(dM2)//2)
print(n_clust, len(labels), labels)

#### *B) Summarization*

In [None]:
#code, statistics and/or charts here

#### *C) Keyword extraction*

In [None]:
#code, statistics and/or charts here

#### *D) Evaluation*

In [None]:
#code, statistics and/or charts here

### Part II: classification

#### *A) Feature extraction*

In [None]:
# Rebuild the cleaned corpus from disk so that Part II can run on its own.
dataset_root = os.path.join("..", "BBC News Summary", "BBC News Summary")
article_path = os.path.join(dataset_root, "News Articles")
summary_path = os.path.join(dataset_root, "Summaries")

(
    _article_file_paths_by_cat,
    _articles_by_cat,
    _summary_file_paths_by_cat,
    _summaries_by_cat,
    category_names,
) = read_files(article_path, summary_path)
_summary_sentence_indices_by_cat, faulty_summary_ids = get_summary_sentence_indices(_articles_by_cat, _summaries_by_cat)

# Drop the flagged entries from every per-category collection.
articles_by_cat = remove_entries(_articles_by_cat, faulty_summary_ids)
article_file_paths_by_cat = remove_entries(_article_file_paths_by_cat, faulty_summary_ids)
summaries_by_cat = remove_entries(_summaries_by_cat, faulty_summary_ids)
summary_file_paths_by_cat = remove_entries(_summary_file_paths_by_cat, faulty_summary_ids)
summary_sentence_indices_by_cat = remove_entries(_summary_sentence_indices_by_cat, faulty_summary_ids)

# Flat counterparts indexed by position.
articles = flatten(articles_by_cat)
article_file_paths = flatten(article_file_paths_by_cat)
summaries = flatten(summaries_by_cat)
summary_file_paths = flatten(summary_file_paths_by_cat)
summary_sentence_indices = flatten(summary_sentence_indices_by_cat)

In [None]:
# Read the cached embeddings and build flat versions of both collections.
# Pickle files execute code on load: only load files produced by this notebook.
embeddings_dir = './embeddings'
sentence_embeddings_path = os.path.join(embeddings_dir, 'sentence_embeddings.pkl')
document_embeddings_path = os.path.join(embeddings_dir, 'document_embeddings.pkl')

sentence_embeddings_by_cat = pickle_load(sentence_embeddings_path)
document_embeddings_by_cat = pickle_load(document_embeddings_path)

sentence_embeddings = flatten(sentence_embeddings_by_cat)
document_embeddings = flatten(document_embeddings_by_cat)

In [None]:
# Build the feature frames and split them into train and test parts.
doc_ids_by_cat = generate_doc_ids_cat()
X_train, y_train, X_test, y_test = construct_df_and_split(
    doc_ids_by_cat=doc_ids_by_cat,
    summary_sentence_indices_by_cat=summary_sentence_indices_by_cat,
    sent_embeddings=sentence_embeddings,
    doc_embeddings=document_embeddings,
    article_file_paths=article_file_paths,
    articles=articles,
    train_size=0.8,
    k=0.2,
    b=0.75,
    p_keywords=10,
)

# Persist the split: a later cell reads exactly these four files, so leaving the
# writes commented out makes that cell fail, or load stale data, on a fresh run.
os.makedirs('./dataframes', exist_ok=True)
X_train.to_csv('./dataframes/X_train.csv', index=False)
np.save('./dataframes/y_train.npy', y_train)
X_test.to_csv('./dataframes/X_test.csv', index=False)
np.save('./dataframes/y_test.npy', y_test)

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
from G11_code.data_collection import *
from G11_code.helper_functions import *
from G11_code.indexing import *
from G11_code.evaluation import *
from G11_code.clustering import *
from G11_code.supervised_classification import *

In [None]:
# Load the persisted train/test split from ./dataframes. This replaces any
# X_train / y_train / X_test / y_test already in memory.
# NOTE(review): these files must exist beforehand; in the feature-extraction cell
# the lines that write them are commented out.
X_train = pd.read_csv('./dataframes/X_train.csv')
y_train = np.load('./dataframes/y_train.npy')
X_test = pd.read_csv('./dataframes/X_test.csv')
y_test = np.load('./dataframes/y_test.npy')

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Train XGBoost on the extracted features; the first column of each frame is excluded.
# NOTE(review): presumably the first column is an identifier rather than a feature - confirm.
train_features = X_train.iloc[:, 1:]
model = training(train_features, y_train, model_name="XGBoost", use_extracted_features=True)

test_features = X_test.iloc[:, 1:]
predictions = model.predict(test_features)

accuracy = accuracy_score(y_test, predictions)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.76      0.80      0.78      4958
           1       0.71      0.66      0.68      3602

    accuracy                           0.74      8560
   macro avg       0.74      0.73      0.73      8560
weighted avg       0.74      0.74      0.74      8560

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Train the NN on the extracted features; the first column of each frame is excluded.
# NOTE(review): the test split is also passed as the validation set. If training()
# uses it for early stopping or model selection, the scores below are optimistic.
model = training(X_train.iloc[:,1:], y_train, model_name="NN", use_extracted_features=True,use_val=True, X_val=X_test.iloc[:,1:], y_val=y_test)

# Round the model outputs to hard 0/1 integer labels and flatten them to one
# dimension. The original assigned the squeezed array to a misspelled name
# (`precitions`), so the flattening was silently discarded.
predictions = np.rint(model.predict(X_test.iloc[:,1:])).astype(int).ravel()
accuracy = accuracy_score(y_test, predictions)
print(classification_report(y_test, predictions))

In [None]:
# Example article summarized below with both approaches.
article_id = 486

I = indexing(articles) #Put the existing index. Only needed for char lengths of the sentences.
supervised_summarization(d=article_id, M=model, p=7, l=0, o='rel', x_test=X_test, I = I)

print("ORIGINAL DOCUMENT")
print(articles[article_id])
all_accs = summarization(d=article_id, p=7, l=1000, o="app", I_or_D=I, model='TF-IDF')

print("SUMMARY")
sentences = nltk.sent_tokenize(articles[article_id])
# all_accs maps a sentence position to its score; print each selected sentence with its score.
for sent_id in all_accs:
    score = all_accs[sent_id]
    print(f"{score:.2f}: {sentences[sent_id]}")

In [None]:
# Same supervised summarization call for a second article (id 2200), reusing `model`, `X_test` and `I`.
supervised_summarization(d=2200, M=model, p=7, l=0, o='rel', x_test=X_test, I = I)

In [None]:
# Inspect a single sentence (position 26) of article 486 after sentence tokenization.
ex = nltk.sent_tokenize(articles[486])
ex[26]

In [None]:
# Show which file on disk article 486 comes from.
article_file_paths[486]

#### *B) Classification*

In [None]:
#code here

#### *C) Ranking extension*

In [None]:
#code here

#### *D) Evaluation*

In [None]:
#code, statistics and/or charts here

## Question materials (optional)

<H3>Part I: clustering</H3>

**(1)** Does clustering-guided summarization alter the behavior and efficacy of the IR system?

In [None]:
#code, statistics and/or charts here

**(2)** How do sentence representations, clustering choices, and rank criteria impact summarization?

In [None]:
#code, statistics and/or charts here

**(3)** Are anchor sentences (capturing multiple topics) included? And less relevant outlier sentences excluded? 

In [None]:
#code, statistics and/or charts here

**(4)** Given a set of documents, plot the distribution of the number of keywords per document.
Are keywords generally dissimilar?

In [None]:
#code, statistics and/or charts here

<H3>END</H3>