In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# minimal packages to import
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import *
import string
import re
from tqdm import tqdm
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from nltk.tokenize import word_tokenize
import regex as re
import requests


# setting the filepath for the dataset, depending on where you saved them
scraping_courses = "/content/drive/MyDrive/MyDatasetFolder/your_file.csv"


In [5]:
# Load the dataframe
df_courses = pd.read_csv(scraping_courses)


In [6]:
# 2.0.0 Preprocess text: Stemming

df_courses['desc_lower'] = df_courses['Description'].str.lower() # Used lower because It didn't filter the word 'the' afterwords in the stop words

snowstem = snowball.SnowballStemmer('english')
df_courses['descr_stem'] = df_courses.desc_lower.apply(lambda row: [snowstem.stem(word) for word in row.split(' ')])
lst_snow = [snowstem.stem(word) for word in df_courses.loc[0,'desc_lower'].split(' ')]

In [7]:
# 2.0.0 Preprocess text: Stop Words

nltk.download('stopwords')
lst_stopwords = set(stopwords.words('english'))
df_courses['descr_clean'] = df_courses.desc_lower.apply(lambda row: [snowstem.stem(word) for word in row.split(' ') if not word.lower() in lst_stopwords])
df_courses.loc[0, ['descr_clean', 'descr_stem']]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


descr_clean    [3d, visualis, anim, play, role, mani, areas,,...
descr_stem     [3d, visualis, and, anim, play, a, role, in, m...
Name: 0, dtype: object

In [8]:
# 2.0.0 Preprocess text: Punctuation

nltk.download('punkt')
df_courses.loc[0, 'desc_lower']
words = nltk.word_tokenize(df_courses.loc[0, 'desc_lower'])
df_courses['descr_clean'] = df_courses.desc_lower.apply(lambda row: [snowstem.stem(word) for word in nltk.word_tokenize(row) if not word in lst_stopwords and word.isalnum()])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
# Where GBP is the base currency you want to use
# Store API Key in another file
url = 'https://v6.exchangerate-api.com/v6/ca6656c9b54065b55c44b78b/latest/GBP'

# Making our request
response = requests.get(url)
data_exchange = response.json()

with open('data_exchange.json', 'w') as json_file:
    json.dump(data_exchange, json_file)

# Read the JSON data from the file
with open('data_exchange.json', 'r') as json_file:
    loaded_data = json.load(json_file)

# Convert the 'result' column to a Pandas DataFrame
df = pd.DataFrame(loaded_data)

# Extract only the 'conversion_rates' column
conversion_rates = df['conversion_rates']


In [14]:
# 2.0.1 Preprocessing the fees column

# Create three different columns for the fees
df_courses['fees_float'] = 0
df_courses['currency'] = ''
df_courses['fees (£)'] = float(0)

# Defined a function to select the maximum fee (as stated in the exercise)
def max_value_fee(list_fees):
    filtered_fee_list = [int(fee) for fee in list_fees]
    if filtered_fee_list:
        max_fee = max(filtered_fee_list)
        max_fee = float(max_fee)
    else:
        max_fee = None
    return max_fee

for index, row in df_courses.iterrows():
    i = str(row['Fees'])
    no_punctuation = i.replace(".00 ", '').replace(".0 ", '').replace('.', '').replace(',', '').replace("'", '').replace(" ", '')
    matches_cur = re.findall(r'\p{Sc}|euro(?:s)?|dollar(?:s)?|pound(?:s)?|EUR(?:s)?|USD(?:s)?|CHF(?:s)?|SEK(?:s)?|ISK(?:s)?|RMB(?:s)?|QR(?:s)?|GBP(?:s)?|JPY(?:s)?', no_punctuation, flags=re.IGNORECASE)
    matches_fee = re.findall(r'(?!2021|2022|2023|2024)\d{4,}', no_punctuation)
    df_courses.at[index, 'fees_float'] = max_value_fee(matches_fee)
    if matches_cur:
      if matches_cur[0] == 'euro' or matches_cur[0] == 'euros' or matches_cur[0] == '€' or matches_cur[0] == 'EURs' or matches_cur[0] == 'Eur' or matches_cur[0] == 'Euros' or matches_cur[0] == 'EUROS' or matches_cur[0] == 'eurs'or matches_cur[0] == 'Euro':
        df_courses.at[index, 'currency'] = 'EUR'
      elif matches_cur[0] == 'dollar' or matches_cur[0] == 'dollars' or matches_cur[0] == '$':
        df_courses.at[index, 'currency'] = 'USD'
      elif matches_cur[0] == 'pound' or matches_cur[0] == 'pounds' or matches_cur[0] == '£':
        df_courses.at[index, 'currency'] = 'GBP'
      else:
        df_courses.at[index, 'currency'] = matches_cur[0]
    else:
      df_courses.at[index, 'currency'] = None

df_courses['fees (£)'] = df_courses.apply(lambda row: round(row['fees_float'] / conversion_rates.get(row['currency'], 1.0), 2), axis=1)

filtered_df = df_courses[(df_courses['fees_float'].notnull()) & (df_courses['currency'].notnull()) & (df_courses['fees (£)'].notnull())]
filtered_df


Unnamed: 0,Course Name,University Name,Faculty Name,Description,Fees,Modality,Duration,City,Country,Link,administration,desc_lower,descr_stem,descr_clean,fees_float,currency,fees (£)
1,Accounting and Finance - MSc,University of Leeds,Leeds University Business School,Businesses and governments rely on sound finan...,"UK: £18,000 (Total) International: £34,750 (To...",MSc,1 year full time,Leeds,United Kingdom,https://www.findamasters.com/masters-degrees/c...,On Campus,businesses and governments rely on sound finan...,"[busi, and, govern, reli, on, sound, financi, ...","[busi, govern, reli, sound, financi, knowledg,...",34750.0,GBP,34750.00
5,Advanced Chemical Engineering - MSc,University of Leeds,School of Chemical and Process Engineering,The Advanced Chemical Engineering MSc at Leeds...,"UK: £13,750 (Total) International: £31,000 (To...",MSc,1 year full time,Leeds,United Kingdom,https://www.findamasters.com/masters-degrees/c...,On Campus,the advanced chemical engineering msc at leeds...,"[the, advanc, chemic, engin, msc, at, leed, wi...","[advanc, chemic, engin, msc, leed, build, core...",31000.0,GBP,31000.00
7,Agricultural Sciences - MSc (Agriculture and F...,University of Helsinki,International Masters Degree Programmes,Goal of the pro­grammeWould you like to be inv...,Tuition fee per year (non-EU/EEA students): 15...,MSc,2 years,Helsinki,Finland,https://www.findamasters.com/masters-degrees/c...,On Campus,goal of the pro­grammewould you like to be inv...,"[goal, of, the, pro­grammewould, you, like, to...","[goal, like, involv, find, solut, futur, chall...",15000.0,EUR,13141.76
8,"Agricultural, Environmental and Resource Econo...",University of Helsinki,International Masters Degree Programmes,Goal of the pro­grammeAre you looking forward ...,Tuition fee per year (non-EU/EEA students): 15...,MSc,2 years,Helsinki,Finland,https://www.findamasters.com/masters-degrees/c...,On Campus,goal of the pro­grammeare you looking forward ...,"[goal, of, the, pro­grammear, you, look, forwa...","[goal, look, forward, futur, expert, agricultu...",15000.0,EUR,13141.76
9,Air Quality Solutions - MSc,University of Leeds,Institute for Transport Studies,Up to 7 million people are estimated to die ev...,"UK: £12,500 (Total) International: £28,750 (To...",MSc,"1 year full time, 2 or 3 years part-time",Leeds,United Kingdom,https://www.findamasters.com/masters-degrees/c...,On Campus,up to 7 million people are estimated to die ev...,"[up, to, 7, million, peopl, are, estim, to, di...","[7, million, peopl, estim, die, everi, year, d...",28750.0,GBP,28750.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5983,Master's of Financial Technology (Fintech),Harbour.Space University,Masters Programmes,Harbour.Space's FinTech Master programme is de...,"€29,900/year","MBA, MSc",1 Year,Barcelona,Spain,https://www.findamasters.com/masters-degrees/c...,On Campus,harbour.space's fintech master programme is de...,"[harbour.spac, fintech, master, programm, is, ...","[fintech, master, programm, design, prepar, gr...",29900.0,EUR,26195.90
5984,Master's of Front-end Development,Harbour.Space University,Masters Programmes,Front-end Development at Harbour.Space Univers...,"€29,900/year",MSc,1 year,Barcelona,Spain,https://www.findamasters.com/masters-degrees/c...,On Campus,front-end development at harbour.space univers...,"[front-end, develop, at harbour.spac, universi...","[develop, univers, provid, uniqu, environ, stu...",29900.0,EUR,26195.90
5992,Materials and Molecular Modelling MSc,University College London,Department of Chemistry,Register your interest in graduate study at UC...,"Full time - £14,100",MSc,1 year full time,London,United Kingdom,https://www.findamasters.com/masters-degrees/c...,On Campus,register your interest in graduate study at uc...,"[regist, your, interest, in, graduat, studi, a...","[regist, interest, graduat, studi, uclther, gr...",14100.0,GBP,14100.00
5995,Materials Engineering,University of Padua,School of Engineering,The Master's degree Materials Engineering is a...,Our tuition fees will not exceed 2700 euros pe...,MSc,2 years,Padua,Italy,https://www.findamasters.com/masters-degrees/c...,On Campus,the master's degree materials engineering is a...,"[the, master, degre, materi, engin, is, an, in...","[master, degre, materi, engin, interdisciplina...",2700.0,EUR,2365.52


In [None]:
############################################### DON'T RUN #######################################################
# 2.1.1 Create your Index
# Create Vocabulary first

# Create list of all words
words = set()
df_courses.descr_clean.apply(lambda row: [words.add(word) for word in row])

# Assign term_id to each word by creating an index
vocabulary = {}
unique_id = 1
for word in list(words):
  vocabulary[unique_id] = word
  unique_id+=1

# Save this file into a pickle file which I can later on retrieve
with open('/content/drive/MyDrive/MyDatasetFolder/vocabulary.pkl', 'wb') as pickle_file:
    pickle.dump(vocabulary, pickle_file)


In [None]:
############################################### DON'T RUN #######################################################
# 2.1.1 Create Inverted Index

with open('/content/drive/MyDrive/MyDatasetFolder/vocabulary.pkl', 'rb') as pickle_file:
    vocabulary = pickle.load(pickle_file)

inverted_index = dict()
overall_progress = tqdm(total=len(list(vocabulary.keys())), desc="Building Vocabulary")

# Loop through the first 10 keys of the vocabulary
for i in list(vocabulary.keys()):
    inverted_index[i] = []
    for index, row in df_courses.iterrows():
        if vocabulary[i] in row['descr_clean']:
            inverted_index[i].append(index)

    overall_progress.update(1)  # Update the overall progress

overall_progress.close()

print(inverted_index)

with open('/content/drive/MyDrive/MyDatasetFolder/inverted_indexl.pkl', 'wb') as file:
        pickle.dump(inverted_index, file)

print(f"Vocabulary saved to /content/drive/MyDrive/MyDatasetFolder/inverted_index.pkl")


Building Vocabulary: 100%|██████████| 8753/8753 [49:08<00:00,  2.97it/s]

{1: [63, 260, 261, 281, 321, 454, 472, 1109, 1111, 1172, 1854, 2311, 2435, 2441, 2442, 2445, 2447, 2448, 2449, 2450, 2451, 2452, 2453, 2454, 2456, 2459, 2460, 2461, 2462, 2464, 2465, 2466, 2467, 2468, 2469, 2474, 2475, 2476, 2477, 2479, 2595, 2753, 2856, 2921, 3027, 3084, 3085, 3092, 3095, 3377, 3466, 3852, 3853, 3864, 3870, 3874, 4024, 4038, 4039, 4049, 4068, 4070, 4106, 4119, 4351, 4481, 4743, 4756, 4758, 4912, 4922, 4934, 5056, 5077, 5520, 5577, 5787], 2: [5334], 3: [246, 902], 4: [1004, 4305, 4313], 5: [5017], 6: [25], 7: [195, 1290, 1387, 1703, 2398, 3302, 3308, 3635, 3664, 4119, 4134, 4201, 4576, 5089, 5269, 5429], 8: [2925, 3217, 4070, 4587], 9: [1382, 4986], 10: [42, 59, 352, 362, 450, 588, 755, 778, 854, 925, 1060, 1329, 1453, 1471, 1743, 1745, 1748, 1817, 1884, 2062, 2113, 2247, 2409, 2722, 3146, 3208, 3758, 3949, 4266, 4278, 4317, 4428, 4602, 5002, 5035, 5039, 5048, 5092, 5203, 5230, 5605, 5728, 5740, 5884, 5933, 5954], 11: [80, 1318, 1457, 1581, 1742, 2079, 2080, 2081, 2082




In [16]:
# This allows to retrieve the term_id from a word
def find_key_by_value(vocabulary, value):
  for key, val in vocabulary.items():
    if val == value:
      return key
  return None


def search_engine_with_vocabulary():
  query = input()
  # Load inverted index and vocabulary
  with open('/content/drive/MyDrive/MyDatasetFolder/inverted_index.pkl', 'rb') as pickle_file:
      inverted_index = pickle.load(pickle_file)

  with open('/content/drive/MyDrive/MyDatasetFolder/vocabulary.pkl', 'rb') as vocab_file:
      vocabulary = pickle.load(vocab_file)

  # Preprocess the query in order to compare it with desc_clean
  # Tokenize and preprocess the query
  query = ''.join(char for char in query if char.isalnum() or char.isspace())
  stop_words = set(stopwords.words('english'))
  words = word_tokenize(query)
  query = ' '.join([word for word in words if word.lower() not in stop_words])

  # Stemming using SnowballStemmer
  snowstem = SnowballStemmer('english')
  query_words = [snowstem.stem(word) for word in query.lower().split()]

  # Initialize a list to store matching documents
  matching_documents = []

  # Find documents that contain all the words in the query
  for stemmed_word in query_words:
      if stemmed_word in vocabulary.values():
        term_id = find_key_by_value(vocabulary, stemmed_word)
        matching_documents.append(set(inverted_index[term_id])) # If found in vocabulary it appends the the list of documents to matching_documents. I used a set to then use the intersection function.

  if matching_documents:
      common_documents = set.intersection(*matching_documents) # Using set intersection I can find the common_documents (documents that contain all of the words)

  df_result = df_courses.loc[list(common_documents), ['Course Name', 'University Name', 'Description', 'Link']].copy()

  # Display the DataFrame
  return df_result

# Example usage
search_engine_with_vocabulary()


DANCE


Unnamed: 0,Course Name,University Name,Description,Link
2568,Dance Science MSc,University of Chichester,This suite of MSc programmes is designed for s...,https://www.findamasters.com/masters-degrees/c...
2567,"Dance Science and Education MSc, PgDip (ICL), ...",University of Edinburgh,Programme descriptionIf you are interested in ...,https://www.findamasters.com/masters-degrees/c...


In [None]:
# 2.2 Conjunctive query & Ranking score

# Build tfidf_data
tfidf = TfidfVectorizer(input='content', lowercase=False, tokenizer=lambda text: text)
results = tfidf.fit_transform(df_courses.descr_clean)
result_dense = results.todense()
tfidf_data = pd.DataFrame(result_dense.tolist(), index=df_courses.index, columns=tfidf.get_feature_names_out())




In [None]:
############################################### DON'T RUN #######################################################
# 2.2.1 New Inverted index

inverted_index_2 = dict()
overall_progress = tqdm(total=len(list(inverted_index.keys())), desc="Building Inverted Index 2")

for term_id in list(inverted_index.keys()):
  values = tfidf_data.loc[tfidf_data[vocabulary[term_id]] > 0, [vocabulary[term_id]]]
  term = vocabulary[term_id]
  values_list = list(zip(values.index, values[term]))
  inverted_index_2[term_id] = values_list
  overall_progress.update(1)  # Update the overall progress

overall_progress.close()

print(inverted_index_2)

with open('/content/drive/MyDrive/MyDatasetFolder/inverted_index_2.pkl', 'wb') as file:
        pickle.dump(inverted_index_2, file)

print(f"Vocabulary saved to /content/drive/MyDrive/MyDatasetFolder/inverted_index_2.pkl")


Building Inverted Index 2: 100%|██████████| 8753/8753 [00:35<00:00, 245.76it/s]
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# def search_engine_with_vocabulary(query):
#     # Load inverted index and vocabulary
#     with open('/content/drive/MyDrive/MyDatasetFolder/inverted_index_2.pkl', 'rb') as pickle_file:
#         inverted_index_2 = pickle.load(pickle_file)

#     with open('/content/drive/MyDrive/MyDatasetFolder/vocabulary.pkl', 'rb') as vocab_file:
#         vocabulary = pickle.load(vocab_file)

#     # Tokenize and preprocess the query
#     query = ''.join(char for char in query if char.isalnum() or char.isspace())
#     stop_words = set(stopwords.words('english'))
#     words = word_tokenize(query)
#     query = ' '.join([word for word in words if word.lower() not in stop_words])
#     # Stemming using SnowballStemmer
#     snowstem = SnowballStemmer('english')
#     query = ' '.join(snowstem.stem(word) for word in query.lower().split())


def search_engine_with_vocabulary(query):
    # Load inverted index and vocabulary
    with open('/content/drive/MyDrive/MyDatasetFolder/inverted_index_final.pkl', 'rb') as pickle_file:
        inverted_index = pickle.load(pickle_file)

    with open('/content/drive/MyDrive/MyDatasetFolder/inverted_index_2.pkl', 'rb') as pickle_file:
        inverted_index_2 = pickle.load(pickle_file)

    with open('/content/drive/MyDrive/MyDatasetFolder/vocabulary.pkl', 'rb') as vocab_file:
        vocabulary = pickle.load(vocab_file)

    # Tokenize and preprocess the query
    query = ''.join(char for char in query if char.isalnum() or char.isspace())
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(query)
    query = ' '.join([word for word in words if word.lower() not in stop_words])

    # Stemming using SnowballStemmer
    snowstem = SnowballStemmer('english')
    query_words = [snowstem.stem(word) for word in query.lower().split()]

    # Initialize a list to store matching documents
    matching_documents = []

    # Find documents that contain all the words in the query
    for stemmed_word in query_words:
        if stemmed_word in vocabulary.values():
          term_id = find_key_by_value(vocabulary, stemmed_word)
          matching_documents.append(set(inverted_index[term_id]))

    if matching_documents:
        common_documents = set.intersection(*matching_documents)

    similarity_list = {}

    for i in common_documents:
      similarity_list[i] = []
      for word in query_words:
        term_id = find_key_by_value(vocabulary, word)
        for doc, tfidf_value in inverted_index_2[term_id]:
          if doc == i:
            # Append the TF-IDF value to the result vector
            similarity_list[i].append(tfidf_value)

    print(query_vector)

    # df_result = df_courses.loc[list(common_documents), ['Course Name', 'University Name', 'Description', 'Link']].copy()

# Example usage
search_engine_with_vocabulary('economics and central banks')


ValueError: ignored

In [None]:
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def find_key_by_value(dictionary, value):
    # Helper function to find a key in a dictionary based on its value
    return next(key for key, val in dictionary.items() if val == value)

def search_engine_with_vocabulary(query):
    # Load inverted index and vocabulary
    with open('/content/drive/MyDrive/MyDatasetFolder/inverted_index_2.pkl', 'rb') as pickle_file:
        inverted_index_2 = pickle.load(pickle_file)

    with open('/content/drive/MyDrive/MyDatasetFolder/vocabulary.pkl', 'rb') as vocab_file:
        vocabulary = pickle.load(vocab_file)

    # Tokenize and preprocess the query
    query = ''.join(char for char in query if char.isalnum() or char.isspace())
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(query)
    query = ' '.join([word for word in words if word.lower() not in stop_words])

    # Stemming using SnowballStemmer
    snowstem = SnowballStemmer('english')
    query_words = [snowstem.stem(word) for word in query.lower().split()]

    # Initialize a list to store matching documents
    matching_documents = set()

    # Find documents that contain all the words in the query
    for stemmed_word in query_words:
        if stemmed_word in vocabulary.values():
            term_id = find_key_by_value(vocabulary, stemmed_word)
            matching_documents.update(inverted_index_2.get(term_id, set()))

    # Extract document texts for vectorization
    documents = [doc[0] for doc_list in inverted_index_2.values() for doc in doc_list]

    # Calculate TF-IDF vectors for documents
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

    # Calculate TF-IDF vector for the query
    query_vector = tfidf_vectorizer.transform([query])

    # Calculate cosine similarity for each document
    similarity_scores = {}
    for doc_id in matching_documents:
        doc_vector = tfidf_matrix.getrow(doc_id)
        similarity_scores[doc_id] = cosine_similarity(query_vector, doc_vector)[0, 0]

    # Sort documents based on similarity scores
    sorted_documents = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)

    # Print or use the sorted_documents as needed
    print("Sorted Documents:", sorted_documents)

# Example usage
search_engine_with_vocabulary('economics and central banking')


AttributeError: ignored

In [264]:
def find_key_by_value(dictionary, value):
    return next(key for key, val in dictionary.items() if val == value)

# Assuming you have a DataFrame named df_courses containing your data
# For example: df_courses = pd.read_csv('your_dataset.csv')

def search_engine_with_vocabulary(query, inverted_index_path, inverted_index_2_path, vocabulary_path, df_courses):
    # Load inverted index and vocabulary
    with open(inverted_index_path, 'rb') as pickle_file:
        inverted_index = pickle.load(pickle_file)

    with open(inverted_index_2_path, 'rb') as pickle_file:
        inverted_index_2 = pickle.load(pickle_file)

    with open(vocabulary_path, 'rb') as vocab_file:
        vocabulary = pickle.load(vocab_file)

    # Tokenize and preprocess the query
    query = ''.join(char for char in query if char.isalnum() or char.isspace())
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(query)
    query = ' '.join([word for word in words if word.lower() not in stop_words])

    # Stemming using SnowballStemmer
    snowstem = SnowballStemmer('english')
    query_words = [snowstem.stem(word) for word in query.lower().split()]

    # Initialize a list to store matching documents
    matching_documents = []

    # Find documents that contain all the words in the query
    for stemmed_word in query_words:
        if stemmed_word in vocabulary.values():
          term_id = find_key_by_value(vocabulary, stemmed_word)
          matching_documents.append(set(inverted_index[term_id]))

    if matching_documents:
        common_documents = set.intersection(*matching_documents)

    # Create a list of document texts from df_courses based on row numbers in inverted_index_2
    document_texts = [df_courses.loc[doc_id, 'desc_clean'] for doc_id in common_documents]

    # Append the query to the list of document texts
    document_texts.append(query)

    # Use TfidfVectorizer to transform the document texts into TF-IDF vectors
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(document_texts)

    # Calculate cosine similarity between the query vector and document vectors
    cosine_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

    # Create a list of document IDs and their corresponding similarity scores
    document_ids = list(common_documents)
    similarity_scores = list(zip(document_ids, cosine_similarities))

    # Sort documents based on similarity scores in descending order
    sorted_documents = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Now sorted_documents contains document IDs and their corresponding similarity scores
    return sorted_documents

# Example usage
inverted_index_path = '/content/drive/MyDrive/MyDatasetFolder/inverted_index_final.pkl'
inverted_index_2_path = '/content/drive/MyDrive/MyDatasetFolder/inverted_index_2.pkl'
vocabulary_path = '/content/drive/MyDrive/MyDatasetFolder/vocabulary.pkl'

query_result = search_engine_with_vocabulary('economics and central banks', inverted_index_path, inverted_index_2_path, vocabulary_path, df_courses)
print(query_result)

KeyError: ignored