In [None]:
! pip install pymongo pyMorfologik stop_words

In [None]:
import re
import string
from stop_words import get_stop_words
from pyMorfologik import Morfologik
from pyMorfologik.parsing import ListParser


# Morfologik stemmer and the ListParser that preprocess_text passes to stemmer.stem().
parser = ListParser()
stemmer = Morfologik()

# Stop-word list for Polish ("pl"); preprocess_text drops every token found in it.
stopwords_pl = get_stop_words("pl")


def preprocess_text(text):
    """Normalise raw text into a space-separated string of stems.

    Punctuation is replaced by spaces, digit runs are removed, whitespace is
    collapsed and the text is lower-cased. The cleaned text is handed to the
    module-level ``stemmer``; every word is replaced by the first key of the
    mapping returned for it, or kept as-is when that mapping is empty.
    Tokens listed in ``stopwords_pl`` and empty tokens are dropped.
    """
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    cleaned = text.translate(punctuation_to_space)
    cleaned = re.sub(r'\d+', '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip().lower()

    tokens = []
    for entry in stemmer.stem([cleaned], parser):
        # entry[0] is the original word, entry[1] maps candidate stems to details
        candidates = list(entry[1].keys())
        tokens.append(candidates[0] if candidates else entry[0])

    kept_tokens = [token for token in tokens if token not in stopwords_pl and token != '']
    return " ".join(kept_tokens)


In [None]:
import pymongo

In [None]:
from pymongo import MongoClient
from pymongo.server_api import ServerApi


import getpass
import os
from urllib.parse import quote_plus

# Credentials come from the environment (or an interactive prompt when the
# variables are not set) so that no secret is stored in the notebook itself.
MONGODB_LOGIN = os.environ.get("MONGODB_LOGIN") or input("MongoDB login: ")
MONGODB_PASSWORD = os.environ.get("MONGODB_PASSWORD") or getpass.getpass("MongoDB password: ")


# User name and password must be percent-escaped before being embedded in the
# URI; otherwise characters such as '@', ':' or '/' break the connection string.
uri = (
    f"mongodb+srv://{quote_plus(MONGODB_LOGIN)}:{quote_plus(MONGODB_PASSWORD)}"
    "@wibit.4d0e5vs.mongodb.net/?retryWrites=true&w=majority"
)
client = MongoClient(uri, server_api=ServerApi('1'))
database = client["wibit"]
# Collection read by get_texts_from_category() and get_texts_corpus().
collection = database["wikipedia_descriptions"]

In [None]:
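# One-off backfill, kept for reference only: it stores the preprocessed text of
# every document as `prep_content`. The cells below read `prep_content`, so this
# must have been run against the database at least once.
# result = collection.find()

# for document in result:
  # prep_content = preprocess_text(document['content'])
  # collection.update_one({'_id': document['_id']}, {'$set': {'prep_content': prep_content}})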


In [None]:
def get_texts_from_category(category):
  """Return ``prep_content`` of every document whose ``kinds`` field matches ``category``."""
  matching_documents = collection.find({'kinds': {'$in': [category]}})
  return [document['prep_content'] for document in matching_documents]

In [None]:
def get_texts_corpus():
  """Return ``prep_content`` of every document in the collection."""
  return [document['prep_content'] for document in collection.find()]

In [None]:
# Category identifiers used throughout the notebook: they are matched against the
# `kinds` field of the stored documents and used as column names of the
# per-category TF-IDF table and of the test data frame.
categories_list = ['ferris_wheels', 'winter_sports', 'hindu_temples', 'archaeology', 'tumuluses', 'biographical_museums',
                   'fashion_museums', 'amusement_parks', 'water_parks', 'miniature_parks', 'baths_and_saunas', 'climbing',
                   'stadiums', 'natural_springs', 'water', 'nature_reserves', 'beaches', 'railway_stations', 'dams', 'mints',
                   'mineshafts', 'science_museums', 'churches', 'cathedrals', 'monasteries', 'synagogues', 'mosques', 'castles',
                   'fortified_towers', 'bunkers', 'military_museums', 'battlefields', 'war_graves', 'cemeteries', 'mausoleums',
                   'crypts', 'wall_painting', 'fountains', 'sculptures', 'gardens_and_parks', 'archaeological_museums',
                   'art_galleries', 'history_museums', 'local_museums', 'national_museums', 'planetariums', 'zoos', 'aquariums',
                   'skyscrapers', 'towers', 'historic_architecture', 'bridges', 'monuments']

categories_list

In [None]:
# Sanity check: how many stored texts each category has (one database query per category).
for category in categories_list:
  print(category, len(get_texts_from_category(category)))

In [None]:
corpus = get_texts_corpus()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary size obtained for each min_df in 1..30, with max_df fixed at 0.25.
vec_sizes = []

for i in range(1, 31):
  vectorizer = TfidfVectorizer(min_df=i, max_df=0.25)
  vectors = vectorizer.fit_transform(corpus)
  # number of columns of the TF-IDF matrix = number of terms kept
  vec_sizes.append(vectors.shape[1])

vec_sizes

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')


plt.figure(figsize=(10, 8))
# x axis: the min_df values 1..30 swept in the previous cell; y axis: vocabulary size
plt.plot(range(1, 31), vec_sizes, marker='o', linestyle='-')

# Only selected points get a numeric label (presumably to keep labels from overlapping):
# positions 0-5, 7/9/11 and 14/19/24/29, i.e. min_df 1-6, 8, 10, 12, 15, 20, 25 and 30.
idx_for_label = list(range(0, 6)) + list(range(7,12, 2)) + list(range(14, 30, 5))
labels = [vec_sizes[i] for i in idx_for_label]

for i in range(len(labels)):
    # idx_for_label holds 0-based positions while the x axis starts at min_df=1, hence +1
    plt.annotate(labels[i], (idx_for_label[i]+1, labels[i]), textcoords="offset points", xytext=(5, 7), ha='center')

plt.xlabel('\nParametr min_df - \n minimalna liczba tekstów, w których występuje słowo, \n aby trafiło ono do korpusu')
plt.ylabel('Liczba słów w korpusie\n')
plt.title('Zależność liczby słów w korpusie od wartości parametru min_df')

plt.tight_layout(pad=1)
# The figure is also written to disk next to the notebook.
plt.savefig('min_df_tfidf.png')
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer used for the rest of the notebook: min_df=4, max_df=0.25
# (presumably chosen from the min_df plot above; confirm).
vectorizer = TfidfVectorizer(min_df=4, max_df=0.25)
vectors = vectorizer.fit_transform(corpus)
print(vectors.shape)

In [None]:
import joblib

joblib.dump(vectorizer, 'tfidf_vectorizer_wibit_categories.joblib')

In [None]:
loaded_vectorizer = joblib.load('tfidf_vectorizer_wibit_categories.joblib')

In [None]:
vec1 = vectors[0]
vec2 = vectors[1]

type(vec1)

In [None]:
import scipy
import numpy as np
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity as sklearn_cos_sim

# A distance is "lower is better"; the sign is flipped so that, like the other
# measures below, a higher value means more similar (0 is a perfect match).
def opposite_euclidean_distance(A: scipy.sparse._csr.csr_matrix, B:scipy.sparse._csr.csr_matrix):
  """Return the negated Euclidean distance between two sparse vectors."""
  difference = A.toarray() - B.toarray()
  return -np.linalg.norm(difference)

# Higher is better; identical directions give 1.
def cosine_similarity(A: scipy.sparse._csr.csr_matrix, B:scipy.sparse._csr.csr_matrix):
  """Return the cosine similarity of the first row of ``A`` and the first row of ``B``."""
  similarity_matrix = sklearn_cos_sim(A, B)
  return similarity_matrix[0][0]

def jaccard_index(A: scipy.sparse._csr.csr_matrix, B:scipy.sparse._csr.csr_matrix):
  """Return the Jaccard index of the sets of non-zero columns of ``A`` and ``B``.

  Only the positions of non-zero entries matter, not their values. Higher is
  better; 1 means both vectors use exactly the same columns.

  Returns 0.0 when neither vector has a non-zero entry (empty union), instead
  of raising ``ZeroDivisionError``.
  """
  non_zero_A = A.nonzero()[1]
  non_zero_B = B.nonzero()[1]
  union_size = len(np.union1d(non_zero_A, non_zero_B))
  if union_size == 0:
    # Two all-zero vectors share no terms, so report "no similarity".
    return 0.0
  intersection_size = len(np.intersect1d(non_zero_A, non_zero_B))
  return intersection_size / union_size

def pearson_correlation(A: scipy.sparse._csr.csr_matrix, B:scipy.sparse._csr.csr_matrix):
  """Return the Pearson correlation coefficient between the dense forms of ``A`` and ``B``."""
  correlation_matrix = np.corrcoef(A.toarray(), B.toarray())
  # entry [0][1] of the correlation matrix is the coefficient between the two inputs
  return correlation_matrix[0][1]


In [None]:
A = vec1
B = vec2

In [None]:
opposite_euclidean_distance(A, A)

In [None]:
opposite_euclidean_distance(A, B)

In [None]:
cosine_similarity(A, A)

In [None]:
cosine_similarity(A, B)

In [None]:
jaccard_index(A, A)

In [None]:
jaccard_index(A, B)

In [None]:
pearson_correlation(A, A)

In [None]:
pearson_correlation(A, B)

In [None]:
print(vectorizer.transform(['turysta który uwielbiać odwiedzać zamek i różny rodzaj loch w szczególność sala tortura chętnie jadać stek w renomowany restauracja']))

In [None]:
def get_mean_category_vector(category, vectorizer):
  """Return the column-wise mean of the TF-IDF vectors of all texts in ``category``.

  The texts come from ``get_texts_from_category`` and are transformed with the
  given (already fitted) ``vectorizer``; the mean is taken over axis 0.
  """
  category_texts = get_texts_from_category(category)
  return vectorizer.transform(category_texts).mean(axis=0)

In [None]:
get_mean_category_vector('water', vectorizer).shape

In [None]:
type(get_mean_category_vector('water', vectorizer))

In [None]:
test_cat_vector = get_mean_category_vector('water', vectorizer)

In [None]:
test_cat_vector.tolist()

In [None]:
import pandas as pd

# Maps each category to its mean TF-IDF vector, stored as a plain list of floats.
df_dict = {}

for category in categories_list:
  mean_vector = get_mean_category_vector(category, vectorizer)
  # tolist() yields a nested one-row list; keep just that row
  df_dict[category] = mean_vector.tolist()[0]


In [None]:
df = pd.DataFrame(df_dict)

In [None]:
df

In [None]:
df.index = vectorizer.get_feature_names_out()

In [None]:
df

In [None]:
df.to_csv('tf_idf_categories.csv')

In [None]:
categories_df = df

In [None]:
from scipy.sparse import csr_matrix

def get_category_vector_from_df(df, category):
  """Return the ``category`` column of ``df`` wrapped in a SciPy CSR matrix."""
  column_values = df[category].values
  return csr_matrix(column_values)


In [None]:
print(get_category_vector_from_df(categories_df, 'ferris_wheels'))

In [None]:
def get_text_categories(text, vectorizer, df, metrics_func, return_binary_vector=False):
  """Select the categories whose similarity to ``text`` is above the average.

  ``text`` is preprocessed and vectorised, then compared with every column of
  ``df`` using ``metrics_func(text_vector, category_vector)``.

  Returns the names of the categories scoring strictly above the mean score,
  or, when ``return_binary_vector`` is true, a 0/1 list in column order.
  """
  text_vector = vectorizer.transform([preprocess_text(text)])
  categories = list(df.columns)

  calculated_metrics = {
    category: metrics_func(text_vector, get_category_vector_from_df(df, category))
    for category in categories
  }

  metrics_values = list(calculated_metrics.values())
  average_score = sum(metrics_values)/len(metrics_values)
  above_average = [1 if val > average_score else 0 for val in metrics_values]

  if return_binary_vector:
    return above_average

  return [category for category, flag in zip(categories, above_average) if flag > 0]


In [None]:
# opposite_euclidean_distance, cosine_similarity, jaccard_index, pearson_correlation

In [None]:
get_text_categories('Chcę zobaczyć zamek', vectorizer, categories_df, pearson_correlation)

In [None]:
get_text_categories('Chcę zobaczyć zamek', vectorizer, categories_df, cosine_similarity)

In [None]:
test_df = pd.read_csv('test_df.csv')
test_df.head(3)

In [None]:
test_texts = test_df['text']
test_texts[0]

In [None]:
# Binarise the test labels row by row: a category becomes 1 when its value is at
# least the mean of that row (taken over all columns left after dropping
# 'text' and 'date'), otherwise 0.
y_test = test_df.drop(columns=['text', 'date'])
y_test['mean'] = y_test.mean(axis=1)

y_test_scaled = y_test[categories_list].ge(y_test['mean'], axis=0).astype(int)
y_test_scaled.head(3)

In [None]:
len(y_test_scaled)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Mean accuracy / recall / precision per similarity measure, in the order of the
# list iterated below.
accuracies = []
recalls = []
precisions = []

# Derived from the data so that the means stay correct if the test set changes size.
size = len(y_test_scaled)

for metrics in [opposite_euclidean_distance, cosine_similarity, jaccard_index, pearson_correlation]:
  print(f"\n__________{metrics.__name__}__________")
  total_ac = 0
  total_rec = 0
  total_prec = 0

  for i in range(size):
    # Positional access on both sides keeps texts and labels aligned row by row.
    y_pred = get_text_categories(test_texts.iloc[i], vectorizer, categories_df, metrics, True)
    y_true = y_test_scaled.iloc[i].tolist()

    # scikit-learn expects (y_true, y_pred); with the arguments swapped, recall
    # and precision trade places.
    ac_score = accuracy_score(y_true, y_pred)
    rec_score = recall_score(y_true, y_pred)
    prec_score = precision_score(y_true, y_pred)

    total_ac += ac_score
    total_rec += rec_score
    total_prec += prec_score
    print(f"{i} - Accuracy: {ac_score} | Recall: {rec_score} | Precision: {prec_score}")


  accuracies.append(total_ac/size)
  recalls.append(total_rec/size)
  precisions.append(total_prec/size)

  print(f"\nMean - Accuracy: {total_ac/size } | Recall: {total_rec/size} | Precision: {total_prec/size}")

  print("______________________________")


In [None]:
accuracies

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Bar labels, in the same order as the measures were evaluated.
keys = ['Metryka\neuklidesowa', 'Podobieństwo\ncosinusowe', 'Indeks\nJaccarda', 'Współczynnik\nkorelacji\nPearsona']
values = [score*100 for score in accuracies]

fig, ax = plt.subplots(figsize=(9, 6))
bars = ax.bar(keys, values, color='teal')

# Write the rounded percentage just above each bar.
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, height + 0.5, round(height, 2), ha='center', va='bottom')

ax.set_xlabel('Zastosowana metryka')
ax.set_ylabel('Dokładność (Accuracy) na zbiorze testowym [%]')
ax.set_title('Dokładność (Accuracy) algorytmu porównującego wektory dla różnych metryk')

fig.tight_layout(pad=1)
fig.savefig('vector_comparison_accuracy.png')
plt.show()

In [None]:
# Mean recall per measure, as percentages.
values = [score*100 for score in recalls]

fig, ax = plt.subplots(figsize=(9, 6))
bars = ax.bar(keys, values, color='teal')

# Write the rounded percentage just above each bar.
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, height + 0.5, round(height, 2), ha='center', va='bottom')

ax.set_xlabel('Zastosowana metryka')
ax.set_ylabel('Czułość (Recall) na zbiorze testowym [%]')
ax.set_title('Czułość (Recall) algorytmu porównującego wektory dla różnych metryk')

fig.tight_layout(pad=1)
fig.savefig('vector_comparison_recall.png')
plt.show()

In [None]:
# Mean precision per measure, as percentages.
values = [score*100 for score in precisions]

fig, ax = plt.subplots(figsize=(9, 6))
bars = ax.bar(keys, values, color='teal')

# Write the rounded percentage just above each bar.
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, height + 0.5, round(height, 2), ha='center', va='bottom')

ax.set_xlabel('Zastosowana metryka')
ax.set_ylabel('Precyzja (Precision) na zbiorze testowym [%]')
ax.set_title('Precyzja (Precision) algorytmu porównującego wektory dla różnych metryk')

fig.tight_layout(pad=1)
fig.savefig('vector_comparison_precision.png')
plt.show()

In [None]:
user_input = "Lubię zwiedzać różnego rodzaju muzea, w szczególności te, które związane są z nauką i techniką. Jestem fanem lotnictwa i nowczesnej inżynierii. Chętnie gram w gry komputerowe i oglądam sport w telewizji."
get_text_categories(user_input, vectorizer, categories_df, cosine_similarity)

In [None]:
get_text_categories(user_input, vectorizer, categories_df, opposite_euclidean_distance)

In [None]:
# NOTE(review): get_text_categories is already defined in an earlier cell with the
# same behaviour; this redefinition shadows it and can probably be removed.
def get_text_categories(text, vectorizer, df, metrics_func, return_binary_vector=False):
  """Select the categories whose similarity to ``text`` is above the average.

  ``text`` is preprocessed and vectorised, then compared with every column of
  ``df`` using ``metrics_func(text_vector, category_vector)``.

  Returns the names of the categories scoring strictly above the mean score,
  or, when ``return_binary_vector`` is true, a 0/1 list in column order.
  """
  text_vector = vectorizer.transform([preprocess_text(text)])
  categories = list(df.columns)

  calculated_metrics = {
    category: metrics_func(text_vector, get_category_vector_from_df(df, category))
    for category in categories
  }

  metrics_values = list(calculated_metrics.values())
  average_score = sum(metrics_values)/len(metrics_values)
  above_average = [1 if val > average_score else 0 for val in metrics_values]

  if return_binary_vector:
    return above_average

  return [category for category, flag in zip(categories, above_average) if flag > 0]


## TOP N: recommend only the N best-scoring categories

In [None]:
def get_top_N_text_categories(text, vectorizer, df, metrics_func, n, return_binary_vector=False):
  """Select the ``n`` categories most similar to ``text``.

  ``text`` is preprocessed and vectorised, then scored against every column of
  ``df`` with ``metrics_func(text_vector, category_vector)`` (higher = better).

  Exactly ``min(n, number of rankable categories)`` categories are selected:
  ties are broken by column order, ``n <= 0`` selects nothing and an ``n``
  larger than the number of categories selects all of them. Categories whose
  score is NaN are never selected.

  Returns the selected category names in column order, or, when
  ``return_binary_vector`` is true, a 0/1 list in column order.
  """
  import math  # local import keeps this cell independent of the import cells

  prep_text = preprocess_text(text)
  text_vector = vectorizer.transform([prep_text])
  categories = list(df.columns)
  scores = [metrics_func(text_vector, get_category_vector_from_df(df, category)) for category in categories]

  # A NaN score (e.g. an undefined correlation) cannot be ranked meaningfully.
  rankable = [i for i, score in enumerate(scores) if not math.isnan(score)]
  # sorted() is stable, so equal scores keep their column order.
  ranked = sorted(rankable, key=lambda i: scores[i], reverse=True)
  selected = set(ranked[:max(n, 0)])
  top_n_vector = [1 if i in selected else 0 for i in range(len(scores))]

  if return_binary_vector:
    return top_n_vector

  return [category for category, flag in zip(categories, top_n_vector) if flag]

In [None]:
user_input = "Lubię zwiedzać różnego rodzaju muzea, w szczególności te, które związane są z nauką i techniką. Jestem fanem lotnictwa i nowczesnej inżynierii. Chętnie gram w gry komputerowe i oglądam sport w telewizji."
get_top_N_text_categories(user_input, vectorizer, categories_df, cosine_similarity, 10)

In [None]:
test_df.head(3)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

N = 10
size = 26

# Mean satisfaction per similarity measure, in the order of the list iterated below.
satisfactions = []


for metrics in [opposite_euclidean_distance, cosine_similarity, jaccard_index, pearson_correlation]:
  print(f"\n__________{metrics.__name__}__________")

  metrics_satisfaction = 0

  for i in range(size):
    current_row = test_df.iloc[i]
    # Best achievable score for this row: the sum of its N highest category values.
    max_n = sum(sorted(current_row[categories_list])[-N:])
    recommended_categories = get_top_N_text_categories(current_row['text'], vectorizer, categories_df, metrics, N)

    # Score actually achieved by the (at most N) recommended categories.
    points = sum(int(current_row[category]) for category in recommended_categories[:N])

    satisfaction = points/max_n
    metrics_satisfaction += satisfaction

    print(f"{i} - Satisfaction {satisfaction}")

  satisfactions.append(metrics_satisfaction/size)

  print(f"\nMean - Satisfaction {metrics_satisfaction/size}")

  print(f"______________________________")

In [None]:
# Mean satisfaction per measure, as percentages.
values = [score*100 for score in satisfactions]

fig, ax = plt.subplots(figsize=(9, 6))
bars = ax.bar(keys, values, color='teal')

# Write the rounded percentage just above each bar.
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, height + 0.5, round(height, 2), ha='center', va='bottom')

ax.set_xlabel('Zastosowana metryka')
ax.set_ylabel('Satysfakcja użytkowników na zbiorze testowym [%]')
ax.set_title('Satysfakcja użytkowników z 10 kategorii rekomendowanych im przez algorytm')

fig.tight_layout(pad=1)
fig.savefig('vector_comparison_satisfaction_10.png')
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

# The plot that follows this cell is titled and saved for 5 categories, and the
# 15-category case has its own cell further down, so this run uses N = 5.
N = 5
size = 26

# Mean satisfaction per similarity measure, in the order of the list iterated below.
satisfactions = []


for metrics in [opposite_euclidean_distance, cosine_similarity, jaccard_index, pearson_correlation]:
  print(f"\n__________{metrics.__name__}__________")

  metrics_satisfaction = 0

  for i in range(size):
    points = 0
    current_row = test_df.iloc[i]
    # Best achievable score for this row: the sum of its N highest category values.
    max_n = sum(sorted(current_row[categories_list])[-N:])
    recommended_categories = get_top_N_text_categories(current_row['text'], vectorizer, categories_df, metrics, N)

    # Score actually achieved by the (at most N) recommended categories.
    for category in recommended_categories[:N]:
      points += int(current_row[category])

    satisfaction = points/max_n
    metrics_satisfaction += satisfaction

    print(f"{i} - Satisfaction {satisfaction}")

  satisfactions.append(metrics_satisfaction/size)

  print(f"\nMean - Satisfaction {metrics_satisfaction/size}")

  print(f"______________________________")

In [None]:
# Mean satisfaction per measure, as percentages.
values = [val*100 for val in satisfactions]

plt.figure(figsize=(9, 6))
bars = plt.bar(keys, values, color='teal')

# Write the rounded percentage just above each bar.
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.5, round(yval, 2), ha='center', va='bottom')

plt.xlabel('Zastosowana metryka')
plt.ylabel('Satysfakcja użytkowników na zbiorze testowym [%]')
# Title and file name follow N, so they always match the run that produced `satisfactions`.
plt.title(f'Satysfakcja użytkowników z {N} kategorii rekomendowanych im przez algorytm')

plt.tight_layout(pad=1)
plt.savefig(f'vector_comparison_satisfaction_{N}.png')
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Number of recommended categories; also used in the plot title and file name below.
N = 15
size = 26

# Mean satisfaction per similarity measure, in the order of the list iterated below.
satisfactions = []


for metrics in [opposite_euclidean_distance, cosine_similarity, jaccard_index, pearson_correlation]:
  metrics_satisfaction = 0

  for i in range(size):
    points = 0
    current_row = test_df.iloc[i]
    # Best achievable score for this row: the sum of its N highest category values.
    max_n = sum(sorted(current_row[categories_list])[-N:])
    recommended_categories = get_top_N_text_categories(current_row['text'], vectorizer, categories_df, metrics, N)

    # Score actually achieved by the (at most N) recommended categories.
    for category in recommended_categories[:N]:
      points += int(current_row[category])

    satisfaction = points/max_n
    metrics_satisfaction += satisfaction

  satisfactions.append(metrics_satisfaction/size)

  # The original "\{" was an invalid escape sequence that printed a stray backslash.
  print(f"{metrics.__name__} - Satisfaction {metrics_satisfaction/size}")


values = [val*100 for val in satisfactions]

plt.figure(figsize=(9, 6))
bars = plt.bar(keys, values, color='teal')

# Write the rounded percentage just above each bar.
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.5, round(yval, 2), ha='center', va='bottom')

plt.xlabel('Zastosowana metryka')
plt.ylabel('Satysfakcja użytkowników na zbiorze testowym [%]')
plt.title(f'Satysfakcja użytkowników z {N} kategorii rekomendowanych im przez algorytm')

plt.tight_layout(pad=1)
plt.savefig(f'vector_comparison_satisfaction_{N}.png')
plt.show()