## DOC2VEC


In [None]:
import pandas as pd
import tensorflow as tf
import sklearn
import numpy as np


np.random.seed(2023)
tf.random.set_seed(2023)

In [None]:
df = pd.read_csv('oversample_stemmed_train_df.csv')
df.head(3)

In [None]:
texts = df['prep_text']
texts

In [None]:
df = df.drop(['text', 'prep_text'], axis=1)
df

In [None]:
df.columns

In [None]:
df['mean'] = df.mean(axis=1)
df['mean']

In [None]:
df.head(2)

In [None]:
cols = list(df.columns)
cols.remove('mean')

In [None]:
# Boolean preference matrix: a PoI column is marked True when its value is
# at least the row's mean (one vectorised comparison for all columns).
pref_df = df[cols].ge(df['mean'], axis=0)

# Shown as 0/1 for readability; pref_df itself keeps its boolean values.
pref_df.astype(int)

In [None]:
# Number of liked PoI columns per row, computed once and reused below.
liked_counts = pref_df.sum(axis=1)

print('Like PoI only if higher than mean.')
# Average over the actual number of rows instead of a hard-coded 2300.
print(f'mean liked: {liked_counts.mean()}')
print(f'max liked: {liked_counts.max()}')
print(f'min liked: {liked_counts.min()}')

In [None]:
# Row-wise median over the PoI columns only. `cols` excludes the derived
# 'mean' column, which would otherwise be counted as one more value.
df['median'] = df[cols].median(axis=1)
df.head(2)

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

doc2vec_model = Doc2Vec.load('doc2vec_model_wibit')

text_vectors = [doc2vec_model.infer_vector(text.split(' ')) for text in texts]

In [None]:
text_vectors[1]

In [None]:
X = pd.DataFrame(text_vectors)
X.head(3)

In [None]:
y = pref_df.astype(int)
y.head(3)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


# Take the input width from the feature matrix so the network always
# matches the vectors it is trained on.
INPUT_SHAPE = X.shape[1]


# Small feed-forward multi-label classifier: one sigmoid unit per label
# column of `y`.
model = Sequential([
    Dense(96, activation='relu', input_shape=(INPUT_SHAPE,)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(y.shape[1], activation='sigmoid')
])


# binary_crossentropy scores every output unit as its own yes/no decision.
model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
result = model.fit(X, y, epochs=20, batch_size=32, validation_split=0.15)

In [None]:
# Smoke test: predict on a single all-zero vector. The width is read from the
# model itself, and a NumPy array is passed rather than a nested Python list.
pred_Y = model.predict(np.zeros((1, model.input_shape[1])))

In [None]:
pred_Y[0]

In [None]:
def get_attr_from_vector(vector, threshold=0.5):
    """Binarize scores: 1 where a score reaches ``threshold``, otherwise 0.

    Returns a list of ints in the same order as ``vector``.
    """
    flags = []
    for score in vector:
        if score >= threshold:
            flags.append(1)
        else:
            flags.append(0)
    return flags


In [None]:
np.array(get_attr_from_vector(pred_Y[0]))

In [None]:
test_df = pd.read_csv('test_df.csv')
test_df.head(3)

In [None]:
! pip install stop_words pyMorfologik

In [None]:
import re
import string
from stop_words import get_stop_words
from pyMorfologik import Morfologik
from pyMorfologik.parsing import ListParser


parser = ListParser()
stemmer = Morfologik()

stopwords_pl = get_stop_words("pl")


def preprocess_text(text):
    """Normalize, stem and stop-word-filter a single text.

    Punctuation is replaced by spaces, digit runs are removed, whitespace is
    collapsed, and the text is stripped and lower-cased. The result is passed
    to the module-level ``stemmer``/``parser``; each word is replaced by the
    first key of the mapping the stemmer returns for it, or kept unchanged
    when that mapping is empty. Tokens found in ``stopwords_pl`` and empty
    tokens are dropped.

    Returns the remaining tokens joined by single spaces.
    """
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    new_text = text.translate(translator)
    new_text = re.sub(r'\d+', '', new_text)
    new_text = re.sub(r'\s+', ' ', new_text)
    new_text = new_text.strip().lower()

    stems = stemmer.stem([new_text], parser)
    # Each entry holds the surface word at index 0 and a mapping at index 1;
    # the mapping's first key is used as the token, else the word itself.
    tokens = [next(iter(entry[1].keys()), entry[0]) for entry in stems]

    # A set makes the stop-word test O(1) per token.
    stop_words = set(stopwords_pl)
    filtered_tokens = [
        token for token in tokens
        if token not in stop_words and token != ''
    ]

    return " ".join(filtered_tokens)


In [None]:
test_texts = [preprocess_text(text) for text in test_df['text']]
test_texts[:3]

In [None]:
y_test = test_df.drop(['text', 'date'], axis=1)
y_test.head(3)

In [None]:
y_test['mean'] = y_test.mean(axis=1)

In [None]:
# Binarize the test labels: 1 where a PoI column is at least the row's mean,
# 0 otherwise (single vectorised comparison over all columns in `cols`).
y_test_scaled = y_test[cols].ge(y_test['mean'], axis=0).astype(int)
y_test_scaled.head(3)

In [None]:
X_test = pd.DataFrame([doc2vec_model.infer_vector(text.split(' ')) for text in test_texts])
X_test.head(2)

In [None]:
y_pred = model.predict(X_test)

In [None]:
y_pred_cat = pd.DataFrame([get_attr_from_vector(vector, threshold=0.5) for vector in y_pred])

In [None]:
y_pred_cat.columns = cols

In [None]:
y_pred_cat.head(3)

In [None]:
y_test_scaled.head(3)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Row-wise evaluation of the doc2vec network: for every test row, compare
# the predicted 0/1 label vector with the true one, then average over rows.
total_ac = 0
total_rec = 0
total_prec = 0

# Evaluate every test row rather than a hard-coded count.
size = len(y_test_scaled)

for i in range(size):
    tmp_true = y_test_scaled.iloc[i]
    tmp_pred = y_pred_cat.iloc[i]

    # scikit-learn metrics expect (y_true, y_pred). Passing them the other
    # way round swaps recall and precision.
    ac_score = accuracy_score(tmp_true, tmp_pred)
    # zero_division=0 returns the same 0.0 as the default for rows with no
    # positives, without emitting a warning per row.
    rec_score = recall_score(tmp_true, tmp_pred, zero_division=0)
    prec_score = precision_score(tmp_true, tmp_pred, zero_division=0)

    total_ac += ac_score
    total_rec += rec_score
    total_prec += prec_score

    print(f"{i} - Accuracy: {ac_score} | Recall: {rec_score} | Precision: {prec_score}")

# Final means, computed once after the loop; reused by the comparison plots.
doc2vec_simple_nn_accuracy = total_ac/size
doc2vec_simple_nn_recall = total_rec/size
doc2vec_simple_nn_precision = total_prec/size


print(f"\nMean - Accuracy: {total_ac/size} | Recall: {total_rec/size} | Precision: {total_prec/size}")

In [None]:
# Accuracy per PoI column over the whole test set, then the average across
# columns. accuracy_score is symmetric in its two arguments.
per_col_acc = {col: accuracy_score(y_test_scaled[col], y_pred_cat[col]) for col in cols}

for col, ac_score in per_col_acc.items():
    print(f"Accuracy of predicting {col}: {ac_score}")

total_ac = sum(per_col_acc.values())

print(f"\nMean accuracy in test dataset: {total_ac/len(cols)}")

## TF-IDF

In [None]:
import joblib

loaded_vectorizer = joblib.load('tfidf_vectorizer_wibit.joblib')
vectors = loaded_vectorizer.transform(texts)


In [None]:
print(vectors[0])

In [None]:
vectors.shape

In [None]:
vectors.toarray()

In [None]:
X = vectors.toarray()

In [None]:
# Take the input width from the TF-IDF matrix so the network follows the
# vectorizer's vocabulary size instead of a hard-coded number.
INPUT_SHAPE = X.shape[1]

# Feed-forward multi-label classifier: one sigmoid unit per label column
# of `y`.
model = Sequential([
    Dense(512, activation='relu', input_shape=(INPUT_SHAPE,)),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(y.shape[1], activation='sigmoid')
])


# binary_crossentropy scores every output unit as its own yes/no decision.
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
result = model.fit(X, y, epochs=20, batch_size=32, validation_split=0.15)

In [None]:
X_test = loaded_vectorizer.transform(test_texts)
X_test = X_test.toarray()
X_test[:3]

In [None]:
y_pred = model.predict(X_test)
y_pred_cat = pd.DataFrame([get_attr_from_vector(vector, threshold=0.5) for vector in y_pred])
y_pred_cat.columns = cols
y_pred_cat.head(3)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Row-wise evaluation of the small TF-IDF network: for every test row,
# compare the predicted 0/1 label vector with the true one, then average.
total_ac = 0
total_rec = 0
total_prec = 0

# Evaluate every test row rather than a hard-coded count.
size = len(y_test_scaled)

for i in range(size):
    tmp_true = y_test_scaled.iloc[i]
    tmp_pred = y_pred_cat.iloc[i]

    # scikit-learn metrics expect (y_true, y_pred). Passing them the other
    # way round swaps recall and precision.
    ac_score = accuracy_score(tmp_true, tmp_pred)
    # zero_division=0 returns the same 0.0 as the default for rows with no
    # positives, without emitting a warning per row.
    rec_score = recall_score(tmp_true, tmp_pred, zero_division=0)
    prec_score = precision_score(tmp_true, tmp_pred, zero_division=0)

    total_ac += ac_score
    total_rec += rec_score
    total_prec += prec_score

    print(f"{i} - Accuracy: {ac_score} | Recall: {rec_score} | Precision: {prec_score}")

# Final means, computed once after the loop; reused by the comparison plots.
tfidf_simple_nn_accuracy = total_ac/size
tfidf_simple_nn_recall = total_rec/size
tfidf_simple_nn_precision = total_prec/size


print(f"\nMean - Accuracy: {total_ac/size} | Recall: {total_rec/size} | Precision: {total_prec/size}")

In [None]:
# Accuracy per PoI column over the whole test set, then the average across
# columns. accuracy_score is symmetric in its two arguments.
per_col_acc = {col: accuracy_score(y_test_scaled[col], y_pred_cat[col]) for col in cols}

for col, ac_score in per_col_acc.items():
    print(f"Accuracy of predicting {col}: {ac_score}")

total_ac = sum(per_col_acc.values())

print(f"\nMean accuracy in test dataset: {total_ac/len(cols)}")

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.optimizers import Adam


# Take the input width from the TF-IDF matrix so the network follows the
# vectorizer's vocabulary size instead of a hard-coded number.
INPUT_SHAPE = X.shape[1]

# Deeper feed-forward multi-label classifier with dropout between groups of
# dense layers; one sigmoid unit per label column of `y`.
model = Sequential([
    Dense(1024, activation='relu', input_shape=(INPUT_SHAPE,)),
    Dropout(0.3),
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(y.shape[1], activation='sigmoid')
])


# binary_crossentropy scores every output unit as its own yes/no decision.
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
result = model.fit(X, y, epochs=20, batch_size=32)

In [None]:
# from tensorflow.keras.models import save_model, load_model

# model.save('tfidf_bigger_nn')

In [None]:
# ! zip -r tfidf_bigger_nn.zip tfidf_bigger_nn/

In [None]:
# model = load_model('tfidf_bigger_nn')

In [None]:
y_pred = model.predict(X_test)
y_pred

In [None]:
print(get_attr_from_vector(y_pred[0], threshold=0.5))

In [None]:
y_pred = model.predict(X_test)
y_pred_cat = pd.DataFrame([get_attr_from_vector(vector, threshold=0.5) for vector in y_pred])
y_pred_cat.columns = cols
y_pred_cat.head(3)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Row-wise evaluation of the bigger TF-IDF network: for every test row,
# compare the predicted 0/1 label vector with the true one, then average.
total_ac = 0
total_rec = 0
total_prec = 0

# Evaluate every test row rather than a hard-coded count.
size = len(y_test_scaled)

for i in range(size):
    tmp_true = y_test_scaled.iloc[i]
    tmp_pred = y_pred_cat.iloc[i]

    # scikit-learn metrics expect (y_true, y_pred). Passing them the other
    # way round swaps recall and precision.
    ac_score = accuracy_score(tmp_true, tmp_pred)
    # zero_division=0 returns the same 0.0 as the default for rows with no
    # positives, without emitting a warning per row.
    rec_score = recall_score(tmp_true, tmp_pred, zero_division=0)
    prec_score = precision_score(tmp_true, tmp_pred, zero_division=0)

    total_ac += ac_score
    total_rec += rec_score
    total_prec += prec_score

    print(f"{i} - Accuracy: {ac_score} | Recall: {rec_score} | Precision: {prec_score}")

# Final means, computed once after the loop; reused by the comparison plots.
tfidf_bigger_nn_accuracy = total_ac/size
tfidf_bigger_nn_recall = total_rec/size
tfidf_bigger_nn_precision = total_prec/size


print(f"\nMean - Accuracy: {total_ac/size} | Recall: {total_rec/size} | Precision: {total_prec/size}")

In [None]:
# Accuracy per PoI column over the whole test set, then the average across
# columns. accuracy_score is symmetric in its two arguments.
per_col_acc = {col: accuracy_score(y_test_scaled[col], y_pred_cat[col]) for col in cols}

for col, ac_score in per_col_acc.items():
    print(f"Accuracy of predicting {col}: {ac_score}")

total_ac = sum(per_col_acc.values())

print(f"\nMean accuracy in test dataset: {total_ac/len(cols)}")

In [None]:
# new_text = "Bardzo lubię zwiedzać wszelkiego rodzaju zamki, lochy i krypty. Uwielbiam też różnego rodzaju opuszczone miejsca i ruiny. W wolnym czasie chętnie jeżdzę na rolkach i słucham popu. Posiłki najbardziej lubię jeść w swoim domu, ale czasami jadam też w kfc. Interesuję się piłką nożna i lekkoatletyką"
# test_texts = [new_text]
# X_test = loaded_vectorizer.transform(test_texts)
# X_test = X_test.toarray()

## GradientBoosting

In [None]:
y

In [None]:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

# clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=5, random_state=2023).fit(X, y['amusement_parks'])
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=2023).fit(X, y['historic_architecture'])

In [None]:
# NOTE(review): despite the variable name, the cell above fits `clf` on
# y['historic_architecture'], so these are predictions for that column.
y_amusements_park = clf.predict(X_test)

In [None]:
y_amusements_park

In [None]:
y_pred_category = clf.predict(X_test)
accuracy_score(y_pred_category, y_test_scaled['historic_architecture'])

In [None]:
# Train one independent binary GradientBoosting classifier per column in
# `cols`, stored under the column name.
poi_clf = {}

for poi_type in cols:
  # NOTE(review): learning_rate is 0.5 here but 0.1 in the single-column
  # trial fitted earlier on 'historic_architecture' -- confirm which is intended.
  clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=5, random_state=2023).fit(X, y[poi_type])
  poi_clf[poi_type] = clf

In [None]:
def predict_with_poi_clf(poi_clf, poi_categories, X):
    """Run each category's classifier on ``X`` and collect the outputs.

    ``poi_clf`` maps a category name to an object exposing ``predict``.
    The returned DataFrame has one column per entry of ``poi_categories``,
    in that order, holding that classifier's predictions for ``X``.
    """
    return pd.DataFrame(
        {name: poi_clf[name].predict(X) for name in poi_categories}
    )

In [None]:
y_pred_cat = predict_with_poi_clf(poi_clf, cols, X_test)
y_pred_cat.head(3)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Row-wise evaluation of the per-category GradientBoosting classifiers: for
# every test row, compare the predicted 0/1 label vector with the true one,
# then average over rows.
total_ac = 0
total_rec = 0
total_prec = 0

# Evaluate every test row rather than a hard-coded count.
size = len(y_test_scaled)

for i in range(size):
    tmp_true = y_test_scaled.iloc[i]
    tmp_pred = y_pred_cat.iloc[i]

    # scikit-learn metrics expect (y_true, y_pred). Passing them the other
    # way round swaps recall and precision.
    ac_score = accuracy_score(tmp_true, tmp_pred)
    # zero_division=0 returns the same 0.0 as the default for rows with no
    # positives, without emitting a warning per row.
    rec_score = recall_score(tmp_true, tmp_pred, zero_division=0)
    prec_score = precision_score(tmp_true, tmp_pred, zero_division=0)

    total_ac += ac_score
    total_rec += rec_score
    total_prec += prec_score

    print(f"{i} - Accuracy: {ac_score} | Recall: {rec_score} | Precision: {prec_score}")

# Final means, computed once after the loop; reused by the comparison plots.
tfidf_gb_accuracy = total_ac/size
tfidf_gb_recall = total_rec/size
tfidf_gb_precision = total_prec/size


print(f"\nMean - Accuracy: {total_ac/size} | Recall: {total_rec/size} | Precision: {total_prec/size}")

In [None]:
# Accuracy per PoI column over the whole test set, then the average across
# columns. accuracy_score is symmetric in its two arguments.
per_col_acc = {col: accuracy_score(y_test_scaled[col], y_pred_cat[col]) for col in cols}

for col, ac_score in per_col_acc.items():
    print(f"Accuracy of predicting {col}: {ac_score}")

total_ac = sum(per_col_acc.values())

print(f"\nMean accuracy in test dataset: {total_ac/len(cols)}")

In [None]:
tested_options_acc = {
    'doc2vec_simple_nn_accuracy': doc2vec_simple_nn_accuracy,
    'tfidf_simple_nn_accuracy': tfidf_simple_nn_accuracy,
    'tfidf_bigger_nn_accuracy': tfidf_bigger_nn_accuracy,
    'tfidf_gb_accuracy': tfidf_gb_accuracy,
}

tested_options_acc

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# X-axis labels, one per model; their order must match the insertion order
# of the metric dictionaries they are plotted against.
keys = ['doc2vec,\nmała sieć\nneuronowa', 'TF-IDF,\nmała sieć\nneuronowa', 'TF-IDF,\nwiększa sieć\nneuronowa', 'TF-IDF,\nGradientBoosting\nwiele klasyfikatorów\n']
# Fractions -> percentages.
values = [val*100 for val in tested_options_acc.values()]


fig, ax = plt.subplots(figsize=(9, 6))
bars = ax.bar(keys, values, color='teal')

# Write each bar's height just above it.
for bar in bars:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, yval + 0.5, round(yval, 2), ha='center', va='bottom')

ax.set_xlabel('Model')
ax.set_ylabel('Dokładność (Accuracy) na zbiorze testowym [%]')
ax.set_title('Dokładność (Accuracy) predykcji dokonanych za pomocą różnych modeli')

fig.tight_layout(pad=1)
fig.savefig('ml_classifier_accuracy.png')
plt.show()

In [None]:
tested_options_recall = {
    'doc2vec_simple_nn_recall': doc2vec_simple_nn_recall,
    'tfidf_simple_nn_recall': tfidf_simple_nn_recall,
    'tfidf_bigger_nn_recall': tfidf_bigger_nn_recall,
    'tfidf_gb_recall': tfidf_gb_recall,
}

tested_options_recall

In [None]:
# Fractions -> percentages, in the dictionary's insertion order.
values = [val*100 for val in tested_options_recall.values()]


fig, ax = plt.subplots(figsize=(9, 6))
bars = ax.bar(keys, values, color='teal')

# Write each bar's height just above it.
for bar in bars:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, yval + 0.5, round(yval, 2), ha='center', va='bottom')

ax.set_xlabel('Model')
ax.set_ylabel('Czułość (Recall) na zbiorze testowym [%]')
ax.set_title('Czułość (Recall) predykcji dokonanych za pomocą różnych modeli')

fig.tight_layout(pad=1)
fig.savefig('ml_classifier_recall.png')
plt.show()

In [None]:
tested_options_precision = {
    'doc2vec_simple_nn_precision': doc2vec_simple_nn_precision,
    'tfidf_simple_nn_precision': tfidf_simple_nn_precision,
    'tfidf_bigger_nn_precision': tfidf_bigger_nn_precision,
    'tfidf_gb_precision': tfidf_gb_precision,
}

tested_options_precision

In [None]:
# Fractions -> percentages, in the dictionary's insertion order.
values = [val*100 for val in tested_options_precision.values()]


fig, ax = plt.subplots(figsize=(9, 6))
bars = ax.bar(keys, values, color='teal')

# Write each bar's height just above it.
for bar in bars:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, yval + 0.5, round(yval, 2), ha='center', va='bottom')

ax.set_xlabel('Model')
ax.set_ylabel('Precyzja (Precision) na zbiorze testowym [%]')
ax.set_title('Precyzja (Precision) predykcji dokonanych za pomocą różnych modeli')

fig.tight_layout(pad=1)
fig.savefig('ml_classifier_precision.png')
plt.show()