## DOC2VEC


In [None]:
import pandas as pd
import tensorflow as tf
import sklearn
import numpy as np


# Seed NumPy's and TensorFlow's global random generators with the same fixed
# value so that repeated runs start from the same random state.
np.random.seed(2023)
tf.random.set_seed(2023)

In [None]:
# Load the training table from CSV. Later cells read its 'text' and
# 'prep_text' columns and build the training targets from the remaining ones.
df = pd.read_csv('oversample_stemmed_train_df.csv')
df.head(3)

In [None]:
# Keep the 'prep_text' column aside before it is dropped from df; it is the
# input later passed to the loaded vectorizer.
texts = df['prep_text']
texts

In [None]:
# Remove the two text columns so that only the label columns remain in df.
df = df.drop(columns=['text', 'prep_text'])
df

In [None]:
# Inspect the names of the columns left after dropping the text columns.
df.columns

In [None]:
# Row-wise mean across the columns of df, stored as an extra 'mean' column.
# A later cell compares every label column against this per-row value.
df['mean'] = df.mean(axis=1)
df['mean']

In [None]:
# Quick look at the table now that the 'mean' column has been added.
df.head(2)

In [None]:
# Label column names: every column of df except the helper 'mean' column.
cols = [name for name in df.columns if name != 'mean']

In [None]:
# Binarise every label column against the row's own mean in one vectorised
# comparison: True where the value is at least the row mean, False otherwise.
pref_df = df[cols].ge(df['mean'], axis=0)

# Display the 0/1 view; pref_df itself stays boolean.
pref_df.astype(int)

In [None]:
# Load the test table from CSV; later cells read its 'text' column and drop
# 'text' and 'date' to obtain the test label columns.
test_df = pd.read_csv('test_df.csv')
test_df.head(3)

In [None]:
! pip install stop_words

In [None]:
! pip install pyMorfologik

In [None]:
import re
from string import punctuation
import nltk
import spacy
from stop_words import get_stop_words
from pyMorfologik import Morfologik
from pyMorfologik.parsing import ListParser
import string


# Shared pyMorfologik objects used by preprocess_text: the stemmer and the
# parser that is passed to stemmer.stem().
parser = ListParser()
stemmer = Morfologik()

# Stop-word list requested for the language code "pl"; preprocess_text removes
# these tokens after stemming.
stopwords_pl = get_stop_words("pl")


def preprocess_text(text):
    """Normalise a raw text and return it as a space-joined string of stems.

    Steps, in order: replace ASCII punctuation with spaces, remove digit
    runs, collapse whitespace, strip, lower-case, stem using the module-level
    ``stemmer`` and ``parser``, then drop tokens found in ``stopwords_pl``
    as well as empty tokens.

    Args:
        text: Raw input string.

    Returns:
        The remaining tokens joined with single spaces.
    """
    # Map every character of string.punctuation to a space.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    cleaned = text.translate(punct_to_space)
    cleaned = re.sub(r'\d+', '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = cleaned.strip().lower()

    analysed = stemmer.stem([cleaned], parser)

    # For each entry use the first key of entry[1] when it has any keys,
    # otherwise fall back to entry[0].
    # NOTE(review): entry[0] looks like the original word and entry[1] a
    # mapping keyed by candidate stems - confirm against pyMorfologik.
    tokens = []
    for entry in analysed:
        candidates = list(entry[1].keys())
        tokens.append(candidates[0] if candidates else entry[0])

    kept = [tok for tok in tokens if tok not in stopwords_pl and tok != '']
    return " ".join(kept)


In [None]:
# Run every raw test text through preprocess_text and keep the results as a list.
test_texts = list(map(preprocess_text, test_df['text']))
test_texts[:3]

In [None]:
# Test label table: the test frame without its 'text' and 'date' columns.
y_test = test_df.drop(columns=['text', 'date'])
y_test.head(3)

In [None]:
# Row-wise mean over the columns of y_test, stored as an extra 'mean' column;
# the next cell compares each label column against it.
y_test['mean'] = y_test.mean(axis=1)

In [None]:
# Binarise the test labels the same way as the training labels: 1 where a
# value is at least the row mean, 0 otherwise, for the columns listed in cols.
y_test_scaled = y_test[cols].ge(y_test['mean'], axis=0).astype(int)
y_test_scaled.head(3)

## TF-IDF

In [None]:
import joblib

# Load a stored vectorizer from disk and turn the training texts into feature
# vectors with it (transform only; nothing is fitted in this notebook).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load artifacts that come from a trusted source.
loaded_vectorizer = joblib.load('tfidf_vectorizer_wibit.joblib')
vectors = loaded_vectorizer.transform(texts)


In [None]:
# Print the first row of the transformed matrix.
print(vectors[0])

In [None]:
# Dimensions of the transformed matrix.
vectors.shape

In [None]:
# Preview only the first few rows as a dense array: densifying the whole
# matrix just to display it allocates a full copy that is thrown away
# (the full dense array is created separately as X).
vectors[:5].toarray()

In [None]:
# Dense copy of the feature matrix, used as the model input.
X = vectors.toarray()

In [None]:
# Training targets: the boolean label table cast to 0/1 integers.
y = pref_df.astype(int)
y.head(3)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.optimizers import Adam


# Derive the layer sizes from the training data instead of hard-coding them,
# so the network always matches the width of the feature matrix and the
# number of label columns it is trained on.
INPUT_SHAPE = X.shape[1]   # number of features per sample
NUM_LABELS = y.shape[1]    # one output unit per label column

# Fully connected network that narrows from 1024 to 32 hidden units, with
# dropout after the first, fourth and sixth hidden layers. The final layer
# uses a sigmoid so that every label gets its own score.
model = Sequential([
    Dense(1024, activation='relu', input_shape=(INPUT_SHAPE,)),
    Dropout(0.3),
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(NUM_LABELS, activation='sigmoid')
])


# Binary cross-entropy pairs with the per-label sigmoid outputs.
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# Train on the full feature matrix and label table; result keeps the object
# returned by fit().
result = model.fit(x=X, y=y, batch_size=32, epochs=20)

In [None]:
from keras.models import save_model, load_model

# Save the trained model under the path 'tfidf_bigger_nn'; the zip cell
# further down treats this path as a directory.
model.save('tfidf_bigger_nn')
# Alternative kept for reference: save under a '.keras' file name instead.
# model.save('tfidf_bigger_nn.keras')

In [None]:
# The cell below zips the saved model directory so that it can be downloaded easily.

In [None]:
 # Shell command: recursively zip the tfidf_bigger_nn/ directory into tfidf_bigger_nn.zip.
 ! zip -r tfidf_bigger_nn.zip tfidf_bigger_nn/

In [None]:
# Reload the model from the path it was saved to, replacing the in-memory model.
model = load_model('tfidf_bigger_nn')

In [None]:
new_text = "Bardzo lubię zwiedzać wszelkiego rodzaju zamki, lochy i krypty. Uwielbiam też różnego rodzaju opuszczone miejsca i ruiny. W wolnym czasie chętnie jeżdzę na rolkach i słucham popu. Posiłki najbardziej lubię jeść w swoim domu, ale czasami jadam też w kfc. Interesuję się piłką nożna i lekkoatletyką"
# Run the raw text through preprocess_text before vectorizing, as is done for
# the test-set texts. The model is trained on the 'prep_text' column, so
# feeding raw, unstemmed text would not match the training input.
# NOTE(review): presumably 'prep_text' was produced by this same pipeline - confirm.
# NOTE: this rebinds test_texts (until now the preprocessed test set) to a
# single-element list holding only this sample.
test_texts = [preprocess_text(new_text)]
X_test = loaded_vectorizer.transform(test_texts)
X_test = X_test.toarray()

In [None]:
# Score the vectorized input with the model; the bare y_pred shows the raw outputs.
y_pred = model.predict(X_test)
y_pred

In [None]:
def get_attr_from_vector(vector, threshold=0.5):
  """Binarise a sequence of scores against a cut-off.

  Args:
    vector: Iterable of numeric scores.
    threshold: Cut-off; a score equal to the threshold counts as 1.

  Returns:
    A list with 1 for every score >= threshold and 0 for every other score,
    in the same order as the input.
  """
  return [int(score >= threshold) for score in vector]

In [None]:
# Show the 0/1 decisions for the first prediction at a 0.5 cut-off.
print(get_attr_from_vector(y_pred[0], threshold=0.5))

In [None]:
# Score the input again, binarise every output row at 0.5 and label the
# resulting 0/1 columns with the names stored in cols.
y_pred = model.predict(X_test)
y_pred_cat = pd.DataFrame(
    [get_attr_from_vector(vector, threshold=0.5) for vector in y_pred],
    columns=cols,
)
y_pred_cat.head(3)