In [None]:
import pandas as pd

# Maps the Polish category labels extracted from the form's column headers
# (the text inside the square brackets) to short English identifiers that
# read_form_result uses as DataFrame column names.
# NOTE(review): 'Kopce/kurchany' looks like a misspelling of 'kurhany'; the key
# has to match the form header exactly, so verify against the CSV before changing it.
categories_dict = {'Koła widokowe': 'ferris_wheels',
                  'Areny sportów zimowych': 'winter_sports',
                  'Świątynie hinduizmu': 'hindu_temples',
                  'Obiekty archeologiczne': 'archaeology',
                  'Kopce/kurchany': 'tumuluses',
                  'Muzea biograficzne': 'biographical_museums',
                  'Muzea modowe': 'fashion_museums',
                  'Parki rozrywki': 'amusement_parks',
                  'Parki wodne': 'water_parks',
                  'Parki miniatur': 'miniature_parks',
                  'Baseny, termy i sauny': 'baths_and_saunas',
                  'Ścianki wspinaczkowe': 'climbing',
                  'Stadiony': 'stadiums',
                  'Źródła': 'natural_springs',
                  'Rzeki, kanały, wodospady': 'water',
                  'Rezerwaty przyrody': 'nature_reserves',
                  'Plaże': 'beaches',
                  'Stacje kolejowe': 'railway_stations',
                  'Zapory': 'dams',
                  'Mennice': 'mints',
                  'Kopalnie': 'mineshafts',
                  'Muzea nauki i techniki': 'science_museums',
                  'Kościoły': 'churches',
                  'Katedry': 'cathedrals',
                  'Klasztory': 'monasteries',
                  'Synagogi': 'synagogues',
                  'Meczety': 'mosques',
                  'Zamki': 'castles',
                  'Wieże obronne': 'fortified_towers',
                  'Bunkry': 'bunkers',
                  'Muzea militarne': 'military_museums',
                  'Pola bitew': 'battlefields',
                  'Cmentarze wojenne': 'war_graves',
                  'Cmentarze': 'cemeteries',
                  'Mauzolea': 'mausoleums',
                  'Krypty': 'crypts',
                  'Murale': 'wall_painting',
                  'Fontanny': 'fountains',
                  'Rzeźby': 'sculptures',
                  'Zieleń miejska': 'gardens_and_parks',
                  'Muzea archeologiczne': 'archaeological_museums',
                  'Galerie sztuki': 'art_galleries',
                  'Muzea historyczne': 'history_museums',
                  'Muzea lokalne': 'local_museums',
                  'Muzea narodowe': 'national_museums',
                  'Planetaria': 'planetariums',
                  'Zoo': 'zoos',
                  'Akwaria': 'aquariums',
                  'Drapacze chmur': 'skyscrapers',
                  'Wieże (zegarowe, widokowe)': 'towers',
                  'Budynki historyczne': 'historic_architecture',
                  'Mosty': 'bridges',
                  'Pomniki': 'monuments'}

def read_form_result(csv_path):
  """Load the exported form CSV and normalise its column names.

  The first column and the last six columns of the export are dropped. Of the
  remaining columns the first two are renamed to 'text' and 'date'. Every
  other header is reduced to the text after its last '[' with the final
  character removed (e.g. 'Question [Label]' -> 'Label') and then translated
  through `categories_dict`; labels without a mapping are kept unchanged.

  Args:
    csv_path: Location of the CSV export, passed straight to `pd.read_csv`.

  Returns:
    pandas.DataFrame with the renamed columns.
  """
  df = pd.read_csv(csv_path)
  df = df.drop(df.columns[-6:], axis=1)
  df = df.drop(df.columns[0], axis=1)

  # Computed once, outside any loop, so a frame without category columns
  # yields an empty list instead of an undefined name.
  # `[:-1]` drops the final character, assumed to be the closing ']'.
  category_labels = [col.split('[')[-1][:-1] for col in df.columns[2:]]

  new_col_names = ['text', 'date'] + category_labels
  df.columns = [categories_dict.get(name, name) for name in new_col_names]

  return df

In [None]:
# Load the raw form export and show the resulting frame as a sanity check.
df = read_form_result('./wibit_form.csv')
df

In [None]:
! pip install stop_words

In [None]:
import re
from string import punctuation
import nltk
import spacy
nltk.download('stopwords')
from nltk.corpus import stopwords
from stop_words import get_stop_words

# Polish stop words from the `stop_words` package; preprocess_text filters tokens against this list.
stopwords_pl = get_stop_words("pl")

In [None]:
def preprocess_text(text, stopwords=None):
    """Normalise a raw text: strip punctuation and digits, lowercase, drop stop words.

    Steps, in order:
      1. remove every character listed in `string.punctuation` (ASCII only),
      2. remove all runs of digits,
      3. lowercase and split on whitespace,
      4. discard tokens found in the stop-word collection.

    Args:
        text: The string to clean.
        stopwords: Optional iterable of tokens to discard. When omitted, the
            module-level `stopwords_pl` list is used, as before.

    Returns:
        The remaining tokens joined by single spaces ('' when nothing is left).
    """
    if stopwords is None:
        stopwords = stopwords_pl
    # A set gives constant-time membership tests instead of scanning a list per token.
    stopword_set = set(stopwords)

    without_punct = text.translate(str.maketrans("", "", punctuation))
    without_digits = re.sub(r'\d+', '', without_punct)

    # split() with no argument collapses whitespace runs and never yields empty tokens.
    tokens = without_digits.lower().split()

    return " ".join(token for token in tokens if token not in stopword_set)

In [None]:
# Try the cleaning function on the response stored under index label 1.
preprocess_text(df.loc[1, 'text'])

In [None]:
# Inspect the loaded stop-word list.
stopwords_pl

In [None]:
! python -m spacy download pl_core_news_sm

In [None]:
import torch
import spacy

# Load the small Polish spaCy pipeline.
# NOTE(review): the variable name is a misspelling of `spacy_nlp`; it is kept
# because a later cell calls it under this exact name.
spcay_nlp = spacy.load("pl_core_news_sm")

In [None]:
# Clean every response in order and keep the results as a Series.
pre_prep_texts = pd.Series([preprocess_text(raw_text) for raw_text in df['text']])
pre_prep_texts.head(3)

In [None]:
# Run the spaCy pipeline over every cleaned text; each element becomes the object returned by `spcay_nlp`.
texts_spacy = pre_prep_texts.map(spcay_nlp)

# Keep only the lemma of each token, giving one list of lemmas per text.
lemmatized_texts = texts_spacy.map(lambda doc: [t.lemma_ for t in doc])
lemmatized_texts[:3]

In [None]:
# Lemma list of the text at index 1.
lemmatized_texts[1]

In [None]:
# One vector per text, read from the `.vector` attribute of each spaCy result.
spacy_vectors = texts_spacy.map(lambda doc: doc.vector)
spacy_vectors[:3]

In [None]:
# Length of the vector for the text at index 0.
len(spacy_vectors[0])

In [None]:
# Length of the vector for the text at index 1, for comparison with index 0.
len(spacy_vectors[1])

In [None]:
# Raw vector values for the text at index 1.
spacy_vectors[1]

In [None]:
# spaCy returns a 96-element vector for each text - this might be useful in future tasks

In [None]:
! pip install googletrans==4.0.0-rc1

In [None]:
from googletrans import Translator

translator = Translator()

# Translate each Polish response to English and store the result in a new column.
df['text_en'] = df['text'].map(
    lambda sentence: translator.translate(sentence, src='pl', dest='en').text
)

In [None]:
# Compare the original responses with their English translations.
df[['text', 'text_en']].head(5)

In [None]:
# https://huggingface.co/humarin/chatgpt_paraphraser_on_T5_base
# In Colab, check that a GPU runtime is selected; without CUDA the model runs on the CPU instead.

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Pick the GPU when one is available rather than hard-coding "cuda",
# which makes `.to(device)` fail on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base").to(device)

def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """Generate paraphrases of `question` with the module-level tokenizer and model.

    The input is prefixed with 'paraphrase: ', tokenized (truncated to
    `max_length`), moved to `device` and passed to `model.generate`. All
    remaining arguments are forwarded to `generate` unchanged; `max_length`
    limits both the tokenized input and the generated output.

    Returns:
        The generated sequences decoded by `tokenizer.batch_decode` with
        special tokens skipped.
    """
    prompt = f'paraphrase: {question}'
    encoded = tokenizer(
        prompt,
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    )
    prompt_ids = encoded.input_ids.to(device)

    generation_options = dict(
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams,
        num_beam_groups=num_beam_groups,
        max_length=max_length,
        diversity_penalty=diversity_penalty,
    )
    generated = model.generate(prompt_ids, **generation_options)

    return tokenizer.batch_decode(generated, skip_special_tokens=True)

In [None]:
# Try the paraphraser on the English text at index 1, asking for three variants.
paraphrase(df['text_en'][1],
           num_return_sequences=3,
           no_repeat_ngram_size=3)

In [None]:
# Three English paraphrases for every translated response, in row order.
paraphrases_en = [
    paraphrase(text_en, num_return_sequences=3, no_repeat_ngram_size=3)
    for text_en in df['text_en']
]

paraphrases_en[1]

In [None]:
# Translate every English paraphrase back to Polish, keeping the per-response grouping.
paraphrases_pl = [
    [translator.translate(sentence, src='en', dest='pl').text for sentence in paraph_texts_en]
    for paraph_texts_en in paraphrases_en
]

paraphrases_pl[1]

In [None]:
import random
import re

def shuffle_sentences(text):
    """Return `text` with its period-separated sentences in random order.

    The text is split on '.', each fragment is stripped of surrounding
    whitespace and empty fragments are discarded, so the empty piece after a
    final period and the spaces that follow each period are not shuffled
    along with the sentences. The fragments are rejoined with '. ', and a
    closing period is restored only when the input ended with one.

    Shuffling uses the module-level `random` state; call `random.seed(...)`
    beforehand for reproducible output.

    Args:
        text: The string whose sentences should be reordered.

    Returns:
        The reordered text, or `text` unchanged when it contains no non-empty
        sentence.
    """
    sentences = [fragment.strip() for fragment in text.split('.')]
    sentences = [sentence for sentence in sentences if sentence]
    if not sentences:
        return text

    random.shuffle(sentences)
    shuffled_text = '. '.join(sentences)
    if text.rstrip().endswith('.'):
        shuffled_text += '.'
    return shuffled_text


In [None]:
# Shuffle the sentence order inside every Polish paraphrase, keeping the per-response grouping.
paraphrases_pl_shfl = [
    [shuffle_sentences(text) for text in paraph_texts_pl]
    for paraph_texts_pl in paraphrases_pl
]

paraphrases_pl_shfl[1]

In [None]:
# Scratch check: edit a single row without touching `df`. `.copy()` makes the
# row an independent Series, so the assignment cannot write through to the
# frame and does not trigger SettingWithCopyWarning.
df_row = df.iloc[1].copy()
df_row['text'] = 'ala ma kota'

In [None]:
# Check what kind of object a single row selected with `.iloc` is.
type(df_row)

In [None]:
# Build the augmented dataset: for every original row, emit one copy per
# shuffled Polish paraphrase, with only the 'text' field replaced.
oversampled_df_arr = []

for i in range(len(df)):
  # Iterate over the paraphrases that actually exist rather than a fixed count.
  for shuffled_text in paraphrases_pl_shfl[i]:
    # Copy the row so the assignment below never writes back into `df`.
    df_row = df.iloc[i].copy()
    df_row['text'] = shuffled_text
    oversampled_df_arr.append(df_row)

oversampled_df = pd.DataFrame(oversampled_df_arr).reset_index(drop=True)
oversampled_df.head(5)

In [None]:
# Randomise the row order in place, then rebuild the frame with a fresh 0..n-1 index.
random.shuffle(oversampled_df_arr)
oversampled_df = pd.DataFrame(oversampled_df_arr).reset_index(drop=True)
oversampled_df.head(5)

In [None]:
# Remove the intermediate English translation column from the augmented frame.
oversampled_df = oversampled_df.drop(columns=['text_en'])
oversampled_df

In [None]:
# Write the augmented dataset to disk.
# NOTE(review): no `index=` argument is passed, so pandas' default applies;
# confirm whether the index column is wanted in the output file.
oversampled_df.to_csv('wibit_form_oversampled.csv')