In [None]:
import pandas as pd

# Lookup table from the Polish category labels found in the CSV headers to the
# English identifiers that read_form_result uses as DataFrame column names.
# NOTE(review): 'Kopce/kurchany' looks like a misspelling of 'kurhany', but the
# key has to match the CSV header text exactly - confirm before changing it.
categories_dict = {
    'Koła widokowe': 'ferris_wheels',
    'Areny sportów zimowych': 'winter_sports',
    'Świątynie hinduizmu': 'hindu_temples',
    'Obiekty archeologiczne': 'archaeology',
    'Kopce/kurchany': 'tumuluses',
    'Muzea biograficzne': 'biographical_museums',
    'Muzea modowe': 'fashion_museums',
    'Parki rozrywki': 'amusement_parks',
    'Parki wodne': 'water_parks',
    'Parki miniatur': 'miniature_parks',
    'Baseny, termy i sauny': 'baths_and_saunas',
    'Ścianki wspinaczkowe': 'climbing',
    'Stadiony': 'stadiums',
    'Źródła': 'natural_springs',
    'Rzeki, kanały, wodospady': 'water',
    'Rezerwaty przyrody': 'nature_reserves',
    'Plaże': 'beaches',
    'Stacje kolejowe': 'railway_stations',
    'Zapory': 'dams',
    'Mennice': 'mints',
    'Kopalnie': 'mineshafts',
    'Muzea nauki i techniki': 'science_museums',
    'Kościoły': 'churches',
    'Katedry': 'cathedrals',
    'Klasztory': 'monasteries',
    'Synagogi': 'synagogues',
    'Meczety': 'mosques',
    'Zamki': 'castles',
    'Wieże obronne': 'fortified_towers',
    'Bunkry': 'bunkers',
    'Muzea militarne': 'military_museums',
    'Pola bitew': 'battlefields',
    'Cmentarze wojenne': 'war_graves',
    'Cmentarze': 'cemeteries',
    'Mauzolea': 'mausoleums',
    'Krypty': 'crypts',
    'Murale': 'wall_painting',
    'Fontanny': 'fountains',
    'Rzeźby': 'sculptures',
    'Zieleń miejska': 'gardens_and_parks',
    'Muzea archeologiczne': 'archaeological_museums',
    'Galerie sztuki': 'art_galleries',
    'Muzea historyczne': 'history_museums',
    'Muzea lokalne': 'local_museums',
    'Muzea narodowe': 'national_museums',
    'Planetaria': 'planetariums',
    'Zoo': 'zoos',
    'Akwaria': 'aquariums',
    'Drapacze chmur': 'skyscrapers',
    'Wieże (zegarowe, widokowe)': 'towers',
    'Budynki historyczne': 'historic_architecture',
    'Mosty': 'bridges',
    'Pomniki': 'monuments',
}

def read_form_result(csv_path, from_form=True):
  """Read a CSV of survey answers and normalise its column names.

  The first column is always discarded; when ``from_form`` is true the last
  six columns are discarded as well. Of the remaining columns the first two
  are renamed to ``text`` and ``date``. Every further column is reduced to
  the part of its header after the last ``[`` and then translated through
  ``categories_dict``; names that are not in the dictionary are kept as-is.

  Args:
    csv_path: Path (or buffer) handed straight to ``pd.read_csv``.
    from_form: When true, six trailing columns are dropped and the last
      character of every category header is cut off (presumably the closing
      ``]`` of a form export - confirm against the real file). When false,
      the header text after ``[`` is used unchanged.

  Returns:
    The loaded DataFrame with the renamed columns.
  """
  df = pd.read_csv(csv_path)
  if from_form:
    # NOTE(review): the six trailing columns are assumed to be form metadata.
    df = df.drop(df.columns[-6:], axis=1)

  df = df.drop(df.columns[0], axis=1)

  # Build the category names exactly once. The previous version rebuilt this
  # list inside a loop over cols[3:], which left it undefined (NameError)
  # whenever the file had fewer than two category columns.
  category_names = []
  for header in df.columns[2:]:
    name = header.split('[')[-1]
    if from_form:
      name = name[:-1]
    category_names.append(name)

  new_col_names = ['text', 'date'] + category_names
  df.columns = [categories_dict.get(cat, cat) for cat in new_col_names]

  return df

In [None]:
# Load both answer sources and report how many rows each one contains.
df_chat = read_form_result('chat_example.csv', from_form=False)
print(df_chat.shape[0])
df_form = read_form_result('form.csv', from_form=True)
print(df_form.shape[0])


In [None]:
from itertools import zip_longest

# Sanity check: print every position where the two frames disagree on the
# column name. zip_longest pads the shorter frame with None, so a difference
# in the number of columns is reported instead of raising IndexError (form
# narrower than chat) or going unnoticed (form wider than chat).
for form_col, chat_col in zip_longest(df_form.columns, df_chat.columns):
  if form_col != chat_col:
    print(form_col, chat_col)

In [None]:
# Hold out 16 chat rows (fixed seed) and remove them from df_chat.
# NOTE: df_chat is reassigned here, so re-running this cell samples again from
# the already reduced frame.
chat_sample = df_chat.sample(n=16, random_state=2023)
df_chat = df_chat.drop(index=chat_sample.index)
chat_sample

In [None]:
# Hold out 10 form rows (fixed seed) and remove them from df_form.
# NOTE: df_form is reassigned here, so re-running this cell samples again from
# the already reduced frame.
form_sample = df_form.sample(n=10, random_state=2023)
df_form = df_form.drop(index=form_sample.index)
form_sample

In [None]:
# Number of form rows left after the held-out sample was dropped.
len(df_form)

In [None]:
# Number of chat rows left after the held-out sample was dropped.
len(df_chat)

In [None]:
# Test set: the two held-out samples stacked together, then row order shuffled
# with a fixed seed and the index renumbered.
test_df = (
    pd.concat([form_sample, chat_sample], ignore_index=True)
    .sample(frac=1, random_state=2023)
    .reset_index(drop=True)
)
test_df

In [None]:
# Persist the test set; the index is left out of the file.
test_df.to_csv('test_df.csv', index=False)

In [None]:
# Training set: the remaining form and chat rows stacked together, then row
# order shuffled with a fixed seed and the index renumbered.
train_df = (
    pd.concat([df_form, df_chat], ignore_index=True)
    .sample(frac=1, random_state=2023)
    .reset_index(drop=True)
)
train_df.head(10)

In [None]:
# (rows, columns) of the training set before oversampling.
train_df.shape

In [None]:
! pip install googletrans==4.0.0-rc1

In [None]:
from googletrans import Translator
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


import torch

translator = Translator()

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook does not fail on `.to(device)` on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base").to(device)

def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """Generate paraphrases of ``question`` with the module-level seq2seq model.

    The text is prefixed with ``'paraphrase: '``, tokenised (truncated to
    ``max_length``), moved to the module-level ``device`` and passed to
    ``model.generate``. All remaining arguments are forwarded to ``generate``
    unchanged.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated sequences,
        with special tokens skipped.
    """
    prompt = f'paraphrase: {question}'
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        padding="longest",
        max_length=max_length,
        truncation=True,
    )
    prompt_ids = encoded.input_ids.to(device)

    # NOTE(review): temperature is forwarded although no sampling flag is set
    # in this call - confirm that it has any effect on the output.
    generation_options = dict(
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams,
        num_beam_groups=num_beam_groups,
        max_length=max_length,
        diversity_penalty=diversity_penalty,
    )
    generated = model.generate(prompt_ids, **generation_options)

    return tokenizer.batch_decode(generated, skip_special_tokens=True)


In [None]:
import random
import re

def shuffle_sentences(text):
    """Return ``text`` with its period-separated sentences in random order.

    The text is split on ``'.'``; fragments are stripped of surrounding
    whitespace and empty fragments are dropped, so the empty piece after a
    final period is never shuffled into the middle of the result. The
    sentences are joined with ``'. '`` and a closing period is restored when
    the input ended with one. Texts with fewer than two sentences are
    returned unchanged.

    Uses the global ``random`` state, so results depend on the current seed.
    """
    sentences = [part.strip() for part in text.split('.')]
    sentences = [sentence for sentence in sentences if sentence]

    # Nothing to reorder; keep the input exactly as it was.
    if len(sentences) < 2:
        return text

    random.shuffle(sentences)
    shuffled_text = '. '.join(sentences)
    if text.rstrip().endswith('.'):
        shuffled_text += '.'
    return shuffled_text

In [None]:
def oversample_data(df_to_oversample, num=3):
  """Augment a frame with back-translated, paraphrased and shuffled texts.

  For every row the ``text`` value is translated from Polish to English,
  ``num`` English paraphrases are generated with ``paraphrase``, and the plain
  English translation is appended as one more variant. All variants are
  translated back to Polish and passed through ``shuffle_sentences``. Each
  variant becomes a copy of the source row with ``text`` replaced.

  Args:
    df_to_oversample: DataFrame with a ``text`` column; it is not modified.
    num: Number of paraphrases requested per row.

  Returns:
    A DataFrame holding the original rows followed by the generated rows
    (``num + 1`` per original row when ``paraphrase`` returns ``num`` items),
    shuffled with a fixed seed and re-indexed. It carries an extra
    ``text_en`` column with the English translation of the source row.
  """
  # Work on a copy so the caller's frame stays untouched.
  df = df_to_oversample.copy()

  # Translate the texts to English.
  translator = Translator()
  df['text_en'] = [translator.translate(sentence, src='pl', dest='en').text for sentence in df['text']]

  # English paraphrases plus the plain translation. Iterating over the values
  # avoids the label lookup df['text_en'][i], which breaks (KeyError or wrong
  # row) when the index is not 0..n-1.
  paraphrases_en = []
  for text_en in df['text_en']:
    variants_en = list(paraphrase(text_en, num_return_sequences=num, no_repeat_ngram_size=3))
    variants_en.append(text_en)
    paraphrases_en.append(variants_en)

  # Translate every variant back to Polish and shuffle its sentences.
  paraphrases_pl_shfl = []
  for variants_en in paraphrases_en:
    variants_pl = [translator.translate(sentence, src='en', dest='pl').text for sentence in variants_en]
    paraphrases_pl_shfl.append([shuffle_sentences(text) for text in variants_pl])

  # One new row per variant. The explicit copy guarantees that assigning the
  # new text never writes through to ``df`` or to a previously appended row.
  oversampled_rows = []
  for position, variants_pl_shfl in enumerate(paraphrases_pl_shfl):
    for variant in variants_pl_shfl:
      new_row = df.iloc[position].copy()
      new_row['text'] = variant
      oversampled_rows.append(new_row)

  oversampled_df = pd.DataFrame(oversampled_rows)
  oversampled_df = oversampled_df.reset_index(drop=True)

  # Original rows first, then the generated ones; shuffle with a fixed seed.
  result_df = pd.concat([df, oversampled_df], ignore_index=True)
  result_df = result_df.sample(frac=1, random_state=2023).reset_index(drop=True)

  return result_df


In [None]:
# Augment the training set (calls the translation service and the paraphrase
# model for every row) and store the result without the index.
oversample_train_df = oversample_data(train_df)
oversample_train_df.to_csv('oversample_train_df.csv', index=False)

# Preview the first ten augmented rows.
oversample_train_df.head(10)

In [None]:
# (rows, columns) of the augmented training set.
oversample_train_df.shape

In [None]:
# Read the saved file back to check that it loads.
pd.read_csv('oversample_train_df.csv')