<a href="https://colab.research.google.com/github/DarioRugg/KickLearning/blob/main/feature_analysis/Text_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%%capture
# Third-party packages used by this notebook. transformers, polyglot and pysbd
# (installed as pySBD) are imported directly in the next cell.
# pycld2, sentencepiece and pyicu are not imported directly here -- presumably
# runtime requirements of polyglot / the Marian tokenizer; TODO confirm.
# NOTE(review): no versions are pinned, so a re-run may pick up different releases.
!pip install pycld2
!pip install sentencepiece
!pip install transformers
!pip install polyglot
!pip install pyicu
!pip install pySBD

In [3]:
from os.path import join
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
import numpy as np
from polyglot.detect import Detector
import pysbd

In [4]:
# Folder holding the scraped CSV files (relative to the current working
# directory) and the name of the single file analysed in this notebook.
data_path = join('.', 'drive', 'MyDrive', 'Project', 'Data', 'Scraped')
file_name = 'file_0000_scraped.csv'

In [5]:
# Full path of the CSV to load; the bare name on the last line shows it as the cell output.
file_path = join(data_path, file_name)
file_path

'./drive/MyDrive/Project/Data/Scraped/file_0000_scraped.csv'

In [6]:
# Load the scraped CSV; later cells read its 'story', 'risks' and 'creator_bio' columns.
df = pd.read_csv(file_path)

In [7]:
%%capture
import regex

# Matches any character in Unicode general category Cc (control characters,
# which includes line breaks and tabs) or Cs (surrogate code points).
RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")

def remove_bad_chars(text: str) -> str:
    """Return ``text`` with every character matched by ``RE_BAD_CHARS`` deleted.

    Matches are removed outright (replaced by the empty string), so words that
    were separated only by a line break or tab end up joined together.
    Used below to clean text before it is handed to the language detector --
    presumably because the detector rejects such characters; TODO confirm.
    """
    return RE_BAD_CHARS.sub("", text)
# One Series per text column; each entry is the detector's `.languages` result
# for that row's cleaned text (non-string cells are stringified first).
langs = [
    df[column].apply(
        lambda value: Detector(remove_bad_chars(str(value)), quiet=True).languages
    )
    for column in ('story', 'risks', 'creator_bio')
]

In [8]:
# Put the per-column detection results side by side, one column per source column.
langs_df = pd.concat(langs, axis="columns")

In [9]:
def glob_lang(langs):
  """Collapse three detected-language candidates into a single label.

  Args:
    langs: iterable of exactly three objects, each exposing ``.code`` and
      ``.confidence``, ordered as the detector returned them.

  Returns:
    * the first candidate's code when its confidence is above 80 and the
      second candidate's confidence is at most 10;
    * ``'multi_en'`` when the second candidate's confidence is above 10 and
      ``'en'`` is among the candidates with confidence above 10;
    * ``'multi'`` when the second candidate's confidence is above 10 and
      ``'en'`` is not among those candidates;
    * ``'unknown'`` otherwise.

  Raises:
    ValueError: if ``langs`` does not yield exactly three candidates.
  """
  scored = [(entry.code, entry.confidence) for entry in langs]
  # Exactly three candidates are expected; anything else fails to unpack.
  (top_code, top_conf), (_, second_conf), _ = scored

  # A single dominant language.
  if top_conf > 80 and second_conf <= 10:
    return top_code

  # A meaningful second language: the text is treated as multilingual.
  if second_conf > 10:
    confident_codes = [code for code, conf in scored if conf > 10]
    return 'multi_en' if 'en' in confident_codes else 'multi'

  return 'unknown'
# Label each row from the languages detected in its 'story' text only.
globs = langs_df['story'].apply(glob_lang)

In [10]:
# Store the per-row language label (derived from the 'story' column) on the frame.
df['lang'] = globs

In [11]:
from transformers.hf_api import HfApi

# Query the model hub once and keep only the ids published under this organisation.
org = "Helsinki-NLP"
model_list = HfApi().model_list()
model_ids = [
    hub_id
    for hub_id in (entry.modelId for entry in model_list)
    if hub_id.startswith(org)
]
# Second '/'-separated component of each id, i.e. the model name without the organisation.
suffix = [hub_id.split('/')[1] for hub_id in model_ids]
# Ids whose model name is not entirely lower-case.
# NOTE(review): not referenced by any other visible cell.
old_style_multi_models = [org + '/' + name for name in suffix if name.lower() != name]

In [12]:
import torch

# Use the first CUDA GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"

In [13]:
# Show which device was selected.
device

'cuda:0'

In [14]:
# Work on a copy so the translation step below writes into `new_df` and leaves `df` as loaded.
new_df = df.copy()

In [18]:
import time


def _translate_sentences(sentences, tokenizer, model, batch_size):
  """Translate a list of sentences and return them joined by single spaces.

  The sentences are split with ``np.array_split`` into
  ``ceil(len(sentences) / batch_size)`` nearly equal chunks, so no single
  ``generate`` call receives more than ``batch_size`` sentences.

  Args:
    sentences: list of source-language strings (may be empty).
    tokenizer: tokenizer belonging to ``model``.
    model: translation model, already moved to the notebook-level ``device``.
    batch_size: maximum number of sentences per ``generate`` call.

  Returns:
    All translated sentences joined with ``' '``; ``''`` for an empty input.
  """
  if len(sentences) == 0:
    # Nothing to translate: do not hand an empty batch to the tokenizer/model.
    return ''
  n_chunks = int(np.ceil(len(sentences) / batch_size))
  pieces = []
  # dtype=object keeps the Python strings as-is instead of building a
  # fixed-width unicode array padded to the longest sentence.
  for chunk in np.array_split(np.array(sentences, dtype=object), n_chunks):
    # truncation=True caps each sentence at the tokenizer's maximum length so
    # an unusually long "sentence" cannot exceed what the model accepts.
    tok = tokenizer(chunk.tolist(), return_tensors="pt", padding=True, truncation=True).to(device)
    generated = model.generate(**tok).to('cpu')
    pieces.extend(tokenizer.decode(ids, skip_special_tokens=True) for ids in generated)
  return ' '.join(pieces)


start = time.time()
batch_size = 60

# Languages to translate: labels present in the data that also have a
# "<lang>-en" Helsinki-NLP model (third '-'-separated field of the model name,
# with 'en' somewhere after it) and a pysbd sentence segmenter.
set_l = set(df['lang'])
set_l = set_l.intersection({parts[2] for parts in (name.split('-') for name in suffix) if 'en' in parts[3:]})
set_l = set_l.intersection(set(pysbd.languages.LANGUAGE_CODES.keys()))
# 'sv' is always added back and segmented with the 'da' rules below --
# presumably because pysbd has no 'sv' entry; TODO confirm.
set_l = set_l.union({'sv'})

for l in set_l:
  lang_mask = new_df['lang'] == l
  if not lang_mask.any():
    # Possible for the force-added 'sv': skip instead of downloading a model
    # for zero rows (the old per-iteration `del` raised NameError here).
    continue
  text_list = [str(story) for story in new_df.loc[lang_mask, 'story']]
  model_name = f'Helsinki-NLP/opus-mt-{l}-en'
  tokenizer = MarianTokenizer.from_pretrained(model_name)
  model = MarianMTModel.from_pretrained(model_name).to(device)
  seg = pysbd.Segmenter(language=l if l!='sv' else 'da')
  translations = [
      _translate_sentences(seg.segment(text), tokenizer, model, batch_size)
      for text in text_list
  ]
  # Positional assignment: one translated story per selected row, in row order.
  new_df.loc[lang_mask, 'story'] = np.array(translations, dtype=object)
  # Release this language's model before loading the next one.
  del model, tokenizer
  torch.cuda.empty_cache()
print(f"Total time for translation was {round(time.time() - start, 2)} seconds")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=819654.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=788462.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1382006.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=42.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1132.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=299629317.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=781853.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=801883.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1501341.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=42.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1133.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=303294189.0, style=ProgressStyle(descri…


Total time for translation was 1070.5 seconds


In [None]:
%%capture
# Re-detect and re-label the language of every story in `new_df`
# (i.e. after the translation step has rewritten the translated rows).
globs1 = (
    new_df['story']
    .apply(lambda story: Detector(remove_bad_chars(str(story)), quiet=True).languages)
    .apply(glob_lang)
)