<a href="https://colab.research.google.com/github/DarioRugg/KickLearning/blob/main/feature_analysis/Text_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only) so files stored there can be read below.
# NOTE(review): the data path built in a later cell is relative ('./drive/...'), so it
# presumably relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%%capture
# Install the third-party packages this notebook imports; the %%capture line above hides pip's output.
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different releases.
!pip install pycld2
!pip install sentencepiece
!pip install transformers
!pip install polyglot
!pip install pyicu
!pip install pySBD

In [None]:
from os.path import join
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
import numpy as np
from polyglot.detect import Detector
from transformers.hf_api import HfApi
import torch
import pysbd
from polyglot.detect.base import logger as polyglot_logger
import time
# Only let polyglot's detector logger emit records at ERROR level or above.
polyglot_logger.setLevel("ERROR")

# Device used for the translation model and its inputs: first CUDA GPU if
# torch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [None]:
# Location of the scraped CSV, expressed relative to the current working directory.
file_name = 'file_0000_scraped.csv'
_data_dir_parts = ('.', 'drive', 'MyDrive', 'Project', 'Data', 'Scraped')
data_path = join(*_data_dir_parts)
file_path = join(data_path, file_name)
# Bare expression: displays the resolved path as the cell output.
file_path

In [None]:
# Load the scraped CSV; later cells read its 'story', 'risks' and 'creator_bio' columns.
df = pd.read_csv(file_path)

In [None]:
def detect_lang(inp):
  """Detect the languages of ``inp`` with polyglot.

  ``inp`` is converted with ``str()`` and passed through ``remove_bad_chars``
  before detection, so non-string values are accepted. ``quiet=True`` is
  forwarded to ``Detector``.

  Returns the detector's ``languages`` attribute; callers in this notebook
  read ``.code`` and ``.confidence`` from its entries.

  Note: ``remove_bad_chars`` is defined in a later cell, which must have run
  before this function is first called.
  """
  cleaned = remove_bad_chars(str(inp))
  detector = Detector(cleaned, quiet=True)
  return detector.languages

In [None]:
%%capture
import regex
 
# Unicode general categories Cc (control characters) and Cs (surrogates).
# The \p{...} syntax is why the third-party `regex` module is used here.
RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")


def remove_bad_chars(text):
    """Return ``text`` with every character matched by ``RE_BAD_CHARS`` deleted.

    ``text`` must already be a string.
    """
    cleaned = RE_BAD_CHARS.sub("", text)
    return cleaned
# One Series of detection results per free-text column, kept in this column order.
text_columns = ['story', 'risks', 'creator_bio']
langs = [df[column].apply(detect_lang) for column in text_columns]

In [None]:
# Place the per-column detection Series side by side, one DataFrame column each.
langs_df = pd.concat(langs, axis='columns')

In [None]:
def glob_lang(langs):
  """Collapse three language detections into a single label.

  ``langs`` must contain exactly three objects exposing ``.code`` and
  ``.confidence``; the first two are treated as the top and second
  detection. Any other length raises ``ValueError`` from the unpacking.

  Returns:
    * the top detection's code when its confidence is above 80 and the
      second detection's confidence is at most 10;
    * ``'multi_en'`` when the second detection's confidence is above 10 and
      ``'en'`` is among the codes whose confidence is above 10;
    * ``'multi'`` when the second detection's confidence is above 10 and
      ``'en'`` is not among those codes;
    * ``'unknown'`` in every other case.
  """
  detections = [(x.code, x.confidence) for x in langs]
  (top_code, top_conf), (_, second_conf), _ = detections

  # Dominant single language.
  if top_conf > 80 and second_conf <= 10:
    return top_code

  # A second language carries real weight: the text is mixed.
  if second_conf > 10:
    significant = [code for code, conf in detections if conf > 10]
    return 'multi_en' if 'en' in significant else 'multi'

  return 'unknown'
# Label every row from the detections of its 'story' text only
# (the 'risks' and 'creator_bio' detections in langs_df are not used here).
globs = langs_df['story'].apply(glob_lang)

In [None]:
# Store the per-row language label in df's 'lang' column (df is modified in place).
df['lang'] = globs

In [None]:
# Distinct labels of the form 'multi_<code>' (glob_lang emits 'multi_en' for
# mixed-language text that includes English).
multilanguages = {label for label in globs if 'multi_' in label}

In [None]:
# Query the Hugging Face hub for the available models and keep the ids that
# start with the organisation name below.
org = "Helsinki-NLP"
model_list = HfApi().model_list()
model_ids = [entry.modelId for entry in model_list if entry.modelId.startswith(org)]
# Part of each id after the first '/', i.e. the model name without the organisation.
suffix = [model_id.split('/')[1] for model_id in model_ids]

In [None]:
# Alternatives are tried in this order at every position; joined with '|' they
# form the single pattern compiled below.
_html_alternatives = (
  # "You'll need an HTML5 capable browser to see this content." with flexible whitespace
  r"\s*You\'ll\s*need\s*an\s*HTML5\s*capable\s*browser\s*to\s*see\s*this\s*content\s*\.\s*(\n\s)*\s*",
  # "Play" followed by at least one newline+whitespace pair
  r"\s*Play\s*(\n\s)+\s*",
  # "/ Indicator Bar <digit>"
  r"/\s*Indicator\s*Bar\s*\d\s*(\n\s)*",
  # "/ Animation Variables" followed by one newline+whitespace pair
  r"/\s*Animation\s*Variables\s*(\n\s)\s*",
  # "Replay with sound" / "Play with sound"
  r"(Replay|Play)\s*with\s*sound\s*(\n\s)+",
  # any run of newline+whitespace pairs
  r"(\n\s)+",
  # non-breaking space
  r"\xa0",
)
html_chars = regex.compile("|".join(_html_alternatives))


def remove_html_and_special(text):
  """Return ``str(text)`` with every match of ``html_chars`` removed."""
  as_text = str(text)
  return html_chars.sub("", as_text)

In [None]:
def primary_lang_filter(text, segmenter, lang):
  """Keep only the sentences of ``text`` detected as language ``lang``.

  ``str(text)`` is split with ``segmenter.segment``. A sentence is kept when
  the first entry returned by ``detect_lang`` for it has code ``lang`` and a
  confidence of at least 60. The kept sentences are joined with single
  spaces, in their original order.
  """
  sentences = segmenter.segment(str(text))
  kept = []
  for sentence in sentences:
    best = detect_lang(sentence)[0]
    if best.code == lang and best.confidence >= 60:
      kept.append(sentence)
  return ' '.join(kept)

In [None]:
# Work on a copy: the cleaning, filtering and translation cells below write to new_df, not df.
new_df = df.copy()

In [None]:
# Strip the boilerplate matched by html_chars from every story; values are converted with str() first.
new_df.loc[:,'story'] = new_df.loc[:,'story'].apply(remove_html_and_special)

In [None]:
# For every 'multi_<code>' label: drop the sentences that are not in <code>,
# then relabel the affected rows with the plain language code.
for l in multilanguages:
  lang = l.split('_')[1]
  # 'sv' is segmented with the 'da' rules; every other code is passed through as is.
  seg = pysbd.Segmenter(language='da' if lang == 'sv' else lang)
  # Row selection is computed once; 'lang' is only rewritten in the last statement.
  label_rows = new_df['lang'] == l
  temp = new_df.loc[label_rows]
  text_list = [str(story) for story in temp[['story']].to_numpy().flatten()]
  filtered = [primary_lang_filter(story, seg, lang) for story in text_list]
  new_df.loc[label_rows, 'story'] = np.array(filtered).reshape(temp['story'].shape)
  new_df.loc[label_rows, 'lang'] = lang

In [17]:
start = time.time()
# Upper bound on the number of sentences sent to the model in one generate() call.
batch_size = 60

# Languages to translate =
#   labels present in df['lang']
#   ∩ third '-'-separated field of the Helsinki-NLP model names that have 'en' in a later field
#   ∩ languages pysbd can segment,
# plus 'sv', which is segmented with pysbd's 'da' rules below
# (presumably because pysbd has no Swedish rules — confirm).
set_l = set(df['lang'])
set_l = set_l.intersection(set(map(lambda x: x.split('-')[2] if 'en' in x.split('-')[3:] else None, suffix)))
set_l = set_l.intersection(set(pysbd.languages.LANGUAGE_CODES.keys()))
set_l = set_l.union({'sv'})

for l in set_l:
  lang_rows = new_df['lang'] == l
  if not lang_rows.any():
    # 'sv' is added unconditionally above; don't download and load a model
    # when no row needs it.
    continue
  temp = new_df.loc[lang_rows]
  text_list = list(map(str, temp[['story']].to_numpy().flatten()))
  model_name = f'Helsinki-NLP/opus-mt-{l}-en'
  tokenizer = MarianTokenizer.from_pretrained(model_name)
  model = MarianMTModel.from_pretrained(model_name).to(device)
  seg = pysbd.Segmenter(language=l if l!='sv' else 'da')
  translations = []
  for text in text_list:
    inp = seg.segment(text)
    all_decoded = []
    # Translate the sentences in chunks of at most batch_size. When `inp` is
    # empty the loop body never runs and the story translates to ''.
    for first in range(0, len(inp), batch_size):
      batch = inp[first:first + batch_size]
      # truncation=True clips over-long sentences to the tokenizer's maximum
      # length instead of feeding the model more positions than it supports.
      tok = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)
      translated = model.generate(**tok)
      all_decoded.extend(tokenizer.decode(t, skip_special_tokens=True) for t in translated)
    translations.append(' '.join(all_decoded))
  # A plain list keeps the values as Python strings; np.array would allocate a
  # fixed-width unicode array padded to the longest story.
  new_df.loc[lang_rows, 'story'] = translations

print(f"Total time for translation was {round(time.time() - start, 2)} seconds")

In [None]:
%%capture
# Recompute the language label of every story in new_df (after cleaning, filtering and translation).
globs1 = new_df['story'].apply(detect_lang).apply(glob_lang)