<a href="https://colab.research.google.com/github/DarioRugg/KickLearning/blob/main/feature_analysis/Text_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%%capture
!pip install pycld2
!pip install sentencepiece
!pip install transformers
!pip install polyglot
!pip install pyicu

In [3]:
from os.path import join
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
import numpy as np
from polyglot.detect import Detector


In [4]:
file_name = 'file_0000_scraped.csv'
data_path = join('.', 'drive', 'MyDrive', 'Project', 'Data', 'Scraped')

In [5]:
file_path = join(data_path, file_name)
file_path

'./drive/MyDrive/Project/Data/Scraped/file_0000_scraped.csv'

In [6]:
df = pd.read_csv(file_path)

In [7]:
%%capture
import regex

RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")

def remove_bad_chars(text):
    return RE_BAD_CHARS.sub("", text)
langs = [df[var].apply(lambda x: Detector(remove_bad_chars(str(x)), quiet=True).languages) for var in ['story', 'risks', 'creator_bio']]

In [8]:
langs_df = pd.concat(langs, axis=1)

In [41]:
def glob_lang(langs):
  lang1, lang2, lang3 = [{'code':x.code, 'conf':x.confidence} for x in langs]
  codes = [lang1['code'], lang2['code'], lang3['code']]
  confs = [lang1['conf'], lang2['conf'], lang3['conf']]
  if lang1['conf'] > 80 and lang2['conf'] <= 10:
    glob_l = lang1['code']
  elif lang2['conf'] > 10:
    glob_l = [lang for i, lang in enumerate(codes) if confs[i] > 10]
    if 'en' in glob_l:
      glob_l = 'multi_en'
    else:
      glob_l = 'multi'
  else:
    glob_l = 'unknown'
  return glob_l
globs = langs_df['story'].apply(glob_lang)

In [42]:
globs.value_counts()

en          14122
multi_en      428
es            154
unknown        90
fr             80
de             60
sv             22
it             19
da              6
nl              6
multi           5
no              4
ja              2
ga              1
mg              1
Name: story, dtype: int64

In [11]:
df['lang'] = globs

In [12]:
from transformers.hf_api import HfApi
model_list = HfApi().model_list()
org = "Helsinki-NLP"
model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
suffix = [x.split('/')[1] for x in model_ids]
old_style_multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]

In [13]:
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [14]:
new_df = df.copy()

In [15]:
import time
start = time.time()
buffer=30
for l in set(df['lang']).intersection(set(map(lambda x: x.split('-')[2] if x.split('-')[3] == 'en' else 'it', suffix))):
  temp = df[df['lang'] == l]
  text_list = list(map(str, temp[['story', 'risks', 'creator_bio']].to_numpy().flatten()))
  model_name = f'Helsinki-NLP/opus-mt-{l}-en'
  tokenizer = MarianTokenizer.from_pretrained(model_name)
  model = MarianMTModel.from_pretrained(model_name).to(device)
  decoded = []
  if len(text_list) > buffer:
    for i in range(len(text_list)//buffer):
      translated = model.generate(**tokenizer(text_list[i*buffer:(i+1)*buffer], return_tensors="pt", padding=True, truncation=True).to(device))
      decoded += [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    translated = model.generate(**tokenizer(text_list[(i+1)*buffer:], return_tensors="pt", padding=True, truncation=True).to(device))
    decoded += [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
  else:
    translated = model.generate(**tokenizer(text_list, return_tensors="pt", padding=True, truncation=True).to(device))
    decoded += [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
  temp[['story', 'risks', 'creator_bio']] = np.array(decoded).reshape(temp[['story', 'risks', 'creator_bio']].shape)
  new_df[new_df['lang'] == l] = temp
print(f"Total time for translation was {round(time.time() - start, 2)} seconds")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=839226.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=803492.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355878.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=44.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1145.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=295018473.0, style=ProgressStyle(descri…




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=802397.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=778395.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1339166.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=42.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1132.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=300827685.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=805781.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=796289.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1227005.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=42.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1133.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=290924733.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=796845.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=768489.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1273232.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=42.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1132.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=297928209.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=819654.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=788462.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1382006.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=42.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1132.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=299629317.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=825924.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=801636.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1590040.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=44.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1189.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=312087523.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=781853.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=801883.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1501341.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=42.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1133.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=303294189.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=813709.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=789549.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2369833.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=42.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1133.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=343642677.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=815294.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=790085.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1291486.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=42.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1133.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=294507525.0, style=ProgressStyle(descri…


Total time for translation was 2115.54 seconds


In [32]:
%%capture
globs1 = new_df['story'].apply(lambda x: Detector(remove_bad_chars(str(x)), quiet=True).languages).apply(glob_lang)