<a href="https://colab.research.google.com/github/DarioneNazionale/KickLearning/blob/main/feature_analysis/Text_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [51]:
%%capture
!pip install pycld2
!pip install sentencepiece
!pip install transformers
!pip install polyglot
!pip install pyicu

In [3]:
from os.path import join
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
import numpy as np
from polyglot.detect import Detector


In [4]:
file_name = 'file_0000_scraped.csv'
data_path = join('.', 'drive', 'MyDrive', 'Project', 'Data', 'Scraped')

In [5]:
file_path = join(data_path, file_name)
file_path

'./drive/MyDrive/Project/Data/Scraped/file_0000_scraped.csv'

In [6]:
df = pd.read_csv(file_path)

In [50]:
%%capture
import regex

RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")

def remove_bad_chars(text):
    return RE_BAD_CHARS.sub("", text)
langs = [df[var].apply(lambda x: Detector(remove_bad_chars(str(x)), quiet=True).languages) for var in ['story', 'risks', 'creator_bio']]

In [73]:
def language_tuples(langs):
  
print([(obj.code, obj.confidence) for obj in langs[0].loc[0]])

[('fr', 99.0), ('un', 0.0), ('un', 0.0)]


In [34]:
len([i for i, x in enumerate(langs) if x[0].confidence in range(80,100) and x[0].code != 'en'])

371

In [29]:
[print(x) for x in langs[9776]]

name: English     code: en       confidence:  28.0 read bytes:  1095
name: Spanish     code: es       confidence:  26.0 read bytes:   593
name: Tagalog     code: tl       confidence:  25.0 read bytes:   931


[None, None, None]

In [31]:
df.loc[12633].project_url

'https://www.kickstarter.com/projects/1114450323/perfect-watches-kahiki-the-spirit-of-hawaii?ref=discovery_category_newest'

In [10]:
l_names = [x[0] for x in langs]
freqs = {y: l_names.count(y) for y in set(l_names)}

In [11]:
df['lang'] = l_names

In [12]:
df.head()

Unnamed: 0,pledged,state,usd_pledged,deadline,id,state_changed_at,created_at,fx_rate,disable_communication,goal,backers_count,launched_at,currency,country,category,sub_category,project_url,creator_id,year,image,has_video,story,risks,creator_bio,n_tiers,tiers_values,n_images,n_gifs,n_websites,fb_linked,n_collab,collab_names,lang
0,150.0,successful,181.139294,1526787000,334251380,1526787001,1523996277,1.197522,False,150.0,8,1525276383,EUR,FR,Film & Video,Festivals,https://www.kickstarter.com/projects/sm4shostu...,1844740738,2021,https://ksr-ugc.imgix.net/assets/020/924/958/b...,False,Mais qui sommes nous ? \n \n \n Installation O...,Même si les fonds nécessaires ne sont pas atte...,Canapé Fight est une page autour du partage et...,3.0,"[2, 5, 40]",12.0,0.0,1.0,False,0.0,[],name: French code: fr confidence: ...
1,318.0,successful,318.0,1484851112,424206421,1484851112,1480346662,1.0,False,200.0,6,1483555112,USD,US,Fashion,Apparel,https://www.kickstarter.com/projects/stevewort...,1339310935,2021,https://ksr-ugc.imgix.net/assets/014/689/968/e...,True,\n \n \n Black stick figure design will be scr...,I can't see any risks or challenges once I'm f...,Artist Steve Worthington has lived and worked ...,3.0,"[20, 23, 43]",15.0,1.0,6.0,True,0.0,[],name: English code: en confidence: ...
2,16143.0,failed,16143.0,1470940250,1507824580,1470940250,1412445314,1.0,False,80000.0,94,1468348250,USD,US,Film & Video,Comedy,https://www.kickstarter.com/projects/concordmo...,1854263224,2021,https://ksr-ugc.imgix.net/assets/011/895/127/e...,True,My Advice is the story of a man trying to do w...,PRE-PRODUCTION\r\nDuring pre-production we put...,Thanks for supporting independent filmmaking!\...,16.0,"[3, 5, 7, 20, 25, 25, 25, 30, 35, 50, 150, 275...",9.0,0.0,4.0,True,0.0,[],name: English code: en confidence: ...
3,3776.0,successful,4039.798874,1482083487,100089599,1482083487,1477479904,1.197522,False,3286.0,44,1479491487,EUR,FR,Fashion,Apparel,https://www.kickstarter.com/projects/190850067...,1908500678,2021,https://ksr-ugc.imgix.net/assets/014/282/394/8...,True,NB : English speakers please switch on subtitl...,"The brand has been existing for over 4 years, ...",PAVÉ. has been existing for 4 years. We are bo...,12.0,"[1, 5, 35, 70, 70, 80, 95, 130, 170, 220, 290,...",42.0,0.0,1.0,True,0.0,[],name: English code: en confidence: ...
4,26668.0,successful,26668.0,1541203200,1672479848,1541203200,1527023959,1.0,False,25000.0,345,1538662232,USD,US,Film & Video,Documentary,https://www.kickstarter.com/projects/230068012...,230068012,2021,https://ksr-ugc.imgix.net/assets/021/321/161/f...,True,\n \n \n \n \n The United States currently ran...,"As mentioned in the section above, fundraising...",Hillary Bachelder (director) is a graduate of ...,9.0,"[10, 25, 50, 100, 250, 500, 1, 5, 10]",23.0,0.0,1.0,False,0.0,[],name: English code: en confidence: ...


In [13]:
from transformers.hf_api import HfApi
model_list = HfApi().model_list()
org = "Helsinki-NLP"
model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
suffix = [x.split('/')[1] for x in model_ids]
old_style_multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]

In [14]:
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [15]:
buffer=30
for l in set(df['lang']).intersection(set(map(lambda x: x.split('-')[2] if x.split('-')[3] == 'en' else 'it', suffix))):
  if l != 'en':
    temp = df[df['lang'] == l]
    text_list = list(map(str, temp[['story', 'risks', 'creator_bio']].to_numpy().flatten()))
    model_name = f'Helsinki-NLP/opus-mt-{l}-en'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)
    decoded = []
    if len(text_list) > buffer:
      for i in range(len(text_list)//buffer):
        translated = model.generate(**tokenizer(text_list[i*buffer:(i+1)*buffer], return_tensors="pt", padding=True, truncation=True).to(device))
        decoded += [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
      translated = model.generate(**tokenizer(text_list[(i+1)*buffer:], return_tensors="pt", padding=True, truncation=True).to(device))
      decoded += [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    else:
      translated = model.generate(**tokenizer(text_list, return_tensors="pt", padding=True, truncation=True).to(device))
      decoded += [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    temp[['story', 'risks', 'creator_bio']] = np.array(decoded).reshape(temp[['story', 'risks', 'creator_bio']].shape)
    df[df['lang'] == l] = temp
  break  

In [16]:
temp

NameError: ignored

In [None]:
df.loc[11721]['story']

In [None]:
l