In [1]:
import pandas as pd
import json
import numpy as np
import gensim
from transvec.transformers import TranslationWordVectorizer

In [2]:
df = pd.read_json("input/cleaned_raw_text.json")

In [3]:
df.to_csv("input/cleaned_raw_text.csv")
df

Unnamed: 0,text,withheld_in_countries,hashtags,lang,possibly_sensitive,verified_account,followers_count,location,withheld_anywhere,neg,neu,pos,compound,popularity_score
0,کرونا ایسے تباہی پھیلا رہا ہے جیسے 22 سال کی ...,[IN],[],ur,0,False,4,,True,0.0,0.000,0.000,0.0000,0.000031
1,کرونا ایسے تباہی پھیلا رہا ہے جیسے 22 سال کی ...,[IN],[],ur,0,False,4,,True,0.0,0.000,0.000,0.0000,0.000031
2,جسکو ہمارے ملک کے لبدلز معصوم بنا کر پیش کررہ...,[IN],[],ur,0,False,371,Pakistan,True,0.0,0.000,0.000,0.0000,0.005671
3,"If it is blood clotting, wont it clot the blo...",[IN],[],en,0,False,2471,,True,0.0,0.916,0.084,0.3384,0.015019
4,"If it is blood clotting, wont it clot the blo...",[IN],[],en,0,False,2471,,True,0.0,0.916,0.084,0.3384,0.015019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70883,私に殺陣は楽しいんだよ🎶\nと教えてくれた方！先生が！\nなんと！新プロジェクトを立ち上げ...,[],[],ja,0,False,316,,False,0.0,0.000,0.000,0.0000,0.019546
70884,"todo el mundo manda foto de pinta el 31, mejo...",[],[],es,0,False,178,,False,0.0,0.000,0.000,0.0000,0.000600
70885,@iiSpez درر,[],[],ar,0,False,13770,,False,0.0,0.000,0.000,0.0000,0.002848
70886,ja participo des do ano passado,[],[],pt,0,False,404,,False,0.0,0.000,0.000,0.0000,0.001821


In [6]:
pd.read_csv("input/cleaned_raw_text.csv", low_memory = False)

Unnamed: 0.1,Unnamed: 0,text,withheld_in_countries,hashtags,lang,possibly_sensitive,verified_account,followers_count,location,withheld_anywhere,neg,neu,pos,compound,popularity_score
0,0,کرونا ایسے تباہی پھیلا رہا ہے جیسے 22 سال کی ...,['IN'],[],ur,0,False,4.0,,True,0.0,0.000,0.000,0.0000,0.000031
1,1,کرونا ایسے تباہی پھیلا رہا ہے جیسے 22 سال کی ...,['IN'],[],ur,0,False,4.0,,True,0.0,0.000,0.000,0.0000,0.000031
2,2,جسکو ہمارے ملک کے لبدلز معصوم بنا کر پیش کررہ...,['IN'],[],ur,0,False,371.0,Pakistan,True,0.0,0.000,0.000,0.0000,0.005671
3,3,"If it is blood clotting, wont it clot the blo...",['IN'],[],en,0,False,2471.0,,True,0.0,0.916,0.084,0.3384,0.015019
4,4,"If it is blood clotting, wont it clot the blo...",['IN'],[],en,0,False,2471.0,,True,0.0,0.916,0.084,0.3384,0.015019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70886,70883,私に殺陣は楽しいんだよ🎶\nと教えてくれた方！先生が！\nなんと！新プロジェクトを立ち上げ...,[],[],ja,0,False,316.0,,False,0.0,0.000,0.000,0.0000,0.019546
70887,70884,"todo el mundo manda foto de pinta el 31, mejo...",[],[],es,0,False,178.0,,False,0.0,0.000,0.000,0.0000,0.000600
70888,70885,@iiSpez درر,[],[],ar,0,False,13770.0,,False,0.0,0.000,0.000,0.0000,0.002848
70889,70886,ja participo des do ano passado,[],[],pt,0,False,404.0,,False,0.0,0.000,0.000,0.0000,0.001821


In [4]:
import requests
import os
import zipfile

def download_model(url, filename, removeFile = True):
    #download the model
    if not os.path.exists(f"input/{filename}"):
        with open("input/model-temp.zip", "wb") as f: #download
            f.write(requests.get(url).content)
            
        with zipfile.ZipFile("input/model-temp.zip") as zf: #extract the data for the model
            zf.extract("model.bin", "./input/")

        os.rename("input/model.bin", f"input/{filename}")
        os.remove("input/model-temp.zip")

    #make it
    model = gensim.models.KeyedVectors.load_word2vec_format(f"input/{filename}", binary=True)
    
    if removeFile:
        os.remove(f"input/{filename}")
    
    return model

In [5]:
def sentence2vec(sentence, model):
    vector = None
    n = 0
    
    def removePunctNumbers(sentence):
        toRemove = "²&~\"#'{}'([])|-`\_^@+1234567890°$£µ*%!§:/;.,?<>€"
        for char in toRemove: sentence = sentence.replace(char, "")
        return sentence.lower()
    
    for word in removePunctNumbers(sentence).split(" "):
        if word in model:
            if vector is None: vector = model[word]
            else: vector = vector + model[word]
            n += 1
    
    if n > 0:
        return vector / n
    else:
        return np.zeros(300, dtype=np.float32)

In [6]:
#from http://vectors.nlpl.eu/repository/
models = {
    "en": "http://vectors.nlpl.eu/repository/20/8.zip",
    #"ru": "http://vectors.nlpl.eu/repository/20/180.zip",
    "fr": "http://vectors.nlpl.eu/repository/20/43.zip", #too big
    "ur": "http://vectors.nlpl.eu/repository/20/72.zip",
    #"es": "http://vectors.nlpl.eu/repository/20/68.zip", #too big
    "ar": "http://vectors.nlpl.eu/repository/20/31.zip",
    #"in": "http://vectors.nlpl.eu/repository/20/50.zip", #too big
    #"ja": "http://vectors.nlpl.eu/repository/20/53.zip", #too big
    #"ko": "http://vectors.nlpl.eu/repository/20/55.zip", #too big
    #"pt": "http://vectors.nlpl.eu/repository/20/63.zip",
    #there is no model for thai in this repository
    #"tr": "http://vectors.nlpl.eu/repository/20/70.zip", #too big
    #und is probably "undetermined language"
}
referrence_words = ["water", "sun", "sky", "king", "night", "year"]
ref_trans = {
    "ru": ['вода', 'солнце', 'небо', 'король', 'сутки', 'год'],
    "fr": ["eau", "soleil", "ciel", "roi", "nuit", "année"],
    "ur": ['پانی', 'سورج', 'آسمان', 'بادشاہ', 'رات', 'سال'], #it seems to be reversed but it's not
    "es": ['agua', 'dom', 'el cielo', 'el rey', 'noche', 'año'],
    "ar": ["ماء", "الشمس", "سماء", "ملِك", "ليل", "عام"], #it seems to be reversed but it's not
    "in": ['panas', 'matahari', 'langit', 'raja', 'malam', 'tahun'],
    "ja": ['水', '日', '空', '王', '泊', '年'], 
    "ko": ['물', '일', '하늘', '왕', '박', '년'],
    "pt": ['água', 'eno', 'céu', 'cama', 'noite', 'ano'],
    "tr": ['su', 'güneş', 'gökyüzü', 'kral', 'gece', 'yıl']
}

In [7]:
#get the model for english
en_model = download_model(models["en"], "model_en.bin", removeFile=False)

#get the raw numpy array of the data and add 300 new features: the components of word2vec vectors
data = df.values
columns = list(df.columns)
del df
data = np.c_[data, np.zeros((data.shape[0], 300))]

In [8]:
#apply sentence2vec to tweets in a certain lang
def vectorizeLang(lang, model, data):
    for i, line in enumerate(data):
        text, langLine = line[0], line[3]
        if langLine == lang:
            data[i][-300:] = sentence2vec(text, model)

In [9]:
#let's begin with english
vectorizeLang("en", en_model, data)

#save the data in a json file, in case
columns += list([f"vec{i+1}" for i in range(300)])
df = pd.DataFrame(data, columns = columns)
df.to_json("input/clean_with_w2v.json")
del en_model
del df

In [10]:
#once it's done, it's time to do it for other languages
#since the models aren't in the same vector space, we use TranslationWordVectorizer

for lang in ref_trans:
    if lang not in models: continue
    print(lang)
    #download the model
    lang_model = download_model(models[lang], f"model_{lang}.bin", False)
    en_model = download_model(models["en"], "model_en.bin", False)
    print("ok")
    
    #time to compute the bilingual model -> model for lang that is in the same vector space
    #as the model of english
    train_ref = [(a, b) for a, b in zip(referrence_words, ref_trans[lang])]
    bilingual_model = TranslationWordVectorizer(en_model, lang_model).fit(train_ref)
    print("bilingual ok")
    
    #let's delete lang_model, it's no longer useful and takes a lot of memory
    del lang_model
    del en_model
    
    vectorizeLang(lang, bilingual_model, data)
    df = pd.DataFrame(data, columns = columns)
    df.to_json("input/clean_with_w2v.json")
    print("lang completed ---")
    
    del bilingual_model
    del df

fr
ok
bilingual ok
lang completed ---
ur
ok
bilingual ok
lang completed ---
ar
ok
bilingual ok
lang completed ---
