In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = SentenceTransformer('sentence-transformers/LaBSE')

In [3]:
TAJIK_PATH = '..\\data\\tajik_titles_translated.csv'
PERSIAN_PATH = '..\\data\\persian_titles_translated.csv'

In [4]:
tajik = pd.read_csv(TAJIK_PATH, names=['id', 'title', 'some'])
persian = pd.read_csv(PERSIAN_PATH, names=['id', 'title', 'some'])

In [5]:
tajik['title'] = tajik['title'].astype(str) + tajik['some'].astype(str)
tajik = tajik.drop(columns=['some'])

persian['title'] = persian['title'].astype(str) + persian['some'].astype(str)
persian = persian.drop(columns=['some'])


In [6]:
TAJIK_EMBEDINGS_PATH = '..//data//tajik_titles_translated_embedings.csv'

tajik['embeding'] = tajik['title'].apply(lambda x: model.encode(x))
tajik.to_csv(TAJIK_EMBEDINGS_PATH)
del tajik

In [7]:
PERSIAN_EMBEDINGS_PATH = '..//data//persian_titles_translated_embedings.csv'

persian['embeding'] = persian['title'].apply(lambda x: model.encode(x))
persian.to_csv(PERSIAN_EMBEDINGS_PATH)
del persian

In [8]:
import ast

def embiding_parse(embeding_str: str):
    parsed_embeding_str = embeding_str\
        .replace('\n', ',')\
        .replace(' ', ',')\
        .replace(',,,,', ',')\
        .replace(',,,', ',')\
        .replace(',,', ',')\
        .replace('[,', '[')
    return ast.literal_eval(parsed_embeding_str)

In [9]:
tajik = pd.read_csv(TAJIK_EMBEDINGS_PATH)
tajik['embeding'] = tajik['embeding'].apply(lambda x: embiding_parse(x))

In [10]:
persian = pd.read_csv(PERSIAN_EMBEDINGS_PATH)
persian['embeding'] = persian['embeding'].apply(lambda x: embiding_parse(x))

In [11]:
def find_simulary(embeding):
    calc = persian['embeding'].apply(lambda x: np.dot(x, embeding))
    return [calc.max(), persian.loc[calc.idxmax()]['id']]

def find_simulary_tajik(embeding):
    calc = tajik['embeding'].apply(lambda x: np.dot(x, embeding))
    return [calc.max(), tajik.loc[calc.idxmax()]['id']]

In [88]:
tajik.head(5)['embeding'].apply(find_simulary)

0    [0.7108551938220737, 10845]
1    [0.6146326864538375, 21702]
2    [0.4404012685231342, 20310]
3     [0.430628157451453, 19806]
4    [0.5018471236307425, 12116]
Name: embeding, dtype: object

In [12]:
persian_prob = persian['embeding'].apply(find_simulary_tajik)

In [13]:
persian['res'] = persian_prob
persian['prob'] = persian['res'].apply(lambda x: x[0])
persian['tajik_id'] = persian['res'].apply(lambda x: x[1])

In [14]:
persian = persian.drop(columns=['res'])

In [15]:
persian.drop(columns=['Unnamed: 0'])

Unnamed: 0,id,title,embeding,prob,persian_id
0,10058,Eight Lebanese soldiers were killed near the b...,"[0.0208818298, 0.0724064931, 0.0123083172, -0....",0.946966,1316
1,10030,Uzbek officials opposed the Raghon Foundation nan,"[0.00152296491, 0.0177091397, -0.0251617655, -...",0.634935,1319
2,10022,Mehr Housing and Possibility of Reproduction '...,"[0.0236953348, 0.0207191147, -0.00671503972, -...",0.425583,208
3,1006,Iran -Saudi politicization and the future of o...,"[-0.0309686083, -0.0066529098, -0.0728808865, ...",0.580893,7984
4,10046,Afghan presidential reaction to playing audio ...,"[0.00393272564, 0.0227101017, -0.0412510596, -...",0.535213,971
...,...,...,...,...,...
36558,9995,World War I: Europe enters warnan,"[-0.03708263, -0.03282133, 0.00300379, 0.00223...",0.530968,1500
36559,9996,Baroneses resigned in protest of British polic...,"[0.0170705188, 0.0330301486, -0.00415912643, -...",0.844342,1312
36560,9997,Xavi Hernandez's farewell to the Spanish natio...,"[-0.0511766598, -0.00135263626, 0.0403328836, ...",0.496292,11023
36561,9998,Isis;Common danger for Baghdad and Erbilnan,"[0.0223881118, -0.0308610909, -0.0562509298, -...",0.529912,6401


In [16]:
persian.to_csv('..//data//persian_titles_calced.csv')

In [17]:
persian['prob'].describe()

count    36563.000000
mean         0.584612
std          0.104237
min          0.293799
25%          0.509856
50%          0.572873
75%          0.644929
max          1.000000
Name: prob, dtype: float64

In [18]:
PERSIAN_LINKS = '../data/beauty_links_persian_cleared.csv'
TAJIK_LINKS = '../data/beauty_links_tajik_cleared.csv'

In [19]:
persian_links = pd.read_csv(PERSIAN_LINKS)
tajik_links = pd.read_csv(TAJIK_LINKS)

In [20]:
persian_links = persian_links.drop(columns=['Unnamed: 0'])
tajik_links = tajik_links.drop(columns=['Unnamed: 0'])


In [21]:
tajik_links

Unnamed: 0,id,BeautyLink
0,0,https://www.bbc.com/tajik/news/2015/03/150331_...
1,1,https://www.bbc.com/tajik/news/2015/03/150331_...
2,2,https://www.bbc.com/tajik/news/2015/03/150330_...
3,3,https://www.bbc.com/tajik/news/2015/03/150329_...
4,4,https://www.bbc.com/tajik/news/2015/03/150329_...
...,...,...
12313,12330,https://www.bbc.com/tajik/news/2009/09/090922_...
12314,12331,https://www.bbc.com/tajik/news/2009/09/090922_...
12315,12332,https://www.bbc.com/tajik/news/2009/09/090922_...
12316,12333,https://www.bbc.com/tajik/news/2009/09/090922_...


In [22]:
persian = persian.join(persian_links.set_index('id'), on='id')
persian = persian.drop(columns=['Unnamed: 0'])

In [23]:
persian

Unnamed: 0,id,title,embeding,prob,persian_id,BeautyLink
0,10058,Eight Lebanese soldiers were killed near the b...,"[0.0208818298, 0.0724064931, 0.0123083172, -0....",0.946966,1316,https://www.bbc.com/persian/world/2014/08/1408...
1,10030,Uzbek officials opposed the Raghon Foundation nan,"[0.00152296491, 0.0177091397, -0.0251617655, -...",0.634935,1319,https://www.bbc.com/persian/world/2014/08/1408...
2,10022,Mehr Housing and Possibility of Reproduction '...,"[0.0236953348, 0.0207191147, -0.00671503972, -...",0.425583,208,https://www.bbc.com/persian/business/2014/08/1...
3,1006,Iran -Saudi politicization and the future of o...,"[-0.0309686083, -0.0066529098, -0.0728808865, ...",0.580893,7984,https://www.bbc.com/persian/iran/2015/03/15030...
4,10046,Afghan presidential reaction to playing audio ...,"[0.00393272564, 0.0227101017, -0.0412510596, -...",0.535213,971,https://www.bbc.com/persian/afghanistan/2014/0...
...,...,...,...,...,...,...
36558,9995,World War I: Europe enters warnan,"[-0.03708263, -0.03282133, 0.00300379, 0.00223...",0.530968,1500,https://www.bbc.com/persian/world/2014/08/1408...
36559,9996,Baroneses resigned in protest of British polic...,"[0.0170705188, 0.0330301486, -0.00415912643, -...",0.844342,1312,https://www.bbc.com/persian/world/2014/08/1408...
36560,9997,Xavi Hernandez's farewell to the Spanish natio...,"[-0.0511766598, -0.00135263626, 0.0403328836, ...",0.496292,11023,https://www.bbc.com/persian/sport/2014/08/1408...
36561,9998,Isis;Common danger for Baghdad and Erbilnan,"[0.0223881118, -0.0308610909, -0.0562509298, -...",0.529912,6401,https://www.bbc.com/persian/blogs/2014/08/1408...


In [24]:
persian = persian.rename(columns={"BeautyLink": "link"})

In [27]:
persian = persian.rename(columns={"persian_id": "tajik_id"})

In [25]:
tajik_links

Unnamed: 0,id,BeautyLink
0,0,https://www.bbc.com/tajik/news/2015/03/150331_...
1,1,https://www.bbc.com/tajik/news/2015/03/150331_...
2,2,https://www.bbc.com/tajik/news/2015/03/150330_...
3,3,https://www.bbc.com/tajik/news/2015/03/150329_...
4,4,https://www.bbc.com/tajik/news/2015/03/150329_...
...,...,...
12313,12330,https://www.bbc.com/tajik/news/2009/09/090922_...
12314,12331,https://www.bbc.com/tajik/news/2009/09/090922_...
12315,12332,https://www.bbc.com/tajik/news/2009/09/090922_...
12316,12333,https://www.bbc.com/tajik/news/2009/09/090922_...


In [28]:
persian = persian.set_index('tajik_id').join(tajik_links.set_index('id'))

In [29]:
persian = persian.rename(columns={"BeautyLink": "tajik_link"})

In [30]:
persian

Unnamed: 0,id,title,embeding,prob,link,tajik_link
1,26706,Flying Homay on the European Nights;Overview o...,"[-0.0635568723, -0.00986633915, -0.0550234765,...",0.406428,https://www.bbc.com/persian/arts/2012/11/12110...,https://www.bbc.com/tajik/news/2015/03/150331_...
1,33564,Iran in the past week;May 1nan,"[0.0262209456, 0.0188858435, -0.0455361269, -0...",0.504243,https://www.bbc.com/persian/iran/2011/05/11051...,https://www.bbc.com/tajik/news/2015/03/150331_...
1,37125,Iran in the past week;May 1nan,"[0.0262209456, 0.0188858435, -0.0455361269, -0...",0.504243,https://www.bbc.com/persian/iran/2010/04/10042...,https://www.bbc.com/tajik/news/2015/03/150331_...
1,37218,Iran in the past week;May 1nan,"[0.0262209456, 0.0188858435, -0.0455361269, -0...",0.504243,https://www.bbc.com/persian/iran/2010/04/10042...,https://www.bbc.com/tajik/news/2015/03/150331_...
1,8915,International Crisis Group: Iran and P5+1 unde...,"[0.00212123571, 0.0163279343, -0.0451188162, -...",0.619521,https://www.bbc.com/persian/iran/2014/08/14082...,https://www.bbc.com/tajik/news/2015/03/150331_...
...,...,...,...,...,...,...
12331,531,The trial of two French police on the unrest i...,"[0.0348514356, 0.0483686663, -0.0152767906, -0...",0.585112,https://www.bbc.com/persian/world/2015/03/1503...,https://www.bbc.com/tajik/news/2009/09/090922_...
12331,6171,French police warning about 'scary clowns'nan,"[0.0362609997, 0.00225078152, -0.00292626, -0....",0.547861,https://www.bbc.com/persian/world/2014/10/1410...,https://www.bbc.com/tajik/news/2009/09/090922_...
12331,8508,French police attempt to prevent illegal migra...,"[0.02891889, 0.02081439, -0.01406876, -0.04142...",0.678811,https://www.bbc.com/persian/world/2014/09/1409...,https://www.bbc.com/tajik/news/2009/09/090922_...
12331,9473,French spy that wrote the story of the monkeysnan,"[0.00686233491, 0.0355358012, 0.013678777, -0....",0.492170,https://www.bbc.com/persian/arts/2014/08/14081...,https://www.bbc.com/tajik/news/2009/09/090922_...


In [31]:
persian.index.name = 'tajik_id'
persian = persian.drop(columns=['embeding'])
persian.to_csv('../data/matched-links.csv')

In [38]:
persian[persian['prob'] > .64].shape

(9623, 5)

In [None]:
# https://www.bbc.com/persian/sport/2013/09/130921_l51_wrestling_greco_roman_day2#TWEET897522
# https://www.bbc.com/tajik/institutional/2012/08/120806_rm_olympic_soryan_gold_wrestling
# 0.76

In [3]:
import pandas as pd

tajik = pd.read_csv('../data/matched_links.csv')

In [4]:
tajik.describe()

Unnamed: 0,persian_id,id,prob
count,9412.0,9412.0,9412.0
mean,15249.805461,5712.878028,0.589102
std,5984.667738,3653.695713,0.108111
min,11.0,0.0,0.317549
25%,12161.25,2407.75,0.511486
50%,15692.5,6240.5,0.575388
75%,19670.0,8776.5,0.652383
max,24230.0,12334.0,0.992012
