In [1]:
import pandas as pd
import numpy as np
import fasttext
from tqdm import tqdm

tqdm.pandas()


In [2]:
def latin_to_cyrillic(letter):
    kazakh_conversion = {
        'a': '–∞',
        '√°': '”ô',
        'b': '–±',
        'v': '–≤',
        'g': '–≥',
        '«µ': '“ì',
        'd': '–¥',
        'e': '–µ',
        'j': '–∂',
        'z': '–∑',
        'ƒ±': '–∏',
        'i': '—ñ',
        'k': '–∫',
        'q': '“õ',
        'l': '–ª',
        'm': '–º',
        'n': '–Ω',
        '≈Ñ': '“£',
        'o': '–æ',
        '√≥': '”©',
        'p': '–ø',
        'r': '—Ä',
        's': '—Å',
        't': '—Ç',
        '√Ω': '—É',
        'u': '“±',
        '√∫': '“Ø',
        'f': '—Ñ',
        'h': '—Ö',
        'ch': '—á',
        'sh': '—à',
        'y': '—ã'
    }
    
    if letter[:2] in kazakh_conversion:
        return kazakh_conversion[letter[:2]]
    elif letter[0] in kazakh_conversion:
        return kazakh_conversion[letter[0]]
    else:
        return letter

def text_to_cyrillic(text):
    i = 0
    cyrillic_text = ''
    
    while i < len(text):
        # If the next two characters form a special conversion (e.g., "ch", "sh"), then process them together
        if text[i:i+2] in ['ch', 'sh']:
            cyrillic_text += latin_to_cyrillic(text[i:i+2])
            i += 2
        else:
            cyrillic_text += latin_to_cyrillic(text[i])
            i += 1
    return cyrillic_text


In [11]:
lang_detect = fasttext.load_model('lid.176.bin')

def identify_language(text):
    """Identify the language of a given text using FastText."""
    predictions = lang_detect.predict(text, k=1)  # k=1 means we want the top 1 prediction
    # The prediction will be in the format ('__label__en',), so we extract 'en' from it
    return predictions[0][0].split('__label__')[1]





## Articles

In [13]:
df_articles = pd.read_csv('case1-datasaur/epir_train/articles.csv')


In [14]:
rows_to_keep = []

for index, row in tqdm(df_articles.iterrows(), total=len(df_articles)):
    title_lang = None
    try:
        title_lang = identify_language(row['title'])
    except:
        pass

    # Condition to keep the row
    if row['sys_lang'] == 'en':
        if title_lang == 'en':
            rows_to_keep.append(index)
    else:
        rows_to_keep.append(index)
    
df_articles = df_articles.loc[rows_to_keep]


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100000/100000 [00:05<00:00, 19687.22it/s]


In [15]:
df_articles = df_articles[df_articles['title'].notna()]
df_articles = df_articles[df_articles['content'].notna()]

df_articles['title'] = df_articles['title'].str.lower()
df_articles['content'] = df_articles['content'].str.lower()

new_title = []
new_content = []
for i, row in tqdm(df_articles.iterrows(), total=len(df_articles)):
    if row['sys_lang'] == 'qq':
        new_title.append(text_to_cyrillic(row['title']))
        new_content.append(text_to_cyrillic(row['content']))
    else:
        new_title.append(row['title'])
        new_content.append(row['content'])

# df_articles['title'] = df_articles['title'].progress_apply(lambda x: text_to_cyrillic(x))
# df_articles['content'] = df_articles['content'].progress_apply(lambda x: text_to_cyrillic(x))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 94852/94852 [02:05<00:00, 755.39it/s]


In [16]:
df_articles['title'] = new_title
df_articles['content'] = new_content

df_articles

Unnamed: 0.1,Unnamed: 0,id,sys_lang,projects,title,content,url
0,2884,135747,ru,departament-kkbtu-sko,–±–µ—à–µ–Ω—Å—Ç–≤–æ ‚Äì —Å–º–µ—Ä—Ç–µ–ª—å–Ω–∞—è —É–≥—Ä–æ–∑–∞ –¥–ª—è —á–µ–ª–æ–≤–µ–∫–∞!,"–±–µ—à–µ–Ω—Å—Ç–≤–æ ‚Äì –ø—Ä–∏—Ä–æ–¥–Ω–æ-–æ—á–∞–≥–æ–≤–æ–µ –æ—Å–æ–±–æ –æ–ø–∞—Å–Ω–æ–µ, —Å...",https://www.gov.kz/memleket/entities/departame...
1,2885,119917,ru,kgd-karaganda,–ø–æ—Ä—è–¥–æ–∫ –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞–Ω–∏—è –Ω–∞–ª–æ–≥–∞ –Ω–∞ —Ç—Ä–∞–Ω—Å–ø–æ—Ä—Ç–Ω...,–≤ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏–∏ —Å –ø—É–Ω–∫—Ç–æ–º 2 —Å—Ç–∞—Ç—å–∏ 66 –∑–∞–∫–æ–Ω–∞ ¬´–æ...,https://www.gov.kz/memleket/entities/kgd-karag...
2,2886,8414,ru,turkestan-tolebi-audany,—Å–ø–æ—Ä—Ç,–º–∏—Å—Å–∏—è: —Ñ–æ—Ä–º–∏—Ä–æ–≤–∞–Ω–∏–µ –µ–¥–∏–Ω–æ–π –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω–æ–π –ø–æ...,https://www.gov.kz/memleket/entities/turkestan...
3,2887,118430,kk,qazalem,“õ–∞–∑–∞“õ—Å—Ç–∞–Ω–Ω—ã“£ –º–µ–º–ª–µ–∫–µ—Ç—Ç—ñ–∫ –æ—Ä–≥–∞–Ω–¥–∞—Ä—ã–º–µ–Ω –±–∞–π–ª–∞–Ω—ã—Å,—ç–ª–µ–∫—Ç—Ä–æ–Ω–¥—ã“õ “Ø–∫—ñ–º–µ—Ç—Ç—ñ“£ –º“Ø–º–∫—ñ–Ω–¥—ñ–∫—Ç–µ—Ä—ñ–Ω —Ç–æ–ª—ã“õ –ø–∞–π...,https://www.gov.kz/memleket/entities/qazalem/p...
4,2888,5319,kk,kyzylorda-kzo,2022 –∂—ã–ª“ì–∞ –∂–µ—Ä–≥—ñ–ª—ñ–∫—Ç—ñ –∞—Ç“õ–∞—Ä—É—à—ã –æ—Ä–≥–∞–Ω–¥–∞—Ä–º–µ–Ω —Å—ã–±...,“õ—ã–∑—ã–ª–æ—Ä–¥–∞ “õ–∞–ª–∞—Å—ã–Ω—ã“£ ”ô–∫—ñ–º–¥—ñ–≥—ñ —Ä/—Å —ñ—Å-—à–∞—Ä–∞–ª–∞—Ä 20...,https://www.gov.kz/memleket/entities/kyzylorda...
...,...,...,...,...,...,...,...
99995,102879,23291,ru,sko-gm,—Ä–∞–∑—ä—è—Å–Ω–µ–Ω–∏–µ –∞–Ω—Ç–∏–∫–æ—Ä—Ä—É–ø—Ü–∏–æ–Ω–Ω–æ–≥–æ –∑–∞–∫–æ–Ω–æ–¥–∞—Ç–µ–ª—å—Å—Ç–≤–∞,–≤–æ –∏—Å–ø–æ–ª–Ω–µ–Ω–∏–µ –ø–ª–∞–Ω–∞ –º–µ—Ä–æ–ø—Ä–∏—è—Ç–∏–π –ø–æ —Å–Ω–∏–∂–µ–Ω–∏—é —É—Ä...,https://www.gov.kz/memleket/entities/sko-gm/pr...
99996,102880,55210,ru,kgd-sko,—Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏–µ –¥–∞–Ω–Ω—ã–µ –ø–æ –æ–∫–∞–∑–∞–Ω–Ω—ã–º –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ...,–≤ —Ç–µ—á–µ–Ω–∏–µ 5 –º–µ—Å—è—Ü–µ–≤ 2021 –≥–æ–¥–∞ –æ—Ä–≥–∞–Ω–∞–º–∏ –≥–æ—Å—É–¥–∞—Ä...,https://www.gov.kz/memleket/entities/kgd-sko/p...
99997,102881,111387,kk,adilet-abai,–∞–¥–≤–æ–∫–∞—Ç—Ç—ã“õ “õ—ã–∑–º–µ—Ç,–∞–¥–≤–æ–∫–∞—Ç—Ç—ã“õ “õ—ã–∑–º–µ—Ç ‚Äì –∞–¥–≤–æ–∫–∞—Ç—Ç–∞—Ä –∂–µ–∫–µ –∂”ô–Ω–µ –∑–∞“£–¥—ã...,https://www.gov.kz/memleket/entities/adilet-ab...
99998,102882,124183,ru,abay-tabigat,–∑–∞—è–≤–ª–µ–Ω–∏–µ –æ –Ω–∞–º–µ—á–∞–µ–º–æ–π –¥–µ—è—Ç–µ–ª—å–Ω–æ—Å—Ç–∏ –ø–æ –æ–±—ä–µ–∫—Ç—É...,–∑–∞–º–µ—á–∞–Ω–∏—è –∏ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è –ø—Ä–∏–Ω–∏–º–∞—é—Ç—Å—è –≤ —Å—Ä–æ–∫ –¥–æ ...,https://www.gov.kz/memleket/entities/abay-tabi...


## Life

In [19]:
df_life = pd.read_csv('case1-datasaur/epir_train/life_situations.csv')


In [20]:
rows_to_keep = []

for index, row in tqdm(df_life.iterrows(), total=len(df_life)):
    intro_lang = None
    try:
        intro_lang = identify_language(row['intro'])
    except:
        pass

    # Condition to keep the row
    if row['sys_lang'] == 'en':
        if intro_lang == 'en':
            rows_to_keep.append(index)
    else:
        rows_to_keep.append(index)
        
df_life = df_life.loc[rows_to_keep]


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4408/4408 [00:00<00:00, 12883.42it/s]


In [21]:
df_life = df_life[df_life['intro'].notna()]
df_life = df_life[df_life['title_main'].notna()]
df_life = df_life[df_life['title_main'].notna()]
df_life = df_life[df_life['instruction'].notna()]

df_life['intro'] = df_life['intro'].str.lower()
df_life['title_main'] = df_life['title_main'].str.lower()
df_life['title_main'] = df_life['title_main'].str.lower()
df_life['instruction'] = df_life['instruction'].str.lower()


In [22]:
df_life['text1'] = df_life['title_main'] + " " + df_life['title_sub']
df_life['text2'] = df_life['intro'] + " " + df_life['instruction']
df_life.drop(columns=['intro', 'title_main', 'title_sub', 'instruction', 'subid'], inplace=True)
df_life.rename(columns={'URL': 'url'}, inplace=True)
df_life


Unnamed: 0.1,Unnamed: 0,id,sys_lang,url,text1,text2
20,202904,62,ru,https://beta2.egov.kz/situations/62/245?lang=ru,–∫–∞–∫ —É—Å—Ç–∞–Ω–æ–≤–∏—Ç—å –∏–Ω–≤–∞–ª–∏–¥–Ω–æ—Å—Ç—å –ü—Ä–æ—Ö–æ–∂–¥–µ–Ω–∏–µ –º–µ–¥–∏–∫–æ...,"–∏–Ω–≤–∞–ª–∏–¥—ã ‚Äì —ç—Ç–æ –ª–∏—Ü–∞, –∫–æ—Ç–æ—Ä—ã–µ –∏–∑-–∑–∞ —Å—Ç–æ–π–∫–∏—Ö —Ä–∞—Å..."
21,202905,62,ru,https://beta2.egov.kz/situations/62/246?lang=ru,–∫–∞–∫ —É—Å—Ç–∞–Ω–æ–≤–∏—Ç—å –∏–Ω–≤–∞–ª–∏–¥–Ω–æ—Å—Ç—å –ù–µ–æ–±—Ö–æ–¥–∏–º—ã–µ –¥–æ–∫—É–º–µ...,"–∏–Ω–≤–∞–ª–∏–¥—ã ‚Äì —ç—Ç–æ –ª–∏—Ü–∞, –∫–æ—Ç–æ—Ä—ã–µ –∏–∑-–∑–∞ —Å—Ç–æ–π–∫–∏—Ö —Ä–∞—Å..."
22,202906,62,ru,https://beta2.egov.kz/situations/62/248?lang=ru,–∫–∞–∫ —É—Å—Ç–∞–Ω–æ–≤–∏—Ç—å –∏–Ω–≤–∞–ª–∏–¥–Ω–æ—Å—Ç—å –ö—Ä–∏—Ç–µ—Ä–∏–∏ —É—Å—Ç–∞–Ω–æ–≤–ª–µ...,"–∏–Ω–≤–∞–ª–∏–¥—ã ‚Äì —ç—Ç–æ –ª–∏—Ü–∞, –∫–æ—Ç–æ—Ä—ã–µ –∏–∑-–∑–∞ —Å—Ç–æ–π–∫–∏—Ö —Ä–∞—Å..."
23,202907,62,ru,https://beta2.egov.kz/situations/62/247?lang=ru,–∫–∞–∫ —É—Å—Ç–∞–Ω–æ–≤–∏—Ç—å –∏–Ω–≤–∞–ª–∏–¥–Ω–æ—Å—Ç—å –£—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∏–µ –∏–Ω–≤–∞–ª...,"–∏–Ω–≤–∞–ª–∏–¥—ã ‚Äì —ç—Ç–æ –ª–∏—Ü–∞, –∫–æ—Ç–æ—Ä—ã–µ –∏–∑-–∑–∞ —Å—Ç–æ–π–∫–∏—Ö —Ä–∞—Å..."
24,202908,62,ru,https://beta2.egov.kz/situations/62/249?lang=ru,–∫–∞–∫ —É—Å—Ç–∞–Ω–æ–≤–∏—Ç—å –∏–Ω–≤–∞–ª–∏–¥–Ω–æ—Å—Ç—å –°—Ä–æ–∫–∏ –∏–Ω–≤–∞–ª–∏–¥–Ω–æ—Å—Ç–∏,"–∏–Ω–≤–∞–ª–∏–¥—ã ‚Äì —ç—Ç–æ –ª–∏—Ü–∞, –∫–æ—Ç–æ—Ä—ã–µ –∏–∑-–∑–∞ —Å—Ç–æ–π–∫–∏—Ö —Ä–∞—Å..."
...,...,...,...,...,...,...
4403,207287,225,kk,https://beta2.egov.kz/situations/225/618?lang=kk,“õ—Ä –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ–º–ª–µ–∫–µ—Ç—Ç—ñ–∫ –±–∞—Å“õ–∞—Ä—É –∞–∫...,“õ–∞–∑–∞“õ—Å—Ç–∞–Ω —Ä–µ—Å–ø—É–±–ª–∏–∫–∞—Å—ã –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ...
4404,207288,225,kk,https://beta2.egov.kz/situations/225/619?lang=kk,“õ—Ä –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ–º–ª–µ–∫–µ—Ç—Ç—ñ–∫ –±–∞—Å“õ–∞—Ä—É –∞–∫...,“õ–∞–∑–∞“õ—Å—Ç–∞–Ω —Ä–µ—Å–ø—É–±–ª–∏–∫–∞—Å—ã –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ...
4405,207289,225,kk,https://beta2.egov.kz/situations/225/621?lang=kk,“õ—Ä –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ–º–ª–µ–∫–µ—Ç—Ç—ñ–∫ –±–∞—Å“õ–∞—Ä—É –∞–∫...,“õ–∞–∑–∞“õ—Å—Ç–∞–Ω —Ä–µ—Å–ø—É–±–ª–∏–∫–∞—Å—ã –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ...
4406,207290,225,kk,https://beta2.egov.kz/situations/225/623?lang=kk,“õ—Ä –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ–º–ª–µ–∫–µ—Ç—Ç—ñ–∫ –±–∞—Å“õ–∞—Ä—É –∞–∫...,“õ–∞–∑–∞“õ—Å—Ç–∞–Ω —Ä–µ—Å–ø—É–±–ª–∏–∫–∞—Å—ã –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ...


# News

In [44]:
df_news = pd.read_csv('case1-datasaur/epir_train/news.csv')
df_news


Unnamed: 0.1,Unnamed: 0,id,sys_lang,projects,title,body,short_description,url
0,102884,117143,qq,mangystau,MA≈É«¥YSTA√ùDA I Zh√ÅNE II DE≈ÉGEILI INFEKTsIIaLYQ ...,Ma≈Ñ«µysta√Ωda 4 qarashada«µy zha«µdai«µa s√°ikes I-I...,,https://www.gov.kz/memleket/entities/mangystau...
1,102885,432491,ru,vko-glubokoe,–ß—Ç–æ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ –∑–Ω–∞—Ç—å –∏–∑–±–∏—Ä–∞—Ç–µ–ª—é?,20 –Ω–æ—è–±—Ä—è 2022 –≥–æ–¥–∞ —Å–æ—Å—Ç–æ–∏—Ç—Å—è –≤–∞–∂–Ω–æ–µ —Å–æ–±—ã—Ç–∏–µ –≤...,,https://www.gov.kz/memleket/entities/vko-glubo...
2,102886,563257,qq,turkestan-shardarin-audany,F√ùTBOL ZhARYSY UIYMDASTYRYLDY.,Zhastar arasynda sala√Ωatty √≥mir saltyn qalypta...,,https://www.gov.kz/memleket/entities/turkestan...
3,102887,591387,ru,mvd-almaty,–î–ü –ê–ª–º–∞—Ç—ã: –µ–∂–µ–¥–Ω–µ–≤–Ω–æ –æ–±—â–µ—Å—Ç–≤–µ–Ω–Ω—ã–π –ø–æ—Ä—è–¥–æ–∫ –≤ –≥–æ...,"–ö–∞–∫ —Ä–∞–Ω–µ–µ —Å–æ–æ–±—â–∞–ª–æ—Å—å, –ø—Ä–µ—Å—Ç—É–ø–Ω–æ—Å—Ç—å –≤ –ê–ª–º–∞—Ç—ã —Å–Ω...",,https://www.gov.kz/memleket/entities/mvd-almat...
4,102888,312155,kk,almaty,–ë. –°–∞“ì—ã–Ω—Ç–∞–µ–≤ “∞“ö–®“∞ “∞–∂—ã–º–¥—ã“õ –±—ñ—Ç—ñ–º–≥–µ—Ä—à—ñ–ª—ñ–∫ –∫“Ø—à—Ç–µ—Ä...,–ê–ª–º–∞—Ç—ã “õ–∞–ª–∞—Å—ã–Ω–¥–∞ “∞–∂—ã–º–¥—ã“õ “õ–∞—É—ñ–ø—Å—ñ–∑–¥—ñ–∫ —Ç—É—Ä–∞–ª—ã —à–∞...,,https://www.gov.kz/memleket/entities/almaty/pr...
...,...,...,...,...,...,...,...,...
99995,202879,367308,kk,borodulihamaslikhat,"–ê—É–¥–∞–Ω–¥—ã“õ –º”ô—Å–ª–∏—Ö–∞—Ç—Ç—ã“£ —Ö–∞—Ç—à—ã—Å—ã, ¬´–ê–º–∞–Ω–∞—Ç¬ª –ø–∞—Ä—Ç–∏—è—Å...","–ê—É–¥–∞–Ω–¥—ã“õ –º”ô—Å–ª–∏—Ö–∞—Ç—Ç—ã“£ —Ö–∞—Ç—à—ã—Å—ã, ¬´–ê–º–∞–Ω–∞—Ç¬ª –ø–∞—Ä—Ç–∏—è—Å...",,https://www.gov.kz/memleket/entities/borodulih...
99996,202880,589811,qq,maslihat-almaty,Almaty qalasy M√°slihaty dep√Ωtattaryny≈Ñ qo«µamdy...,Qurmetti almatylyqtar!‚†Äüîπ Nazarlary≈Ñyz«µa #Masli...,,https://www.gov.kz/memleket/entities/maslihat-...
99997,202881,259111,en,mfa-bratislava,Strengthening scientific research cooperation ...,"ZVOLEN, September 21, 2021 ‚Äì As part of streng...",,https://www.gov.kz/memleket/entities/mfa-brati...
99998,202882,592904,ru,abay-bilim,–°–æ–æ–±—â–∞—Ç—å –æ –∫–æ—Ä—Ä—É–ø—Ü–∏–∏ –æ—á–µ–Ω—å –≤–∞–∂–Ω–æ,–ê–Ω—Ç–∏–∫–æ—Ä—Ä—É–ø—Ü–∏–æ–Ω–Ω–æ–π —Å—Ç—Ä–∞—Ç–µ–≥–∏–µ–π –†–ö –Ω–∞ 2015-2025 –≥...,,https://www.gov.kz/memleket/entities/abay-bili...


In [45]:
rows_to_keep = []

for index, row in tqdm(df_news.iterrows(), total=len(df_news)):
    title_lang = content_lang = None
    try:
        title_lang = identify_language(row['title'])
    except:
        pass

    # Condition to keep the row
    if row['sys_lang'] == 'en':
        if title_lang == 'en':
            rows_to_keep.append(index)
    else:
        rows_to_keep.append(index)        


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100000/100000 [00:05<00:00, 19176.69it/s]


In [46]:
df_news = df_news.loc[rows_to_keep]

In [47]:
df_news = df_news[df_news['title'].notna()]
df_news = df_news[df_news['body'].notna()]

df_news['title'] = df_news['title'].str.lower()
df_news['body'] = df_news['body'].str.lower()

new_title = []
new_content = []
for i, row in tqdm(df_news.iterrows(), total=len(df_news)):
    if row['sys_lang'] == 'qq':
        new_title.append(text_to_cyrillic(row['title']))
        new_content.append(text_to_cyrillic(row['body']))
    else:
        new_title.append(row['title'])
        new_content.append(row['body'])


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 95021/95021 [00:57<00:00, 1664.46it/s]


In [48]:
df_news['title'] = new_title
df_news['body'] = new_content


In [51]:
df_news.drop(columns=['short_description', 'projects'], inplace=True)


In [52]:
df_news

Unnamed: 0.1,Unnamed: 0,id,sys_lang,title,body,url
0,102884,117143,qq,–º–∞“£“ì—ã—Å—Ç–∞—É–¥–∞ —ñ –∑—Ö”ô–Ω–µ —ñ—ñ –¥–µ“£–≥–µ—ñ–ª—ñ —ñ–Ω—Ñ–µ–∫—Ç—Å—ñ—ñ–∞–ª—ã“õ ...,–º–∞“£“ì—ã—Å—Ç–∞—É–¥–∞ 4 “õ–∞—Ä–∞—à–∞–¥–∞“ì—ã –∑—Ö–∞“ì–¥–∞—ñ“ì–∞ —Å”ô—ñ–∫–µ—Å —ñ-—ñ—ñ...,https://www.gov.kz/memleket/entities/mangystau...
1,102885,432491,ru,—á—Ç–æ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ –∑–Ω–∞—Ç—å –∏–∑–±–∏—Ä–∞—Ç–µ–ª—é?,20 –Ω–æ—è–±—Ä—è 2022 –≥–æ–¥–∞ —Å–æ—Å—Ç–æ–∏—Ç—Å—è –≤–∞–∂–Ω–æ–µ —Å–æ–±—ã—Ç–∏–µ –≤...,https://www.gov.kz/memleket/entities/vko-glubo...
2,102886,563257,qq,—Ñ—É—Ç–±–æ–ª –∑—Ö–∞—Ä—ã—Å—ã “±—ñ—ã–º–¥–∞—Å—Ç—ã—Ä—ã–ª–¥—ã.,–∑—Ö–∞—Å—Ç–∞—Ä –∞—Ä–∞—Å—ã–Ω–¥–∞ —Å–∞–ª–∞—É–∞—Ç—Ç—ã ”©–º—ñ—Ä —Å–∞–ª—Ç—ã–Ω “õ–∞–ª—ã–ø—Ç–∞...,https://www.gov.kz/memleket/entities/turkestan...
3,102887,591387,ru,–¥–ø –∞–ª–º–∞—Ç—ã: –µ–∂–µ–¥–Ω–µ–≤–Ω–æ –æ–±—â–µ—Å—Ç–≤–µ–Ω–Ω—ã–π –ø–æ—Ä—è–¥–æ–∫ –≤ –≥–æ...,"–∫–∞–∫ —Ä–∞–Ω–µ–µ —Å–æ–æ–±—â–∞–ª–æ—Å—å, –ø—Ä–µ—Å—Ç—É–ø–Ω–æ—Å—Ç—å –≤ –∞–ª–º–∞—Ç—ã —Å–Ω...",https://www.gov.kz/memleket/entities/mvd-almat...
4,102888,312155,kk,–±. —Å–∞“ì—ã–Ω—Ç–∞–µ–≤ “±“õ—à“± “±–∂—ã–º–¥—ã“õ –±—ñ—Ç—ñ–º–≥–µ—Ä—à—ñ–ª—ñ–∫ –∫“Ø—à—Ç–µ—Ä...,–∞–ª–º–∞—Ç—ã “õ–∞–ª–∞—Å—ã–Ω–¥–∞ “±–∂—ã–º–¥—ã“õ “õ–∞—É—ñ–ø—Å—ñ–∑–¥—ñ–∫ —Ç—É—Ä–∞–ª—ã —à–∞...,https://www.gov.kz/memleket/entities/almaty/pr...
...,...,...,...,...,...,...
99995,202879,367308,kk,"–∞—É–¥–∞–Ω–¥—ã“õ –º”ô—Å–ª–∏—Ö–∞—Ç—Ç—ã“£ —Ö–∞—Ç—à—ã—Å—ã, ¬´–∞–º–∞–Ω–∞—Ç¬ª –ø–∞—Ä—Ç–∏—è—Å...","–∞—É–¥–∞–Ω–¥—ã“õ –º”ô—Å–ª–∏—Ö–∞—Ç—Ç—ã“£ —Ö–∞—Ç—à—ã—Å—ã, ¬´–∞–º–∞–Ω–∞—Ç¬ª –ø–∞—Ä—Ç–∏—è—Å...",https://www.gov.kz/memleket/entities/borodulih...
99996,202880,589811,qq,–∞–ª–º–∞—Ç—ã “õ–∞–ª–∞—Å—ã –º”ô—Å–ª—ñ—Ö–∞—Ç—ã –¥–µ–ø—É—Ç–∞—Ç—Ç–∞—Ä—ã–Ω—ã“£ “õ–æ“ì–∞–º–¥—ã...,“õ“±—Ä–º–µ—Ç—Ç—ñ –∞–ª–º–∞—Ç—ã–ª—ã“õ—Ç–∞—Ä!‚†Äüîπ –Ω–∞–∑–∞—Ä–ª–∞—Ä—ã“£—ã–∑“ì–∞ #–º–∞—Å–ª—ñ...,https://www.gov.kz/memleket/entities/maslihat-...
99997,202881,259111,en,strengthening scientific research cooperation ...,"zvolen, september 21, 2021 ‚Äì as part of streng...",https://www.gov.kz/memleket/entities/mfa-brati...
99998,202882,592904,ru,—Å–æ–æ–±—â–∞—Ç—å –æ –∫–æ—Ä—Ä—É–ø—Ü–∏–∏ –æ—á–µ–Ω—å –≤–∞–∂–Ω–æ,–∞–Ω—Ç–∏–∫–æ—Ä—Ä—É–ø—Ü–∏–æ–Ω–Ω–æ–π —Å—Ç—Ä–∞—Ç–µ–≥–∏–µ–π —Ä–∫ –Ω–∞ 2015-2025 –≥...,https://www.gov.kz/memleket/entities/abay-bili...


In [54]:
df_news = df_news[df_news['sys_lang'].isin(['en', 'kk', 'qq', 'ru'])]

In [58]:
df_articles.rename(columns={'title': 'text1', 'content': 'text2'}, inplace=True)

In [61]:
df_services = pd.read_csv('case1-datasaur/epir_train/services.csv')
df_services


Unnamed: 0.1,Unnamed: 0,id,sys_lang,additional_info,description,full_title,short_title,title,result_description,url
0,0,3087,en,Rules for providing state services,Dear citizens of the Republic of Kazakhstan! I...,"Issuance of passports, Identification cards fo...","Obtainment of passport, national ID","Obtainment of passport, national ID",Issuance of passport and (or) national ID of a...,https://beta2.egov.kz/services/3087?lang=en
1,1,3081,en,Rules for providing state services,Information is being updated. Citizens of the ...,Acceptance of documents for issuance of passpo...,Acceptance of documents for issuance of passpo...,Acceptance of documents for issuance of passpo...,Passport of a citizen of the Republic of Kazak...,https://beta2.egov.kz/services/3081?lang=en
2,2,4632,en,,,P250.00,P250.00 —Ç–µ—Å—Ç 996,P250.00,,https://beta2.egov.kz/services/4632?lang=en
3,3,4617,en,,,CR.01,CR.01,CR.01,,https://beta2.egov.kz/services/4617?lang=en
4,4,3811,en,,Information is being updated. What is Personal...,Request for Personal Record,Request for Personal Record,Request for Personal Record,"Personal Record from ""e-Kyzmet"" IIS.",https://beta2.egov.kz/services/3811?lang=en
...,...,...,...,...,...,...,...,...,...,...
2879,2879,4732,ru,,,P7.05,P7.05,P7.05,,https://beta2.egov.kz/services/4732?lang=ru
2880,2880,4733,ru,,,P3.03,P3.03,P3.03,,https://beta2.egov.kz/services/4733?lang=ru
2881,2881,4735,ru,,,P3.041,P3.041,P3.041,,https://beta2.egov.kz/services/4735?lang=ru
2882,2882,4736,ru,,,P21.09,P21.09,P21.09,,https://beta2.egov.kz/services/4736?lang=ru


In [73]:
df_services = df_services[df_services['description'].notna()]
df_services = df_services[df_services['result_description'].notna()]
df_services = df_services[df_services['full_title'].notna()]
df_services = df_services[df_services['short_title'].notna()]
df_services = df_services[df_services['title'].notna()]
df_services = df_services[df_services['additional_info'].notna()]


In [74]:
df_services['description'] = df_services['description'].str.lower()
df_services['result_description'] = df_services['result_description'].str.lower()
df_services['full_title'] = df_services['full_title'].str.lower()
df_services['short_title'] = df_services['short_title'].str.lower()
df_services['title'] = df_services['title'].str.lower()
df_services['additional_info'] = df_services['additional_info'].str.lower()


In [75]:
rows_to_keep = []

for index, row in tqdm(df_services.iterrows(), total=len(df_services)):
    title_lang = None
    try:
        title_lang = identify_language(row['full_title'])
    except:
        pass

    # Condition to keep the row
    if row['sys_lang'] == 'en':
        if title_lang == 'en':
            rows_to_keep.append(index)
    else:
        rows_to_keep.append(index)
        

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2326/2326 [00:00<00:00, 18207.35it/s]


In [76]:
df_services = df_services.loc[rows_to_keep]

In [79]:
df_services['text1'] = df_services['additional_info'] + " " + df_services['description'] + " " + df_services['result_description']
df_services['text2'] = df_services['full_title'] + " " + df_services['short_title'] + " " + df_services['title']

df_services.drop(columns=['additional_info', 'description', 'full_title', 'short_title', 'title', 'result_description'], inplace=True)


In [80]:
df_services

Unnamed: 0.1,Unnamed: 0,id,sys_lang,url,text1,text2
0,0,3087,en,https://beta2.egov.kz/services/3087?lang=en,rules for providing state services dear citize...,"issuance of passports, identification cards fo..."
1,1,3081,en,https://beta2.egov.kz/services/3081?lang=en,rules for providing state services information...,acceptance of documents for issuance of passpo...
5,5,3263,en,https://beta2.egov.kz/services/3263?lang=en,rules for providing state services information...,obtaining a certificate on the status of the i...
6,6,3040,en,https://beta2.egov.kz/services/3040?lang=en,rules for providing state services dear citize...,removal from registration at the place of resi...
7,7,3759,en,https://beta2.egov.kz/services/3759?lang=en,rules for providing state service what is temp...,issuance of a list on temporary work incapacit...
...,...,...,...,...,...,...
2799,2799,3129,kk,https://beta2.egov.kz/services/3129?lang=kk,“õ—ã–∑–º–µ—Ç —Å—Ç–∞–Ω–¥–∞—Ä—Ç—ã –∞“õ–ø–∞—Ä–∞—Ç ”©–∑–µ–∫—Ç–µ–Ω–¥—ñ—Ä—É —Å–∞—Ç—ã—Å—ã–Ω–¥–∞...,“õ–∞–∑–∞“õ—Å—Ç–∞–Ω —Ä–µ—Å–ø—É–±–ª–∏–∫–∞—Å—ã–Ω–¥–∞ —É–∞“õ—ã—Ç—à–∞ –±–æ–ª–∞—Ç—ã–Ω —à–µ—Ç–µ...
2802,2802,3140,qq,https://beta2.egov.kz/services/3140?lang=qq,memlekettik qyzmet k√≥rset√Ω erezhesi aqparat √≥z...,qazaqstan resp√Ωblikasynda bosqyn m√°rtebesin be...
2803,2803,3038,qq,https://beta2.egov.kz/services/3038?lang=qq,memlekettik qyzmet k√≥rset√Ω erezhesi qurmetti q...,qazaqstan resp√Ωblikasyny≈Ñ halqyn tur«µylyqty zh...
2867,2867,3570,qq,https://beta2.egov.kz/services/3570?lang=qq,memlekettik qyzmet k√≥rset√Ω erezhesi qurmetti q...,investitsiialardy zh√∫zege asyr√Ωdy zh√°ne invest...


In [83]:
df_news.rename(columns={'title': 'text1', 'body': 'text2'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_news.rename(columns={'title': 'text1', 'body': 'text2'}, inplace=True)


In [88]:
df = pd.concat([df_services, df_articles, df_news, df_life])

In [91]:
df.drop(['projects'], axis=1, inplace=True)

In [93]:
df

Unnamed: 0.1,Unnamed: 0,id,sys_lang,url,text1,text2
0,0,3087,en,https://beta2.egov.kz/services/3087?lang=en,rules for providing state services dear citize...,"issuance of passports, identification cards fo..."
1,1,3081,en,https://beta2.egov.kz/services/3081?lang=en,rules for providing state services information...,acceptance of documents for issuance of passpo...
5,5,3263,en,https://beta2.egov.kz/services/3263?lang=en,rules for providing state services information...,obtaining a certificate on the status of the i...
6,6,3040,en,https://beta2.egov.kz/services/3040?lang=en,rules for providing state services dear citize...,removal from registration at the place of resi...
7,7,3759,en,https://beta2.egov.kz/services/3759?lang=en,rules for providing state service what is temp...,issuance of a list on temporary work incapacit...
...,...,...,...,...,...,...
4403,207287,225,kk,https://beta2.egov.kz/situations/225/618?lang=kk,“õ—Ä –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ–º–ª–µ–∫–µ—Ç—Ç—ñ–∫ –±–∞—Å“õ–∞—Ä—É –∞–∫...,“õ–∞–∑–∞“õ—Å—Ç–∞–Ω —Ä–µ—Å–ø—É–±–ª–∏–∫–∞—Å—ã –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ...
4404,207288,225,kk,https://beta2.egov.kz/situations/225/619?lang=kk,“õ—Ä –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ–º–ª–µ–∫–µ—Ç—Ç—ñ–∫ –±–∞—Å“õ–∞—Ä—É –∞–∫...,“õ–∞–∑–∞“õ—Å—Ç–∞–Ω —Ä–µ—Å–ø—É–±–ª–∏–∫–∞—Å—ã –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ...
4405,207289,225,kk,https://beta2.egov.kz/situations/225/621?lang=kk,“õ—Ä –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ–º–ª–µ–∫–µ—Ç—Ç—ñ–∫ –±–∞—Å“õ–∞—Ä—É –∞–∫...,“õ–∞–∑–∞“õ—Å—Ç–∞–Ω —Ä–µ—Å–ø—É–±–ª–∏–∫–∞—Å—ã –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ...
4406,207290,225,kk,https://beta2.egov.kz/situations/225/623?lang=kk,“õ—Ä –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ–º–ª–µ–∫–µ—Ç—Ç—ñ–∫ –±–∞—Å“õ–∞—Ä—É –∞–∫...,“õ–∞–∑–∞“õ—Å—Ç–∞–Ω —Ä–µ—Å–ø—É–±–ª–∏–∫–∞—Å—ã –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ...


In [97]:
df.to_csv('df.csv', index=0)

In [98]:
pd.read_csv('df.csv')

Unnamed: 0.1,Unnamed: 0,id,sys_lang,url,text1,text2
0,0,3087,en,https://beta2.egov.kz/services/3087?lang=en,rules for providing state services dear citize...,"issuance of passports, identification cards fo..."
1,1,3081,en,https://beta2.egov.kz/services/3081?lang=en,rules for providing state services information...,acceptance of documents for issuance of passpo...
2,5,3263,en,https://beta2.egov.kz/services/3263?lang=en,rules for providing state services information...,obtaining a certificate on the status of the i...
3,6,3040,en,https://beta2.egov.kz/services/3040?lang=en,rules for providing state services dear citize...,removal from registration at the place of resi...
4,7,3759,en,https://beta2.egov.kz/services/3759?lang=en,rules for providing state service what is temp...,issuance of a list on temporary work incapacit...
...,...,...,...,...,...,...
196129,207287,225,kk,https://beta2.egov.kz/situations/225/618?lang=kk,“õ—Ä –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ–º–ª–µ–∫–µ—Ç—Ç—ñ–∫ –±–∞—Å“õ–∞—Ä—É –∞–∫...,“õ–∞–∑–∞“õ—Å—Ç–∞–Ω —Ä–µ—Å–ø—É–±–ª–∏–∫–∞—Å—ã –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ...
196130,207288,225,kk,https://beta2.egov.kz/situations/225/619?lang=kk,“õ—Ä –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ–º–ª–µ–∫–µ—Ç—Ç—ñ–∫ –±–∞—Å“õ–∞—Ä—É –∞–∫...,“õ–∞–∑–∞“õ—Å—Ç–∞–Ω —Ä–µ—Å–ø—É–±–ª–∏–∫–∞—Å—ã –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ...
196131,207289,225,kk,https://beta2.egov.kz/situations/225/621?lang=kk,“õ—Ä –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ–º–ª–µ–∫–µ—Ç—Ç—ñ–∫ –±–∞—Å“õ–∞—Ä—É –∞–∫...,“õ–∞–∑–∞“õ—Å—Ç–∞–Ω —Ä–µ—Å–ø—É–±–ª–∏–∫–∞—Å—ã –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ...
196132,207290,225,kk,https://beta2.egov.kz/situations/225/623?lang=kk,“õ—Ä –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ–º–ª–µ–∫–µ—Ç—Ç—ñ–∫ –±–∞—Å“õ–∞—Ä—É –∞–∫...,“õ–∞–∑–∞“õ—Å—Ç–∞–Ω —Ä–µ—Å–ø—É–±–ª–∏–∫–∞—Å—ã –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ñ –∂–∞–Ω—ã–Ω–¥–∞“ì—ã –º–µ...
