# Prototype: request and scrape data from one URL

In [1]:
import requests # 어디에서-가져올까 패키지
import bs4
import pandas as pd
import time
import re
import json
from fake_useragent import UserAgent

In [2]:
# Lowercase Cyrillic vowel letters; check_vowel() tests characters against this string.
vowels = "аэиоуяеыёю"
def new_agent() -> dict :
    """Return request headers whose User-Agent is a random value from fake_useragent."""
    return {'User-Agent': UserAgent().random}

def new_proxy(timeout : float = 10.0, retry_delay : float = 1.0) -> dict :
    """Ask the gimmeproxy API for a proxy and return it as a ``requests`` proxies mapping.

    The API is polled until it answers with HTTP 200. Each attempt's status
    code is printed, as before.

    Args:
        timeout: Seconds to wait for the API before giving up on one attempt.
        retry_delay: Seconds to sleep between attempts, so that a failing or
            rate-limiting API is not hammered in a tight loop.

    Returns:
        ``{'http': url, 'https': url}`` where ``url`` is the ``curl`` field of
        the API response with any ``<br>`` markers removed.

    Raises:
        ValueError: If a 200 response body is not valid JSON.
        KeyError: If the JSON payload has no ``curl`` field.
    """
    api_url = "https://gimmeproxy.com/api/getProxy"
    while True:
        try:
            # Without a timeout a stalled API would block the notebook forever.
            result = requests.get(api_url, timeout=timeout)
        except requests.exceptions.RequestException as err:
            # A transient network error should not abort the retry loop.
            print(f'Proxy API request failed: {err}')
            time.sleep(retry_delay)
            continue
        print(result.status_code)
        if result.status_code != 200:
            time.sleep(retry_delay)
            continue
        random_proxy_dict = json.loads(result.content)
        # Kept separate from api_url so the endpoint is never overwritten.
        proxy_url = random_proxy_dict['curl'].replace("<br>", "")
        return {'http' : proxy_url, 'https' : proxy_url}

def check_vowel(s : str) -> int :
    """Count the characters of ``s`` that are vowels, ignoring letter case.

    Each character is lowercased before being tested against the module-level
    ``vowels`` string, which holds lowercase letters only. Without this, a
    capitalised word with a leading vowel would be under-counted and the
    stress index derived from the count would be off by one.

    Args:
        s: Text to scan; any iterable of one-character strings works.

    Returns:
        Number of characters whose lowercase form appears in ``vowels``.
    """
    return sum(1 for c in s if c.lower() in vowels)

# assumption : contents look like str + bs4.element.Tag + str (order doesn't matter)
def parse_forma(soup : bs4.BeautifulSoup) -> pd.Series :
    """Extract the stressed-syllable index and the accented word from a result page.

    The first ``<strong>`` element is walked child by child. Plain text
    children contribute to a running vowel count. When a nested tag is met,
    the stress position is recorded as (vowels seen so far + 1).

    Args:
        soup: Parsed HTML of the lookup page.

    Returns:
        A Series with ``stress`` (0 if no nested tag was found, -1 on failure)
        and ``word_with_accent`` (the concatenated text, or ``"PARSE_ERROR"``).
    """
    tag = soup.find('strong')
    if tag is None:
        # find() returns None when the page has no <strong>; report it like
        # any other unparseable page instead of raising AttributeError.
        return pd.Series({'stress' : -1, 'word_with_accent' : "PARSE_ERROR"})

    full_word = ""
    stress = 0
    vowel_count = 0
    for content in tag.contents:
        # Exact type checks on purpose: NavigableString subclasses (comments,
        # CDATA, ...) must keep falling through to PARSE_ERROR.
        if type(content) is bs4.element.NavigableString:
            vowel_count += check_vowel(content.string)
            full_word += content.string
        elif type(content) is bs4.element.Tag:
            stress = vowel_count + 1
            # Tag.string is None when the tag has several children;
            # get_text() always yields a str.
            full_word += content.get_text()
        else:
            return pd.Series({'stress' : -1, 'word_with_accent' : "PARSE_ERROR"})

    return pd.Series({'stress' : stress, 'word_with_accent' : full_word})
    
def get_stress_and_fullword(word : str, collected_dict : dict, header : dict, proxy : dict, timeout : float = 10.0) :
    """Look up the stress of ``word``, using ``collected_dict`` as a cache.

    A cached entry is reused unless it records a retryable failure. Otherwise
    the word is requested from udarenieru.ru and the outcome, whether success
    or failure marker, is stored in ``collected_dict[word]`` before returning.

    Args:
        word: Word form to look up.
        collected_dict: Cache mapping word -> entry with ``stress`` and
            ``word_with_accent``; updated in place.
        header: HTTP headers passed to ``requests.get``.
        proxy: Proxies mapping passed to ``requests.get``.
        timeout: Seconds to wait for the site before treating the request as
            a connection failure.

    Returns:
        ``(cached, entry)``. ``cached`` is True only when the entry came from
        the cache. Failure entries have ``stress == -1`` and a marker in
        ``word_with_accent``: ``"CONNECTION_FAIL"`` for a connection error or
        timeout, ``"DETECTED"`` for HTTP 403, ``"NOT_FOUND"`` for any other
        non-200 status.
    """
    if word in collected_dict:
        if collected_dict[word]['word_with_accent'] not in ["CONNECTION_FAIL", "DETECTED", "REQUEST FAIL"]:
            return True, collected_dict[word]

    try:
        # params= lets requests escape the word, so characters such as
        # '&' or '#' cannot corrupt the query string.
        result = requests.get(
            'https://udarenieru.ru/index.php',
            params={'forma': word},
            headers=header,
            proxies=proxy,
            timeout=timeout,
        )
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # A dead or stalled proxy shows up as either exception; both are
        # recorded as retryable connection failures.
        collected_dict[word] = pd.Series({'stress' : -1, 'word_with_accent' : "CONNECTION_FAIL"})
        print("Connection refused")
        return False, collected_dict[word]

    if result.status_code == 403:
        collected_dict[word] = pd.Series({'stress' : -1, 'word_with_accent' : "DETECTED"})
        return False, collected_dict[word]
    elif result.status_code != 200:
        # NOTE(review): every non-200 status (not only 404) is cached as
        # NOT_FOUND and never retried; "REQUEST FAIL" in the retry list above
        # is never produced by this function. Confirm this is intended.
        collected_dict[word] = pd.Series({'stress' : -1, 'word_with_accent' : "NOT_FOUND"})
        return False, collected_dict[word]

    soup = bs4.BeautifulSoup(result.content, 'html.parser')
    collected_dict[word] = parse_forma(soup)
    return False, collected_dict[word]

def find_pure_word(text : str) -> str :
    """Drop every "<lowercase latin letters>_" marker from ``text``.

    For example ``"vo_бдении"`` becomes ``"бдении"``. The pattern is not
    anchored, so each occurrence anywhere in the string is removed.
    """
    prefix_pattern = re.compile('[a-z]*_')
    return prefix_pattern.sub('', text)

In [27]:
# Load the token table. Column names are supplied explicitly, so pandas
# treats every line of the file, including the first, as data.
target_df = pd.read_csv('mxnt_token.csv', names = ['name', 'name2', 'freq'])

In [28]:
# Preview the first ten rows of the token table.
target_df.head(10)

Unnamed: 0,name,name2,freq
0,v_бдении,vo_бдении,2
1,s_бдениями,s_бдениями,1
2,v_бдениях,v_бдениях,1
3,v_бдительности,v_бдительности,2
4,k_бдительности,k_бдительности,21
5,s_бдительностью,s_бдительностью,2
6,k_бжания,k_бжания,1
7,v_бжезинке,v_бжезинке,1
8,k_бзикам,k_бзикам,1
9,s_бзиками,s_бзиками,1


In [29]:
# Keep only the 'name2' column (prefixed tokens such as "vo_бдении").
# Despite the _df suffix, selecting a single column yields a Series.
word_df = target_df['name2']
word_df.head(10)

0          vo_бдении
1         s_бдениями
2          v_бдениях
3     v_бдительности
4     k_бдительности
5    s_бдительностью
6           k_бжания
7         v_бжезинке
8           k_бзикам
9          s_бзиками
Name: name2, dtype: object

In [30]:
# Load earlier scraping results, indexed by word.
collected_df = pd.read_csv("data.csv").set_index('word')
# Cache in the form {word: {'stress': ..., 'word_with_accent': ...}}.
collected_dict = collected_df.to_dict(orient='index')
collected_df.head(10)

Unnamed: 0_level_0,stress,word_with_accent
word,Unnamed: 1_level_1,Unnamed: 2_level_1
бдении,1,Бде́нии
бдениями,1,Бде́ниями
бдениях,1,Бде́ниях
бдительности,1,Бди́тельности
бдительностью,1,Бди́тельностью
бжания,-1,NOT_FOUND
бжезинке,-1,NOT_FOUND
бзикам,1,Бзи́кам
бзиками,1,Бзи́ками
блаватником,-1,NOT_FOUND


In [36]:
print('Initialization (proxy, header)...')
my_proxy = new_proxy()
my_agent = new_agent()

print('Start scrap...')
# Connection failures counted since the proxy was last replaced.
connection_fail = 0
for i, w in enumerate(word_df):
    # Remove the latin "<letters>_" marker before looking the word up.
    pure_w = find_pure_word(str(w))
    cached, word_dict = get_stress_and_fullword(pure_w, collected_dict, my_agent, my_proxy)
    stress, word_with_accent = word_dict['stress'], word_dict['word_with_accent']

    if not cached:
        print(f'{i}th iteration... word : {pure_w}, stress : {stress}, full : {word_with_accent}')
    else:
        print(f'{i}th iteration... word : {pure_w} (cached)')

    if word_with_accent == "CONNECTION_FAIL":
        connection_fail += 1

    # Switch proxy as soon as a lookup is marked DETECTED, or once more than
    # 50 connection failures have accumulated on the current proxy.
    if word_with_accent == "DETECTED" or connection_fail > 50:
        print('Find a new proxy...')
        my_proxy = new_proxy()
        connection_fail = 0

о, stress : -1, full : NOT_FOUND
5225th iteration... word : джо (cached)
5226th iteration... word : джо (cached)
5227th iteration... word : джоан, stress : -1, full : NOT_FOUND
5228th iteration... word : джоан (cached)
5229th iteration... word : джобса, stress : -1, full : NOT_FOUND
5230th iteration... word : джобсом, stress : -1, full : NOT_FOUND
5231th iteration... word : джованни, stress : -1, full : NOT_FOUND
5232th iteration... word : джованной, stress : -1, full : NOT_FOUND
5233th iteration... word : джоди, stress : -1, full : NOT_FOUND
5234th iteration... word : джоем, stress : -1, full : NOT_FOUND
5235th iteration... word : джозефе, stress : 1, full : Джо́зефе
5236th iteration... word : джозефину, stress : -1, full : NOT_FOUND
5237th iteration... word : джозефом, stress : 1, full : Джо́зефом
5238th iteration... word : джозефсоновских, stress : -1, full : NOT_FOUND
5239th iteration... word : джозефсоновской, stress : -1, full : NOT_FOUND
5240th iteration... word : джозефсоновско

KeyboardInterrupt: 

In [37]:
# Turn the cache back into a flat table: one row per word.
new_collected_df = (
    pd.DataFrame.from_dict(collected_dict, orient='index', columns=['stress', 'word_with_accent'])
    .reset_index()
)
# reset_index() moved the words into the first column; name all three.
new_collected_df.columns = ['word', 'stress', 'word_with_accent']
new_collected_df.head(20)

Unnamed: 0,word,stress,word_with_accent
0,бдении,1,Бде́нии
1,бдениями,1,Бде́ниями
2,бдениях,1,Бде́ниях
3,бдительности,1,Бди́тельности
4,бдительностью,1,Бди́тельностью
5,бжания,-1,NOT_FOUND
6,бжезинке,-1,NOT_FOUND
7,бзикам,1,Бзи́кам
8,бзиками,1,Бзи́ками
9,блаватником,-1,NOT_FOUND


In [38]:
# Overwrite data.csv with the rebuilt table; index=False because the words
# already live in the 'word' column.
new_collected_df.to_csv("data.csv", index=False)