In [1]:
import pandas as pd
import numpy as np
import datetime

import requests
from tqdm import tqdm
from urllib.parse import urlparse

import collections

In [2]:
from task1_utils import *
from task2_utils import *

In [3]:
import urllib3

# Silence urllib3's InsecureRequestWarning for the rest of the session.
# NOTE(review): presumably the page downloads below skip TLS certificate
# verification — confirm in soup_by_url before relying on the fetched content.
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

In [4]:
# Input workbook; the rows are read from its 'Fround' sheet.
dataset_name = 'InputData.xlsx'

ds = pd.read_excel(
    dataset_name,
    sheet_name='Fround',
    engine='openpyxl',
)

In [6]:
# Download every news page: soup_by_url is called once per 'news_url' value
# and only the first item of what it returns is stored in 'soup'.
ds['soup'] = ds['news_url'].apply(
    lambda url: soup_by_url(url)[0]
)

In [7]:
# Extract the page text with all_strings, then let list_to_sents turn it
# into the collection of sentences stored in 'sents'.
ds['sents'] = ds['soup'].apply(
    lambda page: list_to_sents(all_strings(page))
)

In [8]:
# Keep only the sentences for which isthere_dig is truthy (sentences with digits).
ds['sentswithdigits'] = ds['sents'].apply(
    lambda sents: list(filter(isthere_dig, sents))
)

In [9]:
def _mentions_currency(sent):
    """Return True when `sent` passes isthere_cur_symb or isthere_cur_keys.

    Both helpers receive ' '.join(sent), exactly as the original cell did.
    NOTE(review): if `sent` is a plain str, that join puts a space between
    every character — confirm this is what the helpers expect.
    """
    joined = ' '.join(sent)
    # Logical `or` instead of bitwise `|`: it short-circuits and relies only on
    # truthiness, so it does not break if a helper returns a non-bool value.
    return bool(isthere_cur_symb(joined) or isthere_cur_keys(joined))


# From the sentences with digits, keep the ones that also carry a currency
# symbol or a currency keyword.
ds['sentswithsymbs'] = ds['sentswithdigits'].apply(
    lambda sents: [sent for sent in sents if _mentions_currency(sent)]
)

In [10]:
# Additional filter on the company name: isthere_name receives the row's
# currency sentences together with the row's 'name' value.
ds['sentswithname'] = ds.apply(
    lambda rec: isthere_name(rec['sentswithsymbs'], rec['name']),
    axis=1,
)

In [None]:
def _money_text(rec):
    """Join the strictest non-empty sentence list of a row into one string.

    Preference order: 'sentswithname', then 'sentswithsymbs', then
    'sentswithdigits' (which is used even when it is empty).
    """
    chosen = rec['sentswithname']
    if not chosen:
        chosen = rec['sentswithsymbs']
    if not chosen:
        chosen = rec['sentswithdigits']
    return ' '.join(chosen)


# Soft filter: fall back to a looser sentence list when a stricter one is empty.
ds['sentsaboutmoney'] = ds.apply(_money_text, axis=1)

In [17]:
# looking for fround amount
# Turn currency abbreviations into their symbols. Only abbreviations that have
# a single space on both sides are replaced.
# NOTE(review): `_constants` is a private module of currency_symbols and may
# change between releases of that package.
from currency_symbols import _constants

for iso_code, cur_symbol in _constants.CURRENCY_SYMBOLS_MAP.items():
    # regex=False pins literal matching: the default of `regex` differs between
    # pandas versions, and neither the code nor the symbol is meant as a pattern.
    ds['sentsaboutmoney'] = ds['sentsaboutmoney'].str.replace(
        f' {iso_code} ', f' {cur_symbol} ', case=True, regex=False
    )

# Collect every text fragment that extract_fround_amount yields for the money text.
ds['fround_amount'] = ds['sentsaboutmoney'].apply(
    lambda text: list(extract_fround_amount(text))
)


def _prefer_with_symbol(candidates):
    """Soft currency filter for one row's candidate list.

    When the space-joined candidates pass isthere_cur_symb, only the candidates
    that pass it individually are kept; otherwise the list is returned untouched.
    """
    if isthere_cur_symb(' '.join(candidates)):
        return [cand for cand in candidates if isthere_cur_symb(cand)]
    return candidates


ds['fround_amount'] = ds['fround_amount'].apply(_prefer_with_symbol)

# Most common candidate, as returned by Counter.most_common(1):
# a list holding one (value, count) tuple, or an empty list.
ds['fround_amount'] = ds['fround_amount'].apply(
    lambda candidates: collections.Counter(candidates).most_common(1)
)

# Unwrap list -> tuple -> value with two first_element calls.
ds['fround_amount'] = ds['fround_amount'].apply(
    lambda top: first_element(first_element(top))
)

In [11]:
# looking for a news date
# Extract all 'time' tags of each page via times().
ds['times'] = ds['soup'].apply(times)

# If the page had time tags, use them; otherwise fall back to its first
# five sentences with digits.
ds['dates'] = ds.apply(
    lambda rec: rec['times'] or rec['sentswithdigits'][:5],
    axis=1,
)

# Let find_dates_from_list propose dates from those texts.
ds['props_dates'] = ds['dates'].apply(find_dates_from_list)


def _to_naive_midnight(found):
    """Round each proposed date down to the day and drop its tzinfo.

    Returns None when `found` is empty or None.
    """
    if not found:
        return None
    return [
        d.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)
        for d in found
    ]


ds['props_dates'] = ds['props_dates'].apply(_to_naive_midnight)

# Keep proposed dates that are earlier than now and whose year is later than
# 2000, then take the first item of unique() over them.
_now = datetime.datetime.now()  # one reference time for the whole column


def _first_plausible_date(candidates):
    """Return the first in-range date of `candidates`, or None.

    None is returned when `candidates` is empty/None and also when every
    candidate falls outside the accepted range.
    """
    if not candidates:
        return None
    in_range = [d for d in candidates if d < _now and d.year > 2000]
    # Guard before indexing: all candidates may have been filtered out, and
    # indexing an empty result with [0] would abort the whole apply.
    return unique(in_range)[0] if in_range else None


ds['date'] = ds['props_dates'].apply(_first_plausible_date)


In [27]:
# Write the result columns to an Excel file, without the index column.
ds[[
    'client_id',
    'name',
    'website',
    'news_url',
    'date',
    'fround_amount',
]].to_excel(
    'task2_frounds_dataset.xlsx',
    engine='openpyxl',
    index=False,
)