In [1]:
from preprocess.util.iterate_files import iterate_files
import os.path as path
from collections import Counter

In [2]:
def get_data_info(dir, n):
    """Collect summary statistics from the frames supplied by ``iterate_files``.

    ``iterate_files`` is invoked as ``iterate_files(callback, dir, n)`` and is
    expected to call ``callback(file_name, df)`` for every frame it loads. The
    exact meaning of ``dir`` and ``n`` is defined by that helper (presumably
    the directory to scan and a cap on the number of files -- confirm there).

    Each row of ``df`` must provide ``row['info']`` with the keys
    ``'articleCounts'`` and ``'multiLingInfo'``; ``'location'`` is optional
    and may be ``None``.

    Returns:
        dict with four entries:
            'n_events'  -- total number of rows over all frames,
            'locations' -- ``Counter`` of the ``location['labelEng']`` values,
            'langs'     -- for each key of ``multiLingInfo``, the number of
                           rows in which that key appears,
            'counts'    -- for each key of ``articleCounts``, the sum of its
                           values over all rows.
    """
    row_total = 0
    place_labels = []
    rows_per_lang = {}
    articles_per_key = {}

    def bump(table, key, amount):
        # A key seen for the first time is stored as-is; afterwards we add to it.
        if key not in table:
            table[key] = amount
        else:
            table[key] += amount

    def consume(file_name, df):
        nonlocal row_total
        row_total += len(df)

        for _, record in df.iterrows():
            info = record['info']
            article_totals = info['articleCounts']
            multiling_info = info['multiLingInfo']
            place = info['location'] if 'location' in info else None

            if place is not None and 'labelEng' not in place:
                # Location without an English label: dump it for inspection.
                print(place)
            elif place is not None:
                place_labels.append(place['labelEng'])

            for key, amount in article_totals.items():
                bump(articles_per_key, key, amount)

            for key in multiling_info.keys():
                bump(rows_per_lang, key, 1)

    iterate_files(consume, dir, n)

    return dict(
        n_events=row_total,
        locations=Counter(place_labels),
        langs=rows_per_lang,
        counts=articles_per_key,
    )

In [3]:
# Resolve the preprocessed-data folder relative to the current working directory.
relative_dir = "../../data/preprocessed"
dir_path = path.abspath(relative_dir)

# The second argument is forwarded to iterate_files as `n`
# (presumably a cap on the number of files read -- confirm in iterate_files).
data = get_data_info(dir_path, 3000)

Processing:   3%|█▋                                              | 102/2944 [00:03<01:43, 27.58it/s]


KeyboardInterrupt: 

In [97]:
# Bind the four aggregates returned by get_data_info to short names.
# Note the 'counts' entry is exposed as `article_counts`.
n_events, locations, langs, article_counts = (
    data[key] for key in ('n_events', 'locations', 'langs', 'counts')
)

In [68]:
# Total number of rows (events) summed over every frame that was processed.
n_events

2632115

In [93]:
# Location distribution: share of each of the 100 most frequent location labels.
most_common = locations.most_common(100)
# Denominator counts only events that carried a location label.
total_events = sum(locations.values())
total_per = 0
for name, counts in most_common:
    share = counts / total_events * 100
    print(f"{name}: {share:.2f}%")
    total_per += share

# Combined coverage of the labels listed above.
print("-----------")
print(f"Total: {total_per:.2f}%")

United States: 24.59%
Germany: 10.67%
Spain: 7.80%
United Kingdom: 5.56%
Canada: 2.46%
India: 2.27%
France: 2.09%
Russia: 2.04%
Mexico: 1.75%
Austria: 1.73%
Australia: 1.71%
Italy: 1.61%
China: 1.49%
Argentina: 1.44%
Switzerland: 1.26%
Japan: 1.14%
Brazil: 1.06%
Venezuela: 1.00%
Israel: 0.83%
Chile: 0.80%
Belgium: 0.59%
Syria: 0.51%
South Africa: 0.51%
Peru: 0.49%
Pakistan: 0.48%
Greece: 0.47%
Colombia: 0.45%
Nigeria: 0.43%
Hong Kong: 0.41%
New Zealand: 0.41%
Kiev: 0.37%
Republic of Ireland: 0.35%
Ukraine: 0.34%
Cuba: 0.34%
Turkey: 0.32%
Philippines: 0.32%
Taiwan: 0.32%
United Arab Emirates: 0.30%
Singapore: 0.30%
Iran: 0.29%
Portugal: 0.29%
Kenya: 0.29%
Bolivia: 0.28%
Malaysia: 0.28%
Egypt: 0.27%
Mesopotamia: 0.27%
Netherlands: 0.26%
Thailand: 0.24%
Palestine: 0.22%
Saudi Arabia: 0.21%
Dominican Republic: 0.21%
Afghanistan: 0.20%
London: 0.20%
Sweden: 0.19%
Lebanon: 0.19%
Uruguay: 0.19%
Korea: 0.19%
Ecuador: 0.18%
North Korea: 0.17%
Beijing: 0.16%
Serbia: 0.16%
Vatican City: 0.16%
Lib

In [70]:
# Per-language tally (rows whose multiLingInfo contains that language key),
# shown together with the number of distinct language keys.
langs, len(langs)

({'eng': 1499144,
  'spa': 483132,
  'zho': 174710,
  'deu': 406045,
  'slv': 13099,
  'ita': 13362,
  'hrv': 7874,
  'rus': 25075,
  'fra': 32509,
  'por': 14514,
  'cat': 1312,
  'tur': 5760,
  'ara': 9776,
  'srp': 2068},
 14)

In [94]:
# Share of each language among all (event, language) occurrences.
total_langs = sum(langs.values())
for lang in langs:
    count = langs[lang]
    share = count / total_langs * 100
    print(f"{lang}: {share:.2f}%")

eng: 55.76%
spa: 17.97%
zho: 6.50%
deu: 15.10%
slv: 0.49%
ita: 0.50%
hrv: 0.29%
rus: 0.93%
fra: 1.21%
por: 0.54%
cat: 0.05%
tur: 0.21%
ara: 0.36%
srp: 0.08%


In [99]:
# Summed articleCounts values per key; besides the language keys this
# includes the aggregate 'total' key.
article_counts

{'total': 33549078,
 'eng': 22264068,
 'spa': 4404404,
 'zho': 1701025,
 'deu': 3940879,
 'slv': 113851,
 'ita': 128387,
 'hrv': 70323,
 'rus': 243206,
 'fra': 370115,
 'por': 142587,
 'cat': 12125,
 'tur': 52789,
 'ara': 85793,
 'srp': 19526}

In [100]:
# 'total' appears to be an aggregate entry, so it is subtracted to leave the
# sum over the individual languages as the denominator.
total_counts = sum(article_counts.values()) - article_counts['total']
# The 'total' row itself is still printed, as its share of that denominator.
for lang in article_counts:
    count = article_counts[lang]
    share = count / total_counts * 100
    print(f"{lang}: {share:.2f}%")

total: 100.00%
eng: 66.36%
spa: 13.13%
zho: 5.07%
deu: 11.75%
slv: 0.34%
ita: 0.38%
hrv: 0.21%
rus: 0.72%
fra: 1.10%
por: 0.43%
cat: 0.04%
tur: 0.16%
ara: 0.26%
srp: 0.06%
