In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime

%matplotlib inline



In [2]:
def dateparse(date) -> str:
    date = date.split("-")
    return "%s-%s" % (date[0], date[1])

In [3]:
data = pd.read_csv(
    "../data/kaggle/users.csv",
    parse_dates=["join_date"],
    date_parser=dateparse
)

In [4]:
data.head()

Unnamed: 0,city,country,join_date,twitter
0,,Japan,2011-02-01,smly
1,Moscow,Russian Federation,2014-12-01,
2,,,2014-08-01,
3,San Francisco,United States,2012-10-01,
4,,Israel,2012-11-01,


In [5]:
data.describe()

Unnamed: 0,city,country,join_date,twitter
count,1764,2693,3601,424
unique,551,85,89,423
top,Moscow,United States,2012-03-01 00:00:00,test
freq,127,852,86,2
first,,,2010-02-01 00:00:00,
last,,,2017-06-01 00:00:00,


In [6]:
sort_countries = data.groupby("country").size().to_frame("Total").sort_values("Total", ascending=False)

In [7]:
print(sort_countries.to_string())

                      Total
country                    
United States           852
Russian Federation      223
China                   149
India                   145
France                  125
United Kingdom          117
Germany                 106
Canada                   78
Australia                76
Japan                    74
Netherlands              59
Singapore                55
Spain                    52
Poland                   51
Ukraine                  40
Brazil                   34
Italy                    31
Israel                   28
Belgium                  27
Taiwan                   27
Switzerland              24
Sweden                   20
Finland                  18
Greece                   18
Portugal                 15
Czech Republic           14
Hungary                  13
South Africa             13
Belarus                  12
South Korea              11
Ireland                  10
Turkey                   10
New Zealand              10
Austria             

In [8]:
# get data about countries and population
import json

with open('data/kaggle/population.json') as data_file:    
    population = json.load(data_file)
    
population_dict = {}
for x in population:
    population_dict[x["country"]] = x["population"]


In [9]:

sort_countries["country"] = sort_countries.index.get_level_values('country') 
sort_countries.head()

Unnamed: 0_level_0,Total,country
country,Unnamed: 1_level_1,Unnamed: 2_level_1
United States,852,United States
Russian Federation,223,Russian Federation
China,149,China
India,145,India
France,125,France


In [10]:
# population_df = sort_countries.assign(
#     population=lambda x: print(x["country"][0]))
# population_df.head()

sort_countries["population"] = sort_countries["country"].map(population_dict).astype(int)
sort_countries.head()

Unnamed: 0_level_0,Total,country,population
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
United States,852,United States,278357000
Russian Federation,223,Russian Federation,146934000
China,149,China,1277558000
India,145,India,1013662000
France,125,France,59225700


In [11]:
countries_population = sort_countries.assign(
     per_capita=lambda x: round(x["Total"] / x["population"] * 10000000, 2))
countries_population


Unnamed: 0_level_0,Total,country,population,per_capita
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
United States,852,United States,278357000,30.61
Russian Federation,223,Russian Federation,146934000,15.18
China,149,China,1277558000,1.17
India,145,India,1013662000,1.43
France,125,France,59225700,21.11
United Kingdom,117,United Kingdom,59623400,19.62
Germany,106,Germany,82164700,12.90
Canada,78,Canada,31147000,25.04
Australia,76,Australia,23840314,31.88
Japan,74,Japan,126714000,5.84


In [12]:
with open('data/kaggle/users_created.json', 'w') as f:
    f.write(countries_population.to_json(orient='records'))


In [61]:

def get_3_code_country(country):
    if (country == "South Korea"): 
        return "KOR"
    elif (country == "North Korea"):
        return "PRK"
    else:
        url = 'https://restcountries.eu/rest/v2/name/' + quote(country, safe='')
        r = requests.get(url)
        response = r.json()
        try:
            return response[0]['alpha3Code']
        except KeyError:
            return None


In [63]:
import requests
from urllib.parse import quote

data_javascript_map = countries_population.copy()
countries_population['iso_3'] = countries_population.country.apply(get_3_code_country)

countries_population



Unnamed: 0_level_0,Total,country,population,per_capita,iso_3
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
United States,852,United States,278357000,30.61,UMI
Russian Federation,223,Russian Federation,146934000,15.18,RUS
China,149,China,1277558000,1.17,CHN
India,145,India,1013662000,1.43,IOT
France,125,France,59225700,21.11,FRA
United Kingdom,117,United Kingdom,59623400,19.62,GBR
Germany,106,Germany,82164700,12.90,DEU
Canada,78,Canada,31147000,25.04,CAN
Australia,76,Australia,23840314,31.88,AUS
Japan,74,Japan,126714000,5.84,JPN


In [64]:
with open('data/kaggle/users_created.json', 'w') as f:
    f.write(countries_population[["country","iso_3","Total","per_capita"]].to_json(orient='records'))
