## Processing the Seoul living population dataset

In this notebook, I processed the living population dataset in the seoul open data portal for the data dashboard (index.html). As a result of the notebook, a csv file which contained the average of living populations of each administrative neighborhood (행정동) was created.

In [1]:
import numpy as np
import pandas as pd

In [2]:
import os

In [3]:
dataset_address = os.listdir('data by month')

In [4]:
dataset_dict = {}
for d in dataset_address:
    month = d.replace('.csv','')
    target_ = 'data by month/'+d
    dataset_dict[month] = pd.read_csv(target_)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
def processing_(month, month_dataset):
    df = month_dataset.copy() 
    df = df.groupby(['행정동코드', '시간대구분'],as_index=False).mean()
    male = df.columns.tolist()[4:18]
    female = df.columns.tolist()[18:]
    
    df['male'] = df[male].sum(axis=1)
    df['female'] = df[female].sum(axis=1)
    
    df = df.rename(columns={'행정동코드':'adm_code',
                            '시간대구분':'time',
                            '총생활인구수':'total'})
    
    df = df[['adm_code','time','total','male','female']]
    df = df.astype('int32')
    df = df.pivot(index='adm_code', columns='time',values=['total','male','female'])
    columns = []
    
    for p in df.columns.levels[0]:
        for i in df.columns.levels[1]:
            columns.append(p+'_'+month+'_'+str(i))
            
    df.columns = columns
    df['total_'+month+'_avg']=df[df.columns[:24]].mean(axis=1)
    df['male_'+month+'_avg']=df[df.columns[:24]].mean(axis=1)
    df['female_'+month+'_avg']=df[df.columns[:24]].mean(axis=1)
    return df

In [6]:
for d in dataset_dict.keys():
    print(d)
    dataset_dict[d] = processing_(d,dataset_dict[d]) 

Apr
Aug
Dec
Feb
Jan
Jul
Jun
Mar
May
Nov
Oct
Sept


In [7]:
living_pop = pd.concat([dataset_dict[d] for d in dataset_dict.keys()],axis=1)

In [8]:
coord = pd.read_csv('dong_coord.csv',index_col=0)

In [9]:
coord = coord[['adm_cd2','lat','lng']]

In [10]:
coord.adm_cd2 = coord.adm_cd2//100

In [11]:
living_pop = living_pop.merge(coord,left_index=True, right_on='adm_cd2')

In [12]:
living_pop = living_pop.set_index('adm_cd2')

In [13]:
living_pop

Unnamed: 0_level_0,total_Apr_0,total_Apr_1,total_Apr_2,total_Apr_3,total_Apr_4,total_Apr_5,total_Apr_6,total_Apr_7,total_Apr_8,total_Apr_9,...,female_Sept_19,female_Sept_20,female_Sept_21,female_Sept_22,female_Sept_23,total_Sept_avg,male_Sept_avg,female_Sept_avg,lat,lng
adm_cd2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11110515,15268,15310,15303,15300,15474,15579,16285,18373,20585,21407,...,8302,8010,7594,7374,7653,16705.750000,16705.750000,16705.750000,37.584768,126.969029
11110530,21557,21028,20605,21748,22767,22054,24529,31030,42664,49153,...,17362,15250,12936,10708,8673,30634.083333,30634.083333,30634.083333,37.573898,126.969923
11110540,4643,4582,4628,4632,4658,4773,5161,6451,7760,9197,...,3679,3113,2576,2302,2412,7322.625000,7322.625000,7322.625000,37.586045,126.981531
11110550,14292,13985,13915,13874,13242,13302,13524,13965,14343,14536,...,7494,7643,7743,7877,7225,13677.541667,13677.541667,13677.541667,37.595435,126.964300
11110560,21776,22122,22421,22484,22887,23129,23092,22077,20993,20142,...,10108,10417,10819,11205,12665,20909.375000,20909.375000,20909.375000,37.613675,126.968862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11740650,32598,32270,32175,32062,31481,31264,30783,29990,27737,27354,...,14431,14569,14737,15123,16612,27180.416667,27180.416667,27180.416667,37.534335,127.127824
11740660,29389,30315,30466,30474,31297,31076,30539,29609,28026,27479,...,14871,14942,14842,14828,14117,28364.166667,28364.166667,28364.166667,37.528406,127.133657
11740685,58218,57970,57960,57868,57554,56646,55028,52339,49238,47555,...,29720,30442,31074,31605,31488,54987.458333,54987.458333,54987.458333,37.539865,127.147347
11740690,6211,6121,6100,6123,6025,6153,6176,6652,6776,6780,...,3924,4181,4426,4629,4081,8066.041667,8066.041667,8066.041667,37.522880,127.140035


In [14]:
time_list = [t for t in range(24)]
time_list.append('avg')
test = {}
for i in time_list:
    in_ = str(i)
    not_in = [str(s) for s in range(24) if s != i]
    test[in_] = [k for k in living_pop.columns if in_ in k]
    if in_.isdigit():
        if i<10:
            for n in not_in:
                test[in_]  = [k for k in test[in_] if n not in k]

In [15]:
for t in test.keys():
    living_pop['total_'+'avg_'+t] =living_pop[[t for t in test[t] if 'total' in t]].mean(axis=1)
    living_pop['male_'+'avg_'+t] =living_pop[[t for t in test[t] if 'male' in t]].mean(axis=1)
    living_pop['female_'+'avg_'+t] =living_pop[[t for t in test[t] if 'female' in t]].mean(axis=1)

In [16]:
living_pop['gu'] = living_pop.index // 1000

In [17]:
living_pop.head()

Unnamed: 0_level_0,total_Apr_0,total_Apr_1,total_Apr_2,total_Apr_3,total_Apr_4,total_Apr_5,total_Apr_6,total_Apr_7,total_Apr_8,total_Apr_9,...,total_avg_22,male_avg_22,female_avg_22,total_avg_23,male_avg_23,female_avg_23,total_avg_avg,male_avg_avg,female_avg_avg,gu
adm_cd2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11110515,15268,15310,15303,15300,15474,15579,16285,18373,20585,21407,...,14008.25,6983.666667,7330.916667,14821.666667,7390.208333,7946.166667,17081.635417,17081.635417,17081.635417,11110
11110530,21557,21028,20605,21748,22767,22054,24529,31030,42664,49153,...,23843.583333,11904.583333,12148.5,19417.166667,9692.625,9820.583333,34088.340278,34088.340278,34088.340278,11110
11110540,4643,4582,4628,4632,4658,4773,5161,6451,7760,9197,...,4686.916667,2340.583333,2230.583333,4816.166667,2404.75,2294.75,7252.423611,7252.423611,7252.423611,11110
11110550,14292,13985,13915,13874,13242,13302,13524,13965,14343,14536,...,14693.333333,7333.041667,8122.083333,13536.833333,6756.5,7489.5,13851.381944,13851.381944,13851.381944,11110
11110560,21776,22122,22421,22484,22887,23129,23092,22077,20993,20142,...,19668.5,9811.083333,10915.333333,21708.916667,10828.208333,11966.166667,20080.697917,20080.697917,20080.697917,11110


In [19]:
living_pop.to_csv('living_pop_neighborhood.csv')

In [34]:
living_pop_total = living_pop.sum(axis=0)

In [35]:
living_pop_total = living_pop_total.drop('gu')

In [36]:
living_pop_total.to_csv('test.csv')

  """Entry point for launching an IPython kernel.
