Created 04/02/2019

Last update 04/02/2019

Code that analyses the population of Ille-et-Vilaine

# 1. Import libraries

In [1]:
import pymongo
from __future__ import division
import numpy as np
import pandas as pd
from bokeh.plotting import figure
from bokeh.io import show, output_notebook
from bokeh.layouts import row
output_notebook() # To display plot into Jupyter notebook
import Fcn_BasicStatisticalAnalysisOfPopulation

# 2. Data loading

In [2]:
# --- User inputs: which database / collection to read ---
DatabaseName = 'GeoApiGouv'
CollectionName = 'PopulationCity'

# --- Connection to the local MongoDB server ---
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient[DatabaseName]

# --- Query all documents from MongoDB ---
# The empty filter matches every document; the cursor is materialised into a
# list so the following cells can work on the data in memory.
collection = mydb[CollectionName]
QueryMongo = list(collection.find({}))

# 3. Statistical analysis

## 3.1. Arrange data loaded

In [3]:
# One MongoDB document per city: split the documents into parallel lists
# (same order as QueryMongo, so index i always refers to the same city).
NumberOfCity      = len(QueryMongo)
AllCityNames      = [city['nom'] for city in QueryMongo]
AllPopulationList = [city['population'] for city in QueryMongo]
AllAreaList       = [city['surface'] for city in QueryMongo]

# Density in inhabitants per km2. The area statistics cell treats 'surface' as
# hectares, hence the factor 100 (1 km2 = 100 ha).
# np.true_divide guarantees a floating-point division even if the
# `from __future__ import division` flag is not active in the running kernel.
AllDensityList    = np.true_divide(AllPopulationList, AllAreaList) * 100

In [4]:
# Gather the per-city lists into a single table (one row per city).
df = pd.DataFrame({
    "City name": AllCityNames,
    "Population": AllPopulationList,
    "Area": AllAreaList,
    "Density": AllDensityList,
})

# Show the top 3 cities for the chosen column.
# For another ranking (e.g. the bottom 3) change "by=" and "ascending=" accordingly.
ranked = df.sort_values(by=['Density'], ascending=False)
ranked.head(3)

Unnamed: 0,Area,City name,Density,Population
227,5038,Rennes,4195.57364,211373
111,1043,Fougères,1933.844679,20170
21,56,Bécherel,1289.285714,722


## 3.2. Population analysis

In [5]:
TotalPopulation   = sum(AllPopulationList)
AveragePopulation = int(np.mean(AllPopulationList)) # Population moyenne 
MedianPopulation  = int(np.median(AllPopulationList)) # Population mediane
MinPopulation     = np.min(AllPopulationList) # Population la plus faible
MaxPopulation     = np.max(AllPopulationList) # Population la plus élevée

print 'Nombre de ville: ' + "{:,}".format(NumberOfCity)
print 'Population totale: ' + "{:,}".format(TotalPopulation)
print 'Population moyenne par ville: '+ "{:,}".format(AveragePopulation)
print 'Population mediane par ville: '+ "{:,}".format(MedianPopulation)
print 'Population la plus faible: '+ "{:,}".format(MinPopulation) + ' (' + str(list(mydb[CollectionName].find({'population': MinPopulation},{'nom':1,'_id':0}))[0]['nom'].encode('utf-8')) + ')'
print 'Population la plus élevée: '+ "{:,}".format(MaxPopulation) + ' (' + str(list(mydb[CollectionName].find({'population': MaxPopulation},{'nom':1,'_id':0}))[0]['nom'].encode('utf-8')) + ')'

Nombre de ville: 345
Population totale: 1,019,923
Population moyenne par ville: 2,956
Population mediane par ville: 1,314
Population la plus faible: 107 (Bléruais)
Population la plus élevée: 211,373 (Rennes)


### 3.2.1. Population histogram

In [6]:
# Prepare the data for plot
# Short alias for the project helper module (its functions are defined outside
# this notebook).
stats_helpers = Fcn_BasicStatisticalAnalysisOfPopulation

# Histogram bounds derived from the extreme populations, then the bins built
# from those bounds (presumably logarithmic, judging by the helper names --
# confirm in the helper module).
start = stats_helpers.DefineMinMaxLog(MinPopulation, 'Min')
stop  = stats_helpers.DefineMinMaxLog(MaxPopulation, 'Max')
bins  = stats_helpers.CreateLogBin(start, stop)

# Count the cities falling into each bin
hist, bin_edges = np.histogram(AllPopulationList, bins=bins)

# Remove the first and last value when hist is 0 - Comment out this line to show the full log scale
hist, bin_edges = stats_helpers.RemoveLastFirstValuesWhenZero(hist, bin_edges)

In [7]:
# Plot histogram
# Bars are drawn at consecutive integer positions (0, 1, 2, ...) so that every
# bin has the same width on screen; the real bin edges only appear as tick labels.
tick_positions = list(range(0, len(bin_edges)))

p1 = figure(plot_height=300, title="City population",
            x_axis_label='City population bins', y_axis_label='Count')
p1.quad(top=list(hist), bottom=0,
        left=tick_positions[:-1], right=tick_positions[1:],
        fill_color="#0276FD", line_color="#5F5B5B", alpha=1)

# x axis: one tick per bin edge, labelled with the edge value, rotated ~45 degrees
p1.xaxis.ticker = tick_positions
p1.xaxis.major_label_orientation = 3.14/4
p1.xaxis.major_label_overrides = {pos: "{:,}".format(bin_edges[pos]) for pos in tick_positions}

# Title styling
p1.title.text_font_size = '12pt'
p1.title.text_color = "#0276FD"
p1.title.text_font = "verdana"

# Same fonts on both axes
for axis in (p1.xaxis, p1.yaxis):
    axis.axis_label_text_font = "verdana"
    axis.axis_label_text_font_style = "normal"
    axis.major_label_text_font = "verdana"

show(p1)

### 3.2.2. Population cumulative distribution

In [8]:
# Plot cumulative distribution
# Bin edges are placed at consecutive integer positions; the real edge values
# are shown as tick labels.
tick_positions = list(range(0, len(bin_edges)))

# Cumulative share of cities at each bin edge, starting from 0 at the first edge.
# - The cumulative sum is computed once (it used to be recomputed for every bin).
# - astype(float) forces a true division whatever the kernel's division mode.
# - Values are expressed in percent so that they match the "[%ile]" axis label.
cumulative_percent = [0] + (np.cumsum(hist).astype(float) / NumberOfCity * 100).tolist()

p2 = figure(plot_height=300, title="City population distribution",
            x_axis_label='City population bins', y_axis_label='Percentile [%ile]')
p2.step(tick_positions, cumulative_percent,
        line_color="#0276FD", line_width=2)

# x axis: one tick per bin edge, labelled with the edge value, rotated ~45 degrees
p2.xaxis.ticker = tick_positions
p2.xaxis.major_label_orientation = 3.14/4
p2.xaxis.major_label_overrides = {pos: "{:,}".format(bin_edges[pos]) for pos in tick_positions}

# Title and axis styling
p2.title.text_font_size = '12pt'
p2.title.text_color = "#0276FD"
p2.title.text_font = "verdana"
p2.yaxis.axis_label_text_font = "verdana"
p2.yaxis.axis_label_text_font_style = "normal"
p2.xaxis.axis_label_text_font = "verdana"
p2.xaxis.axis_label_text_font_style = "normal"
p2.xaxis.major_label_text_font = "verdana"
p2.yaxis.major_label_text_font = "verdana"

show(p2)

## 3.3. Area analysis

In [9]:
#Statistics about area
TotalArea = sum(AllAreaList)/100 # km2, originally surface in hectare
AverageArea = int(np.mean(AllAreaList)) # Surface moyenne 
MedianArea = int(np.median(AllAreaList)) # Surface mediane
MinArea = np.min(AllAreaList) # Surface la plus faible
MaxArea = np.max(AllAreaList) # Surface la plus élevée

print 'Surface totale: ' + "{:,}".format(TotalArea) + ' km' + (u"\u00b2").encode('utf-8') 
print 'Surface moyenne: '+ "{:,}".format(AverageArea/100) + ' km' + (u"\u00b2").encode('utf-8') 
print 'Surface mediane: '+ "{:,}".format(MedianArea/100) + ' km' + (u"\u00b2").encode('utf-8') 
print 'Surface la plus faible: '+ "{:,}".format(MinArea/100) + ' km' + (u"\u00b2").encode('utf-8') + ' (' + str(list(mydb[CollectionName].find({'surface': MinArea},{'nom':1,'_id':0}))[0]['nom'].encode('utf-8')) + ')'
print 'Surface la plus élevée: '+ "{:,}".format(MaxArea/100) + ' km' + (u"\u00b2").encode('utf-8')  + ' (' + str(list(mydb[CollectionName].find({'surface': MaxArea},{'nom':1,'_id':0}))[0]['nom'].encode('utf-8')) + ')'

Surface totale: 6,840.54 km²
Surface moyenne: 19.82 km²
Surface mediane: 15.81 km²
Surface la plus faible: 0.56 km² (Bécherel)
Surface la plus élevée: 110.66 km² (Paimpont)


### 3.3.1. Area histogram

In [10]:
# Prepare the data for plot
# Short alias for the project helper module (its functions are defined outside
# this notebook).
stats_helpers = Fcn_BasicStatisticalAnalysisOfPopulation

# Histogram bounds derived from the extreme areas, then the bins built from
# those bounds (presumably logarithmic, judging by the helper names -- confirm
# in the helper module).
start = stats_helpers.DefineMinMaxLog(MinArea, 'Min')
stop  = stats_helpers.DefineMinMaxLog(MaxArea, 'Max')
bins  = stats_helpers.CreateLogBin(start, stop)

# Count the cities falling into each bin
hist, bin_edges = np.histogram(AllAreaList, bins=bins)

# Remove the first and last value when hist is 0 - Comment out this line to show the full log scale
hist, bin_edges = stats_helpers.RemoveLastFirstValuesWhenZero(hist, bin_edges)

In [11]:
# Plot histogram
# Bars are drawn at consecutive integer positions (0, 1, 2, ...) so that every
# bin has the same width on screen; the real bin edges only appear as tick labels.
tick_positions = list(range(0, len(bin_edges)))

# Tick labels: bin edges divided by 100 (the axis is labelled in km2); values
# below 1 keep their decimals, the others are shown as whole numbers.
area_tick_labels = {}
for pos in tick_positions:
    edge_km2 = bin_edges[pos]/100
    if edge_km2 < 1:
        area_tick_labels[pos] = "{:,}".format(edge_km2)
    else:
        area_tick_labels[pos] = "{:,}".format(int(edge_km2))

p1 = figure(plot_height=300, title="City area",
            x_axis_label='City area bins'+ ' [km' + (u"\u00b2]").encode('utf-8'),
            y_axis_label='Count [#]')
p1.quad(top=list(hist), bottom=0,
        left=tick_positions[:-1], right=tick_positions[1:],
        fill_color="#0276FD", line_color="#5F5B5B", alpha=1)

# x axis: one tick per bin edge, rotated ~45 degrees
p1.xaxis.ticker = tick_positions
p1.xaxis.major_label_orientation = 3.14/4
p1.xaxis.major_label_overrides = area_tick_labels

# Title styling
p1.title.text_font_size = '12pt'
p1.title.text_color = "#0276FD"
p1.title.text_font = "verdana"

# Same fonts on both axes
for axis in (p1.xaxis, p1.yaxis):
    axis.axis_label_text_font = "verdana"
    axis.axis_label_text_font_style = "normal"
    axis.major_label_text_font = "verdana"

show(p1)

### 3.3.2. Area cumulative distribution

In [12]:
# Plot cumulative distribution
# Bin edges are placed at consecutive integer positions; the real edge values
# are shown as tick labels.
tick_positions = list(range(0, len(bin_edges)))

# Cumulative share of cities at each bin edge, starting from 0 at the first edge.
# - The cumulative sum is computed once (it used to be recomputed for every bin).
# - astype(float) forces a true division whatever the kernel's division mode.
# - Values are expressed in percent so that they match the "[%ile]" axis label.
cumulative_percent = [0] + (np.cumsum(hist).astype(float) / NumberOfCity * 100).tolist()

p2 = figure(plot_height=300, title="City area distribution",
            x_axis_label='City area bins'+ ' [km' + (u"\u00b2]").encode('utf-8'),
            y_axis_label='Percentile [%ile]')
p2.step(tick_positions, cumulative_percent,
        line_color="#0276FD", line_width=2)

# x axis: one tick per bin edge, rotated ~45 degrees. Labels are the bin edges
# divided by 100 (axis labelled in km2); values below 1 keep their decimals,
# the others are shown as whole numbers.
p2.xaxis.ticker = tick_positions
p2.xaxis.major_label_orientation = 3.14/4
p2.xaxis.major_label_overrides = {pos:"{:,}".format((bin_edges[pos]/100)) if (bin_edges[pos]/100) < 1 else "{:,}".format(int(bin_edges[pos]/100)) for pos in tick_positions}

# Title and axis styling
p2.title.text_font_size = '12pt'
p2.title.text_color = "#0276FD"
p2.title.text_font = "verdana"
p2.yaxis.axis_label_text_font = "verdana"
p2.yaxis.axis_label_text_font_style = "normal"
p2.xaxis.axis_label_text_font = "verdana"
p2.xaxis.axis_label_text_font_style = "normal"
p2.xaxis.major_label_text_font = "verdana"
p2.yaxis.major_label_text_font = "verdana"

show(p2)

## 3.4. Population density analysis

In [13]:
#Statistics about area
TotalDensity = int(TotalPopulation/TotalArea) # pop/km2
AverageDensity = int(np.mean(AllDensityList)) # Densité moyenne 
MedianDensity = int(np.median(AllDensityList)) # Densité mediane
MinDensity = int(np.min(AllDensityList)) # Densité la plus faible
MaxDensity = int(np.max(AllDensityList)) # Densité la plus élevée

print 'Densité totale: ' + "{:,}".format(TotalDensity) + ' pop/km' + (u"\u00b2").encode('utf-8') 
print 'Densité moyenne: '+ "{:,}".format(AverageDensity) + ' pop/km' + (u"\u00b2").encode('utf-8') 
print 'Densité mediane: '+ "{:,}".format(MedianDensity) + ' pop/km' + (u"\u00b2").encode('utf-8') 
print 'Densité la plus faible: '+ "{:,}".format(MinDensity) + ' pop/km' + (u"\u00b2").encode('utf-8') #+ ' (' + str(list(mydb[CollectionName].find({'surface': MinDensity},{'nom':1,'_id':0}))[0]['nom'].encode('utf-8')) + ')'
print 'Densité la plus élevée: '+ "{:,}".format(MaxDensity) + ' pop/km' + (u"\u00b2").encode('utf-8') # + ' (' + str(list(mydb[CollectionName].find({'surface': MaxDensity},{'nom':1,'_id':0}))[0]['nom'].encode('utf-8')) + ')'

Densité totale: 149 pop/km²
Densité moyenne: 155 pop/km²
Densité mediane: 74 pop/km²
Densité la plus faible: 14 pop/km²
Densité la plus élevée: 4,195 pop/km²


### 3.4.1. Population density histogram

In [14]:
# Prepare the data for plot
# Short alias for the project helper module (its functions are defined outside
# this notebook).
stats_helpers = Fcn_BasicStatisticalAnalysisOfPopulation

# Histogram bounds derived from the extreme densities (already truncated to
# integers in the statistics cell), then the bins built from those bounds
# (presumably logarithmic, judging by the helper names -- confirm in the helper module).
start = stats_helpers.DefineMinMaxLog(MinDensity, 'Min')
stop  = stats_helpers.DefineMinMaxLog(MaxDensity, 'Max')
bins  = stats_helpers.CreateLogBin(start, stop)

# Count the cities falling into each bin
hist, bin_edges = np.histogram(AllDensityList, bins=bins)

# Remove the first and last value when hist is 0 - Comment out this line to show the full log scale
hist, bin_edges = stats_helpers.RemoveLastFirstValuesWhenZero(hist, bin_edges)

In [15]:
# Plot histogram
# Bars are drawn at consecutive integer positions (0, 1, 2, ...) so that every
# bin has the same width on screen; the real bin edges only appear as tick labels.
tick_positions = list(range(0, len(bin_edges)))

# Tick labels: edges below 1 keep their decimals, the others are shown as
# whole numbers.
density_tick_labels = {}
for pos in tick_positions:
    edge = bin_edges[pos]
    if edge < 1:
        density_tick_labels[pos] = "{:,}".format(edge)
    else:
        density_tick_labels[pos] = "{:,}".format(int(edge))

p1 = figure(plot_height=300, title="City population density",
            x_axis_label='City population density bins'+ ' [pop/km' + (u"\u00b2]").encode('utf-8'),
            y_axis_label='Count [#]')
p1.quad(top=list(hist), bottom=0,
        left=tick_positions[:-1], right=tick_positions[1:],
        fill_color="#0276FD", line_color="#5F5B5B", alpha=1)

# x axis: one tick per bin edge, rotated ~45 degrees
p1.xaxis.ticker = tick_positions
p1.xaxis.major_label_orientation = 3.14/4
p1.xaxis.major_label_overrides = density_tick_labels

# Title styling
p1.title.text_font_size = '12pt'
p1.title.text_color = "#0276FD"
p1.title.text_font = "verdana"

# Same fonts on both axes
for axis in (p1.xaxis, p1.yaxis):
    axis.axis_label_text_font = "verdana"
    axis.axis_label_text_font_style = "normal"
    axis.major_label_text_font = "verdana"

show(p1)

### 3.4.2. Population density cumulative distribution

In [16]:
# Plot cumulative distribution
# Bin edges are placed at consecutive integer positions; the real edge values
# are shown as tick labels.
tick_positions = list(range(0, len(bin_edges)))

# Cumulative share of cities at each bin edge, starting from 0 at the first edge.
# - The cumulative sum is computed once (it used to be recomputed for every bin).
# - astype(float) forces a true division whatever the kernel's division mode.
# - Values are expressed in percent so that they match the "[%ile]" axis label.
cumulative_percent = [0] + (np.cumsum(hist).astype(float) / NumberOfCity * 100).tolist()

p2 = figure(plot_height=300, title="City population density distribution",
            x_axis_label='City population density bins'+ ' [pop/km' + (u"\u00b2]").encode('utf-8'),
            y_axis_label='Percentile [%ile]')
p2.step(tick_positions, cumulative_percent,
        line_color="#0276FD", line_width=2)

# x axis: one tick per bin edge, rotated ~45 degrees. Edges below 1 keep their
# decimals, the others are shown as whole numbers.
p2.xaxis.ticker = tick_positions
p2.xaxis.major_label_orientation = 3.14/4
p2.xaxis.major_label_overrides = {pos:"{:,}".format((bin_edges[pos])) if (bin_edges[pos]) < 1 else "{:,}".format(int(bin_edges[pos])) for pos in tick_positions}

# Title and axis styling
p2.title.text_font_size = '12pt'
p2.title.text_color = "#0276FD"
p2.title.text_font = "verdana"
p2.yaxis.axis_label_text_font = "verdana"
p2.yaxis.axis_label_text_font_style = "normal"
p2.xaxis.axis_label_text_font = "verdana"
p2.xaxis.axis_label_text_font_style = "normal"
p2.xaxis.major_label_text_font = "verdana"
p2.yaxis.major_label_text_font = "verdana"

show(p2)