Created 04/02/2019

Last update 04/02/2019

Code that analyses the population of Ille-et-Vilaine

# 1. Import libraries

In [1]:
# `from __future__` imports must be the very first statement (PEP 236); keeping
# it on top lets this cell also run when the notebook is exported as a script.
from __future__ import division

# Third-party libraries
import numpy as np
import pandas as pd
import pymongo
from bokeh.io import output_notebook, show
from bokeh.layouts import row
from bokeh.plotting import figure

# Project-local helpers used below for the histogram binning and the distance computation
import Fcn_BasicStatisticalAnalysisOfPopulation

output_notebook()  # To display plots inside the Jupyter notebook

# 2. Data loading

In [2]:
# User inputs: database and collection that hold the city documents
DatabaseName, CollectionName = 'GeoApiGouv', 'PopulationCity'

# Open a client on the local MongoDB server and select the database
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient[DatabaseName]

# Query all documents of the collection (empty filter) and load them into a list
AllDocumentsCursor = mydb[CollectionName].find({})
QueryMongo = list(AllDocumentsCursor)

# 3. Statistical analysis

## 3.1. Arrange data loaded

In [3]:
# Flatten the MongoDB documents into one list per field of interest
NumberOfCity      = len(QueryMongo)
AllCityNames      = [City['nom'] for City in QueryMongo]
AllPopulationList = [City['population'] for City in QueryMongo]
AllAreaList       = [City['surface'] for City in QueryMongo]

# Density = population / area * 100, computed element-wise on arrays.
# NOTE(review): the area section treats 'surface' as hectares, in which case
# the factor 100 turns the ratio into inhabitants per km2 - confirm the unit.
PopulationArray   = np.array(AllPopulationList)
AreaArray         = np.array(AllAreaList)
AllDensityList    = PopulationArray / AreaArray * 100

In [4]:
# Gather the per-city lists into a single table, then display the three cities
# with the highest density. For another ranking (e.g. the bottom 3), change the
# sorting column ("by") and the "ascending" flag accordingly.
df = pd.DataFrame({
    "City name": AllCityNames,
    "Population": AllPopulationList,
    "Area": AllAreaList,
    "Density": AllDensityList,
})
df.sort_values(by='Density', ascending=False).head(3)

Unnamed: 0,Area,City name,Density,Population
227,5038,Rennes,4195.57364,211373
111,1043,Fougères,1933.844679,20170
21,56,Bécherel,1289.285714,722


## 3.2. Population analysis

In [44]:
# Summary statistics of the city populations
TotalPopulation   = df['Population'].sum()          # Total population
AveragePopulation = int(df['Population'].mean())    # Mean population per city (truncated to an integer)
MedianPopulation  = int(df['Population'].median())  # Median population per city (truncated to an integer)
MinPopulation     = df['Population'].min()          # Smallest population
MaxPopulation     = df['Population'].max()          # Largest population

# Name(s) of the cities holding the extreme values (one line per city if tied)
LeastPopulatedCity = df.loc[df['Population'] == MinPopulation, 'City name'].to_string(index=False)
MostPopulatedCity  = df.loc[df['Population'] == MaxPopulation, 'City name'].to_string(index=False)

# The messages are built as text (unicode) and never encoded to bytes:
# concatenating str with the result of .encode('utf-8') only works on Python 2
# and raises TypeError on Python 3. The printed text itself is unchanged.
print('Nombre de ville: ' + "{:,}".format(NumberOfCity))
print('Population totale: ' + "{:,}".format(TotalPopulation))
print('Population moyenne par ville: ' + "{:,}".format(AveragePopulation))
print('Population mediane par ville: ' + "{:,}".format(MedianPopulation))
print(u'Population la plus faible: ' + "{:,}".format(MinPopulation) + u' (' + LeastPopulatedCity + u')')
print(u'Population la plus élevée: ' + "{:,}".format(MaxPopulation) + u' (' + MostPopulatedCity + u')')

Nombre de ville: 345
Population totale: 1,019,923
Population moyenne par ville: 2,956
Population mediane par ville: 1,314
Population la plus faible: 107 (Bléruais)
Population la plus élevée: 211,373 (Rennes)


### 3.2.1. Population histogram

In [45]:
# Build the histogram of the city populations.
# The bin limits come from the project helper module.
# NOTE(review): presumably DefineMinMaxLog widens the observed min/max to
# log-scale bounds and CreateLogBin returns the bin edges in between - confirm
# against Fcn_BasicStatisticalAnalysisOfPopulation.
BinTools = Fcn_BasicStatisticalAnalysisOfPopulation
start = BinTools.DefineMinMaxLog(MinPopulation, 'Min')
stop = BinTools.DefineMinMaxLog(MaxPopulation, 'Max')
bins = BinTools.CreateLogBin(start, stop)
hist, bin_edges = np.histogram(list(df['Population']), bins=bins)

# Drop the first and last values when their count is 0 - comment out this line
# to show the whole log scale
hist, bin_edges = BinTools.RemoveLastFirstValuesWhenZero(hist, bin_edges)

In [46]:
# Histogram of the city populations: one bar per bin, drawn on evenly spaced
# positions (0, 1, 2, ...) whose ticks are relabelled with the real bin edges.
EdgePositions = list(range(0, len(bin_edges)))

p1 = figure(plot_height=300, title="City population",
            x_axis_label='City population bins', y_axis_label='Count')
p1.quad(top=list(hist), bottom=0, left=EdgePositions[:-1], right=EdgePositions[1:],
        fill_color="#0276FD", line_color="#5F5B5B", alpha=1)

# One tilted tick per bin edge, showing the edge value with thousands separators
p1.xaxis.ticker = EdgePositions
p1.xaxis.major_label_orientation = 3.14/4
p1.xaxis.major_label_overrides = dict((el, "{:,}".format(bin_edges[el])) for el in EdgePositions)

# Title styling
p1.title.text_font = "verdana"
p1.title.text_font_size = '12pt'
p1.title.text_color = "#0276FD"

# Same font set-up for both axes
for axis in (p1.xaxis, p1.yaxis):
    axis.axis_label_text_font = "verdana"
    axis.axis_label_text_font_style = "normal"
    axis.major_label_text_font = "verdana"

show(p1)

In [47]:
# Cumulative distribution of the city populations over the histogram bins.
# The y axis is labelled as a percentile, so the cumulative counts are expressed
# as a percentage (0-100) of all cities rather than as a 0-1 fraction.
# The running sum is computed once (not once per bin); 100.0 forces a float division.
EdgePositions = list(range(0, len(bin_edges)))
CumulativePercent = [0] + list(np.cumsum(hist) * 100.0 / NumberOfCity)

p2 = figure(plot_height=300, title="City population distribution",
            x_axis_label='City population bins', y_axis_label='Percentile [%ile]')
p2.step(EdgePositions, CumulativePercent, line_color="#0276FD", line_width=2)

# One tilted tick per bin edge, showing the edge value with thousands separators
p2.xaxis.ticker = EdgePositions
p2.xaxis.major_label_orientation = 3.14/4
p2.xaxis.major_label_overrides = {el: "{:,}".format(bin_edges[el]) for el in EdgePositions}

# Title styling
p2.title.text_font_size = '12pt'
p2.title.text_color = "#0276FD"
p2.title.text_font = "verdana"

# Same font set-up for both axes
for axis in (p2.xaxis, p2.yaxis):
    axis.axis_label_text_font = "verdana"
    axis.axis_label_text_font_style = "normal"
    axis.major_label_text_font = "verdana"

show(p2)

## 3.3. Area analysis

In [48]:
# Summary statistics of the city areas.
# Areas are stored in hectares, so they are divided by 100 to be printed in km2.
HectaresPerKm2 = 100.0

TotalArea   = df['Area'].sum()          # Total area
AverageArea = int(df['Area'].mean())    # Mean area (truncated to an integer number of hectares)
MedianArea  = int(df['Area'].median())  # Median area (truncated to an integer number of hectares)
MinArea     = df['Area'].min()          # Smallest area
MaxArea     = df['Area'].max()          # Largest area

# Name(s) of the cities holding the extreme values (one line per city if tied)
SmallestCity = df.loc[df['Area'] == MinArea, 'City name'].to_string(index=False)
LargestCity  = df.loc[df['Area'] == MaxArea, 'City name'].to_string(index=False)

# Unit suffix kept as text (unicode): concatenating str with the result of
# .encode('utf-8') only works on Python 2 and raises TypeError on Python 3.
Km2 = u' km\u00b2'

print('Surface totale: ' + "{:,}".format(TotalArea/HectaresPerKm2) + Km2)
print('Surface moyenne: ' + "{:,}".format(AverageArea/HectaresPerKm2) + Km2)
print('Surface mediane: ' + "{:,}".format(MedianArea/HectaresPerKm2) + Km2)
print(u'Surface la plus faible: ' + "{:,}".format(MinArea/HectaresPerKm2) + Km2 + u' (' + SmallestCity + u')')
print(u'Surface la plus élevée: ' + "{:,}".format(MaxArea/HectaresPerKm2) + Km2 + u' (' + LargestCity + u')')

Surface totale: 6,840.54 km²
Surface moyenne: 19.82 km²
Surface mediane: 15.81 km²
Surface la plus faible: 0.56 km² (Bécherel)
Surface la plus élevée: 110.66 km² (Paimpont)


### 3.3.1 Area histogram

In [53]:
# Build the histogram of the city areas.
# The bin limits come from the project helper module.
# NOTE(review): presumably DefineMinMaxLog widens the observed min/max to
# log-scale bounds and CreateLogBin returns the bin edges in between - confirm
# against Fcn_BasicStatisticalAnalysisOfPopulation.
BinTools = Fcn_BasicStatisticalAnalysisOfPopulation
start = BinTools.DefineMinMaxLog(MinArea, 'Min')
stop = BinTools.DefineMinMaxLog(MaxArea, 'Max')
bins = BinTools.CreateLogBin(start, stop)
hist, bin_edges = np.histogram(list(df['Area']), bins=bins)

# Drop the first and last values when their count is 0 - comment out this line
# to show the whole log scale
hist, bin_edges = BinTools.RemoveLastFirstValuesWhenZero(hist, bin_edges)

In [54]:
# Histogram of the city areas: one bar per bin, drawn on evenly spaced positions
# (0, 1, 2, ...) whose ticks are relabelled with the bin edges converted to km2.
EdgePositions = list(range(0, len(bin_edges)))

# Tick labels: bin edges are divided by 100 (hectares -> km2); values below
# 1 km2 keep their decimals, larger ones are shown as integers.
AreaTickLabels = {}
for el in EdgePositions:
    EdgeKm2 = bin_edges[el] / 100.0
    AreaTickLabels[el] = "{:,}".format(EdgeKm2) if EdgeKm2 < 1 else "{:,}".format(int(EdgeKm2))

# The axis label is a plain text (unicode) literal: concatenating str with the
# result of .encode('utf-8') only works on Python 2 (TypeError on Python 3).
p1 = figure(plot_height=300, title="City area",
            x_axis_label=u'City area bins [km\u00b2]', y_axis_label='Count [#]')
p1.quad(top=list(hist), bottom=0, left=EdgePositions[:-1], right=EdgePositions[1:],
        fill_color="#0276FD", line_color="#5F5B5B", alpha=1)

p1.xaxis.ticker = EdgePositions
p1.xaxis.major_label_orientation = 3.14/4
p1.xaxis.major_label_overrides = AreaTickLabels

# Title styling
p1.title.text_font_size = '12pt'
p1.title.text_color = "#0276FD"
p1.title.text_font = "verdana"

# Same font set-up for both axes
for axis in (p1.xaxis, p1.yaxis):
    axis.axis_label_text_font = "verdana"
    axis.axis_label_text_font_style = "normal"
    axis.major_label_text_font = "verdana"

show(p1)

### 3.3.2. Area cumulative distribution

In [55]:
# Cumulative distribution of the city areas over the histogram bins.
# The y axis is labelled as a percentile, so the cumulative counts are expressed
# as a percentage (0-100) of all cities rather than as a 0-1 fraction.
# The running sum is computed once (not once per bin); 100.0 forces a float division.
EdgePositions = list(range(0, len(bin_edges)))
CumulativePercent = [0] + list(np.cumsum(hist) * 100.0 / NumberOfCity)

# Tick labels: bin edges are divided by 100 (hectares -> km2); values below
# 1 km2 keep their decimals, larger ones are shown as integers.
AreaTickLabels = {}
for el in EdgePositions:
    EdgeKm2 = bin_edges[el] / 100.0
    AreaTickLabels[el] = "{:,}".format(EdgeKm2) if EdgeKm2 < 1 else "{:,}".format(int(EdgeKm2))

# The axis label is a plain text (unicode) literal: concatenating str with the
# result of .encode('utf-8') only works on Python 2 (TypeError on Python 3).
p2 = figure(plot_height=300, title="City area distribution",
            x_axis_label=u'City area bins [km\u00b2]', y_axis_label='Percentile [%ile]')
p2.step(EdgePositions, CumulativePercent, line_color="#0276FD", line_width=2)

p2.xaxis.ticker = EdgePositions
p2.xaxis.major_label_orientation = 3.14/4
p2.xaxis.major_label_overrides = AreaTickLabels

# Title styling
p2.title.text_font_size = '12pt'
p2.title.text_color = "#0276FD"
p2.title.text_font = "verdana"

# Same font set-up for both axes
for axis in (p2.xaxis, p2.yaxis):
    axis.axis_label_text_font = "verdana"
    axis.axis_label_text_font_style = "normal"
    axis.major_label_text_font = "verdana"

show(p2)

## 3.4. Population density analysis

In [68]:
# Summary statistics of the population density.
# The overall density divides the total population by the total area; the area
# sum is divided by 100 to express the result in pop/km2.
TotalDensity   = int(df['Population'].sum()/(df['Area'].sum()/100.0))  # Overall density [pop/km2]
AverageDensity = int(df['Density'].mean())    # Mean of the per-city densities (truncated)
MedianDensity  = int(df['Density'].median())  # Median of the per-city densities (truncated)
MinDensity     = df['Density'].min()          # Lowest density
MaxDensity     = df['Density'].max()          # Highest density

# Name(s) of the cities holding the extreme values (one line per city if tied)
LeastDenseCity = df.loc[df['Density'] == MinDensity, 'City name'].to_string(index=False)
MostDenseCity  = df.loc[df['Density'] == MaxDensity, 'City name'].to_string(index=False)

# Unit suffix kept as text (unicode): concatenating str with the result of
# .encode('utf-8') only works on Python 2 and raises TypeError on Python 3.
PopPerKm2 = u' pop/km\u00b2'

print('Density totale: ' + "{:,}".format(TotalDensity) + PopPerKm2)
print('Density moyenne: ' + "{:,}".format(AverageDensity) + PopPerKm2)
print('Density mediane: ' + "{:,}".format(MedianDensity) + PopPerKm2)
print(u'Density la plus faible: ' + "{:,}".format(int(MinDensity)) + PopPerKm2 + u' (' + LeastDenseCity + u')')
print(u'Density la plus élevée: ' + "{:,}".format(int(MaxDensity)) + PopPerKm2 + u' (' + MostDenseCity + u')')

Density totale: 149 pop/km²
Density moyenne: 155 pop/km²
Density mediane: 74 pop/km²
Density la plus faible: 14 pop/km² (Paimpont)
Density la plus élevée: 4,195 pop/km² (Rennes)


### 3.4.1 Population density histogram

In [72]:
# Build the histogram of the city population densities.
# The bin limits come from the project helper module.
# NOTE(review): presumably DefineMinMaxLog widens the observed min/max to
# log-scale bounds and CreateLogBin returns the bin edges in between - confirm
# against Fcn_BasicStatisticalAnalysisOfPopulation.
BinTools = Fcn_BasicStatisticalAnalysisOfPopulation
start = BinTools.DefineMinMaxLog(MinDensity, 'Min')
stop = BinTools.DefineMinMaxLog(MaxDensity, 'Max')
bins = BinTools.CreateLogBin(start, stop)
hist, bin_edges = np.histogram(list(df['Density']), bins=bins)

# Drop the first and last values when their count is 0 - comment out this line
# to show the whole log scale
hist, bin_edges = BinTools.RemoveLastFirstValuesWhenZero(hist, bin_edges)

In [73]:
# Histogram of the city population densities: one bar per bin, drawn on evenly
# spaced positions (0, 1, 2, ...) whose ticks are relabelled with the bin edges.
EdgePositions = list(range(0, len(bin_edges)))

# Tick labels: edges below 1 keep their decimals, larger ones are shown as integers
DensityTickLabels = {}
for el in EdgePositions:
    Edge = bin_edges[el]
    DensityTickLabels[el] = "{:,}".format(Edge) if Edge < 1 else "{:,}".format(int(Edge))

# The axis label is a plain text (unicode) literal: concatenating str with the
# result of .encode('utf-8') only works on Python 2 (TypeError on Python 3).
p1 = figure(plot_height=300, title="City population density",
            x_axis_label=u'City population density bins [pop/km\u00b2]', y_axis_label='Count [#]')
p1.quad(top=list(hist), bottom=0, left=EdgePositions[:-1], right=EdgePositions[1:],
        fill_color="#0276FD", line_color="#5F5B5B", alpha=1)

p1.xaxis.ticker = EdgePositions
p1.xaxis.major_label_orientation = 3.14/4
p1.xaxis.major_label_overrides = DensityTickLabels

# Title styling
p1.title.text_font_size = '12pt'
p1.title.text_color = "#0276FD"
p1.title.text_font = "verdana"

# Same font set-up for both axes
for axis in (p1.xaxis, p1.yaxis):
    axis.axis_label_text_font = "verdana"
    axis.axis_label_text_font_style = "normal"
    axis.major_label_text_font = "verdana"

show(p1)

### 3.4.2 Population density cumulative distribution

In [74]:
# Cumulative distribution of the city population densities over the histogram bins.
# The y axis is labelled as a percentile, so the cumulative counts are expressed
# as a percentage (0-100) of all cities rather than as a 0-1 fraction.
# The running sum is computed once (not once per bin); 100.0 forces a float division.
EdgePositions = list(range(0, len(bin_edges)))
CumulativePercent = [0] + list(np.cumsum(hist) * 100.0 / NumberOfCity)

# Tick labels: edges below 1 keep their decimals, larger ones are shown as integers
DensityTickLabels = {}
for el in EdgePositions:
    Edge = bin_edges[el]
    DensityTickLabels[el] = "{:,}".format(Edge) if Edge < 1 else "{:,}".format(int(Edge))

# The axis label is a plain text (unicode) literal: concatenating str with the
# result of .encode('utf-8') only works on Python 2 (TypeError on Python 3).
p2 = figure(plot_height=300, title="City population density distribution",
            x_axis_label=u'City population density bins [pop/km\u00b2]', y_axis_label='Percentile [%ile]')
p2.step(EdgePositions, CumulativePercent, line_color="#0276FD", line_width=2)

p2.xaxis.ticker = EdgePositions
p2.xaxis.major_label_orientation = 3.14/4
p2.xaxis.major_label_overrides = DensityTickLabels

# Title styling
p2.title.text_font_size = '12pt'
p2.title.text_color = "#0276FD"
p2.title.text_font = "verdana"

# Same font set-up for both axes
for axis in (p2.xaxis, p2.yaxis):
    axis.axis_label_text_font = "verdana"
    axis.axis_label_text_font_style = "normal"
    axis.major_label_text_font = "verdana"

show(p2)

## 3.5 Population spatial analysis

### 3.5.1. Prepare the data

In [110]:
# Attach the coordinates of each city centre: index 1 of the 'coordinates' pair
# is stored as the latitude and index 0 as the longitude.
df['Latitude'] = [City['centre']['coordinates'][1] for City in QueryMongo]
df['Longitude'] = [City['centre']['coordinates'][0] for City in QueryMongo]

# Reference city from which every distance is measured
CityNameDistance = 'Rennes'
ReferenceCity = df.loc[df['City name'] == CityNameDistance]
if ReferenceCity.empty:
    raise ValueError("City '" + CityNameDistance + "' was not found in the 'City name' column")

# Look the reference coordinates up once instead of once per row
ReferenceLatitude = float(ReferenceCity['Latitude'].iloc[0])
ReferenceLongitude = float(ReferenceCity['Longitude'].iloc[0])

# Distance of every city to the reference city, computed by the project helper
# (argument order: reference latitude, reference longitude, city latitude, city longitude).
# NOTE(review): the result is presumably in km, as the helper name suggests - confirm.
df['DistanceTo' + CityNameDistance] = [
    Fcn_BasicStatisticalAnalysisOfPopulation.DistanceCoordToKm(ReferenceLatitude, ReferenceLongitude, lat, lon)
    for lat, lon in zip(df['Latitude'], df['Longitude'])
]
df.head()

Unnamed: 0,Area,City name,Density,Population,Latitude,Longitude,DistanceToRennes,DistanceToSaint-Malo
0,3020,Acigné,208.013245,6282,48.140329,-1.516048,12.612759,67.290686
1,2556,Amanlis,63.223787,1616,47.993798,-1.4958,18.900247,82.226049
2,1266,Andouillé-Neuville,66.113744,837,48.298632,-1.591675,21.88349,49.618939
3,942,Antrain,145.010616,1366,48.461888,-1.478346,41.789843,44.005262
4,469,Arbrissel,61.833689,290,47.925517,-1.291799,35.483827,96.394868


### 3.5.2. Scatter plot - Population density vs distance to most populated city

In [134]:
# Keep only the cities located within a given distance of the reference city, e.g. 30 km
MaximumDistanceToCity = 30 # km - Maximum distance to be considered from the 'CityNameDistance'

In [135]:
# Scatter plot of the population density against the distance to the reference city.
# The distance column name is derived from CityNameDistance so that the filter
# and the plotted column always refer to the same city.
DistanceColumn = 'DistanceTo' + CityNameDistance

# Cities within the maximum distance, excluding distance 0 (the reference city itself);
# the selection is computed once and reused for both axes.
NearbyCities = df[(df[DistanceColumn] <= MaximumDistanceToCity) & (df[DistanceColumn] > 0)]

p = figure(plot_height=300, title="City density population vs distance to most populated city",
           x_axis_label='Distance to the most populated city [km]', y_axis_label='Population density')
p.circle(NearbyCities[DistanceColumn], NearbyCities['Density'], color="#0276FD")

# Title styling
p.title.text_font_size = '12pt'
p.title.text_color = "#0276FD"
p.title.text_font = "verdana"

# Same font set-up for both axes
for axis in (p.xaxis, p.yaxis):
    axis.axis_label_text_font = "verdana"
    axis.axis_label_text_font_style = "normal"
    axis.major_label_text_font = "verdana"

show(p)