In [1]:
import datetime
import os

import pandas as pd
from sqlalchemy import create_engine

# Importing data

In [2]:
# Load the school-calendar export; the file uses ";" as its field separator.
school_holidays = pd.read_csv(
    "fr-en-calendrier-scolaire.csv",
    delimiter=";",
)
# Preview the first rows of the raw data.
school_holidays.head(n=5)

Unnamed: 0,description,population,start_date,end_date,location,zones,annee_scolaire
0,Vacances d'Été,Élèves,2010-07-08T00:00:00+02:00,2010-09-09T00:00:00+02:00,Corse,Corse,2009-2010
1,Vacances de Printemps,-,2011-04-18T00:00:00+02:00,2011-05-02T00:00:00+02:00,Corse,Corse,2010-2011
2,Vacances de Noël,-,2011-12-19T00:00:00+01:00,2012-01-03T00:00:00+01:00,Corse,Corse,2011-2012
3,Vacances d'Hiver,-,2013-02-18T00:00:00+01:00,2013-03-04T00:00:00+01:00,Corse,Corse,2012-2013
4,Vacances d'Hiver,-,2014-02-24T00:00:00+01:00,2014-03-10T00:00:00+01:00,Corse,Corse,2013-2014


# Cleaning data

#### Removing unwanted "population" values 
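We keep only the rows that apply to students in metropolitan France: teacher-only entries and overseas-specific populations are dropped, while the generic `-` value is kept.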

In [3]:
# Distinct "population" labels, ordered from most to least frequent.
school_holidays["population"].value_counts().index.tolist()

['-',
 'Enseignants',
 'Élèves',
 'Élèves du premier degré',
 'Enseignants du premier degré',
 'Élèves du second degré',
 'Guadeloupe & Saint-Martin',
 'Saint-Barthélémy',
 'Saint-Martin',
 'Premier degré et collèges',
 'Enseignants du second degré',
 'Guadeloupe sauf Saint-Martin',
 'Élèves des lycées',
 'Enseignants des lycées',
 'Enseignants des collèges',
 'Élèves des collèges',
 'Lycées',
 'Premier degré',
 'Collèges']

In [4]:
# Population labels to discard: teacher-only entries and the
# Guadeloupe / Saint-Martin / Saint-Barthélémy specific groups.
pop_to_drop = [
    'Enseignants',
    'Enseignants du premier degré',
    'Guadeloupe & Saint-Martin',
    'Saint-Barthélémy',
    'Saint-Martin',
    'Enseignants du second degré',
    'Guadeloupe sauf Saint-Martin',
    'Enseignants des lycées',
    'Enseignants des collèges',
]
# Keep every row whose population is NOT in the discard list.
school_holidays = school_holidays.loc[
    ~school_holidays["population"].isin(pop_to_drop)
]
# Show what remains, per population label.
school_holidays.value_counts("population")

population
-                            1427
Élèves                        223
Élèves du premier degré         6
Élèves du second degré          4
Premier degré et collèges       3
Élèves des collèges             2
Élèves des lycées               2
Collèges                        1
Lycées                          1
Premier degré                   1
Name: count, dtype: int64

#### Removing unwanted zones (we keep only zones A, B and C)

In [5]:
# Number of rows per zone, before filtering on zone.
school_holidays["zones"].value_counts()

zones
Zone B                      596
Zone A                      424
Zone C                      265
Corse                        72
Polynésie                    51
Réunion                      45
Martinique                   36
Guyane                       35
Saint Pierre et Miquelon     34
Mayotte                      31
Guadeloupe                   31
Wallis et Futuna             25
Nouvelle Calédonie           25
Name: count, dtype: int64

In [6]:
# Keep only zones A, B and C.
zones_to_keep = ['Zone A', 'Zone B', 'Zone C']
# `.copy()` makes the filtered result an independent frame, so the later
# column drops / assignments do not act on a slice of the unfiltered data
# (which can raise SettingWithCopyWarning on pandas versions without
# copy-on-write).
school_holidays = school_holidays[school_holidays["zones"].isin(zones_to_keep)].copy()

#### Removing location and population information, then dropping duplicate rows

In [7]:
# Number of rows per location (académie) within the remaining zones.
school_holidays["location"].value_counts()

location
Limoges             53
Paris               53
Bordeaux            53
Besançon            53
Versailles          53
Toulouse            53
Nancy-Metz          53
Créteil             53
Montpellier         53
Grenoble            53
Poitiers            53
Dijon               53
Amiens              53
Clermont-Ferrand    53
Rennes              53
Nantes              53
Lyon                53
Strasbourg          53
Orléans-Tours       53
Nice                53
Lille               53
Aix-Marseille       53
Reims               53
Normandie           40
Caen                13
Rouen               13
Name: count, dtype: int64

In [8]:
# Drop the location and population columns, then collapse the rows that have
# become exact duplicates. The result is reassigned instead of using
# `inplace=True`, so the frame produced by the earlier filters is not mutated
# in place.
school_holidays = (
    school_holidays
    .drop(columns=["location", "population"])
    .drop_duplicates()
)

### Formatting dates to keep only the day

In [9]:
# The raw values are ISO-8601 strings with a UTC offset, e.g.
# "2010-07-08T00:00:00+02:00". Only the first 10 characters (YYYY-MM-DD) are
# parsed, which keeps the calendar day as written and discards the time/offset.
# `pd.to_datetime` parses the whole column at once and turns missing values
# into NaT instead of raising, unlike a per-row `strptime`.
school_holidays = school_holidays.assign(
    start_date=pd.to_datetime(school_holidays["start_date"].str[:10], format="%Y-%m-%d"),
    end_date=pd.to_datetime(school_holidays["end_date"].str[:10], format="%Y-%m-%d"),
).sort_values(by=["start_date"])  # chronological order by holiday start

### Renaming columns

['description', 'start_date', 'end_date', 'zones', 'annee_scolaire'] -> ['description', 'start_date', 'end_date', 'zone', 'school_year']

In [13]:
# Rename by column name rather than by position: assigning a list to
# `.columns` would silently mislabel the data if the column order ever
# changed. Only the two columns whose names actually differ are mapped;
# re-running the cell is a no-op.
school_holidays = school_holidays.rename(
    columns={"zones": "zone", "annee_scolaire": "school_year"}
)

In [11]:
# Preview the cleaned, renamed frame.
school_holidays.head(n=5)

Unnamed: 0,description,start_date,end_date,zone,school_year
8,Vacances de la Toussaint,2017-10-21,2017-11-06,Zone A,2017-2018
380,Vacances de la Toussaint,2017-10-21,2017-11-06,Zone C,2017-2018
10,Vacances de la Toussaint,2017-10-21,2017-11-06,Zone B,2017-2018
18,Vacances de Noël,2017-12-23,2018-01-08,Zone A,2017-2018
20,Vacances de Noël,2017-12-23,2018-01-08,Zone B,2017-2018


# Saving data

In [12]:
# Read the connection URL from the INCIDENTS_RATP_DB_URL environment variable
# so real credentials do not have to be written in the notebook. The fallback
# is the local development database and should not be used for anything else.
engine = create_engine(
    os.environ.get(
        "INCIDENTS_RATP_DB_URL",
        "postgresql://postgres:postgres@localhost:5432/Incidents_RATP",
    )
)
# Replace the `school_holidays` table wholesale; the DataFrame index is not
# written as a column.
school_holidays.to_sql('school_holidays', engine, if_exists='replace', index=False)

159