# EP7: update_data

In [1]:
import pandas as pd
import os
import requests
import shutil

## 1. Download new data

In [53]:
def download_file(input_url, output_file):
    """
    Donwload a file from an url and save it under the specified path and name.
    Args:
    - input_url: url of the file to download
    - output_file: path and name of the file to save.
    """
    response = requests.get(input_url)
    if response.status_code == 200:
        # Process the response content as needed
        content = response.text
        text_file = open(output_file, "wb")
        text_file.write(content.encode("utf-8"))  # to be check...
        text_file.close()
        print(f"{output_file} loaded")
    else:
        print(f"Error accessing the object {input_url}:", response.status_code)

In [54]:
output_path = "../data/raw"
http_url = "https://www.data.gouv.fr/fr/datasets/r/"
year_list = [2019, 2020]


In [55]:
# download list of ressources from gouv.fr
output_file = os.path.join(output_path, "ressources.csv")
download_file("https://www.data.gouv.fr/resources.csv", output_file)

../data/raw\ressources.csv loaded


In [56]:
# download data files according to the year list
file_list_template = ["caracteristiques", "lieux","usagers", "vehicules"]
data_files_list = [f'{item}-{year}.csv' for item in file_list_template for year in year_list]
data_files_list

['caracteristiques-2019.csv',
 'caracteristiques-2020.csv',
 'lieux-2019.csv',
 'lieux-2020.csv',
 'usagers-2019.csv',
 'usagers-2020.csv',
 'vehicules-2019.csv',
 'vehicules-2020.csv']

In [57]:
with open(output_file, "r", encoding="utf-8") as my_file:
    contents = my_file.readline()
    while contents:
        for filename in data_files_list:
            if filename in contents:
                 # 9 = ressource id
                input_url = http_url + contents.split(";")[9][1:-1]
                output_data_file = os.path.join(output_path, filename)
                download_file(input_url, output_data_file)
                break
        contents = my_file.readline()

../data/raw\usagers-2020.csv loaded
../data/raw\vehicules-2020.csv loaded
../data/raw\lieux-2020.csv loaded
../data/raw\caracteristiques-2020.csv loaded
../data/raw\usagers-2019.csv loaded
../data/raw\vehicules-2019.csv loaded
../data/raw\lieux-2019.csv loaded
../data/raw\caracteristiques-2019.csv loaded


## 2. Concaténation des données

In [61]:
# Remove resources.csv:
os.remove(output_file)


In [62]:
def create_folder_if_necessary(path):
            os.makedirs(path, exist_ok=True)
            return path

In [63]:
root_path = "../"
interim_path = create_folder_if_necessary(os.path.join(root_path, "data", "interim"))
raw_path = create_folder_if_necessary(os.path.join(root_path, "data", "raw"))

In [64]:
# Get existing years in data/raw files:
year_list = []
for filename in os.listdir("../data/raw"):
    year = filename[-8:-4]
    if year not in year_list:
        year_list.append(year)
print(year_list)


['2019', '2020', '2021']


In [2]:
# Test jupyter
df1 = pd.read_csv("../data/interim/caracteristiques.csv", sep = ";")
df2 = pd.read_csv("../data/interim/lieux.csv", sep = ";")
df3 = pd.read_csv("../data/interim/usagers.csv", sep = ";")
df4 = pd.read_csv("../data/interim/vehicules.csv", sep = ";")

print("Caracteristiques:", df1.shape)
print("Lieux:", df2.shape)
print("Usagers:", df3.shape)
print("Vehicules:", df4.shape)

  df2 = pd.read_csv("../data/interim/lieux.csv", sep = ";")


Caracteristiques: (163102, 15)
Lieux: (163102, 18)
Usagers: (367425, 15)
Vehicules: (279091, 11)


In [3]:
df = pd.read_csv("../data/interim/lieux.csv", sep = ";")

  df = pd.read_csv("../data/interim/lieux.csv", sep = ";")


In [None]:
# Copy all downloaded files in data/raw_merged: useless?
# for file_template in file_list_template:
    # for year in year_list:
        # src = os.path.join(raw_path, f"{file_template}-{year}.csv")
        # dest = os.path.join(interim_path, f"{file_template}-{year}.csv")
        # shutil.copyfile(src, dest)

In [72]:
# Merge files:
for file_template in file_list_template:
    output_filename = os.path.join(interim_path, f"{file_template}.csv")
    with open(output_filename, "w") as merged_file:
        for index, year in enumerate(year_list):
            input_filename = os.path.join(raw_path, f"{file_template}-{year}.csv")
            with open(input_filename, "r") as file:
                if index != 0:
                    file.readline()  # Throw away header on all but first file
                merged_file.write(file.read())

In [4]:
# Test jupyter
df1 = pd.read_csv("../data/interim/caracteristiques.csv", sep = ";")
df2 = pd.read_csv("../data/interim/lieux.csv", sep = ";", low_memory=False)
df3 = pd.read_csv("../data/interim/usagers.csv", sep = ";")
df4 = pd.read_csv("../data/interim/vehicules.csv", sep = ";")

print("Caracteristiques:", df1.shape)
print("Lieux:", df2.shape)
print("Usagers:", df3.shape)
print("Vehicules:", df4.shape)

Caracteristiques: (163102, 15)
Lieux: (163102, 18)
Usagers: (367425, 15)
Vehicules: (279091, 11)


In [5]:
# le merge a bien eu lieu.

## 3. Preprocessing des nouvelles données:

In [50]:
input_filepath_users = os.path.join(interim_path, "usagers.csv")
input_filepath_caract = os.path.join(interim_path, "caracteristiques.csv")
input_filepath_places = os.path.join(interim_path, "lieux.csv")
input_filepath_veh = os.path.join(interim_path, "vehicules.csv")

In [11]:
import sys
root_path = "../"
sys.path.append(os.path.join(root_path, "src", "data"))
from make_dataset import main


In [13]:

main(input_filepath="../data/interim",
     output_filepath="../data/preprocessed")

  df_places = pd.read_csv(input_filepath_places, sep=";", encoding='utf-8')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_users.grav.replace([1, 2, 3, 4], [1, 3, 4, 2], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_veh['catv'].replace(catv_value, catv_value_new, inplace=True)
The behavior will change in