### Install Faker to create FakeData

In [1]:
pip install faker

Note: you may need to restart the kernel to use updated packages.


In [2]:
# On précise que l'on veut des données françaises
from faker import Faker
fake = Faker("fr_FR")

In [3]:
# On veut créer de fausse date avec le bon format

In [4]:
import datetime
import random

### Test datetime generator

In [5]:
fake.date_time_between(start_date=datetime.date(2020, 1,1), end_date=datetime.date(2022, 1,1)).strftime("%Y%m%d%H%M%S")

'20200520171234'

### Test email generator

# Making evenements table data

## Making SI parc data 

In [345]:
import csv
import pandas as pd
from faker import Faker
import datetime
import random
from faker.providers import DynamicProvider

constructeur_id_provider = DynamicProvider(
     provider_name="constructeur_id",
     elements=[1,2,3,4,5],
)

def fake_compteurs_generation(records):
    fake = Faker('fr_FR')
    
    fake.add_provider(constructeur_id_provider)
    
    compteurs = []
    for i in range(records):
        compteurs.append({
            "id_compteur": i,
            "constructeur_id": fake.constructeur_id()
                })
        
    return compteurs


compteurs_df = pd.DataFrame(fake_compteurs_generation(20))

In [9]:
compteurs_df

Unnamed: 0,id_compteur,constructeur_id
0,0,2
1,1,1
2,2,2
3,3,5
4,4,4
5,5,1
6,6,3
7,7,1
8,8,3
9,9,2


In [10]:
compteurs_df.to_csv("./SI_Park_Data/compteurs.csv", index=False)

# Making concentrateur data

Use the same generator as compteur

In [11]:
concentrateur_df = pd.DataFrame(fake_compteurs_generation(4))
concentrateur_df.to_csv("./SI_Park_Data/concentrateurs.csv", index=False)

## Making commune data

In [12]:
communes = pd.read_csv("./correspondance-code-insee-code-postal.csv", sep=";")

In [13]:
communes.head()

Unnamed: 0,Code INSEE,Code Postal,Commune,Département,Région,Statut,Altitude Moyenne,Superficie,Population,geo_point_2d,geo_shape,ID Geofla,Code Commune,Code Canton,Code Arrondissement,Code Département,Code Région
0,59416,59190,MORBECQUE,NORD,NORD-PAS-DE-CALAIS,Commune simple,23.0,4455.0,2.7,"50.67689342861573,2.536216144331492","{""coordinates"": [[[2.501239302134784, 50.63986...",1237,416,30,4,59,31
1,22102,22330,LANGOURLA,COTES-D'ARMOR,BRETAGNE,Commune simple,170.0,2165.0,0.6,"48.284641107667674,-2.415501011324659","{""coordinates"": [[[-2.416298136623701, 48.2530...",10183,102,9,1,22,53
2,31225,31310,GOUTEVERNISSE,HAUTE-GARONNE,MIDI-PYRENEES,Commune simple,264.0,485.0,0.2,"43.214026301449536,1.173765920286677","{""coordinates"": [[[1.165580460427912, 43.20037...",20555,225,27,1,31,73
3,23025,23220,BONNAT,CREUSE,LIMOUSIN,Chef-lieu canton,355.0,4554.0,1.3,"46.32301933418213,1.913450631364578","{""coordinates"": [[[1.952331841746855, 46.28624...",3586,25,6,2,23,74
4,38522,38740,VALJOUFFREY,ISERE,RHONE-ALPES,Commune simple,2009.0,12644.0,0.1,"44.88153491127053,6.07950399615059","{""coordinates"": [[[6.056489736635054, 44.81581...",1082,522,36,1,38,82


In [14]:
regions = communes.Région.unique()
regions_df = pd.DataFrame(regions, columns=["libelle_region"])
regions_df['index_col'] = regions_df.index
regions_df = pd.DataFrame(regions_df,columns=['index_col','libelle_region'])
regions_df = regions_df.rename(columns={"index_col":"id_region"})

In [15]:
regions_df

Unnamed: 0,id_region,libelle_region
0,0,NORD-PAS-DE-CALAIS
1,1,BRETAGNE
2,2,MIDI-PYRENEES
3,3,LIMOUSIN
4,4,RHONE-ALPES
5,5,BOURGOGNE
6,6,PICARDIE
7,7,LANGUEDOC-ROUSSILLON
8,8,CENTRE
9,9,CHAMPAGNE-ARDENNE


In [16]:
regions_df.to_csv("./SI_Park_Data/region.csv", index=False)

In [17]:
join_regions = regions_df.rename(columns={"libelle_region":"Région"})
join_regions["Région"] = join_regions["Région"].astype(str)

In [18]:
departements=pd.DataFrame(communes,columns=['Code Département','Département', 'Région'])
departements["Région"] = departements["Région"].astype(str)

In [19]:
df3=departements.set_index('Région').join(join_regions.set_index('Région'), how='left')

In [20]:
df3.reset_index(inplace=True)

In [21]:
df3

Unnamed: 0,Région,Code Département,Département,id_region
0,ALSACE,68,HAUT-RHIN,12
1,ALSACE,67,BAS-RHIN,12
2,ALSACE,67,BAS-RHIN,12
3,ALSACE,68,HAUT-RHIN,12
4,ALSACE,68,HAUT-RHIN,12
...,...,...,...,...
36737,RHONE-ALPES,73,SAVOIE,4
36738,RHONE-ALPES,38,ISERE,4
36739,RHONE-ALPES,26,DROME,4
36740,RHONE-ALPES,42,LOIRE,4


In [22]:
dep_final_df = pd.DataFrame(df3, columns=["Code Département","id_region","Département"])\
.rename(columns={"Code Département":"id_departement",
                "Département":"libelle_departement"})

In [23]:
## unique values
dep_final_df.drop_duplicates()

Unnamed: 0,id_departement,id_region,libelle_departement
0,68,12,HAUT-RHIN
1,67,12,BAS-RHIN
904,47,19,LOT-ET-GARONNE
905,24,19,DORDOGNE
906,64,19,PYRENEES-ATLANTIQUES
...,...,...,...
33858,42,4,LOIRE
33860,01,4,AIN
33861,73,4,SAVOIE
33862,74,4,HAUTE-SAVOIE


In [24]:
dep_final_df.drop_duplicates().to_csv("./SI_Park_Data/departements.csv", index=False)

# making communes

In [25]:
communes_df = pd.DataFrame(communes, columns=["Code Commune","Code INSEE", "Commune", "Département", "Code Département"])

In [26]:
communes_df = communes_df.rename(columns={"Commune":"libelle_commune",
                                          "Code Commune":"id_commune",
                "Code Département":"id_departement",})

In [27]:
communes_df

Unnamed: 0,id_commune,Code INSEE,libelle_commune,Département,id_departement
0,416,59416,MORBECQUE,NORD,59
1,102,22102,LANGOURLA,COTES-D'ARMOR,22
2,225,31225,GOUTEVERNISSE,HAUTE-GARONNE,31
3,25,23025,BONNAT,CREUSE,23
4,522,38522,VALJOUFFREY,ISERE,38
...,...,...,...,...,...
36737,137,84137,VAISON-LA-ROMAINE,VAUCLUSE,84
36738,248,41248,SOUDAY,LOIR-ET-CHER,41
36739,194,21194,CORGOLOIN,COTE-D'OR,21
36740,605,60605,SARNOIS,OISE,60


In [28]:
df4=communes_df.set_index('id_departement').join(dep_final_df.set_index('id_departement'), how='left')

In [29]:
df4.reset_index(inplace=True)

In [30]:
df4

Unnamed: 0,id_departement,id_commune,Code INSEE,libelle_commune,Département,id_region,libelle_departement
0,01,26,01026,BAGE-LE-CHATEL,AIN,4,AIN
1,01,26,01026,BAGE-LE-CHATEL,AIN,4,AIN
2,01,26,01026,BAGE-LE-CHATEL,AIN,4,AIN
3,01,26,01026,BAGE-LE-CHATEL,AIN,4,AIN
4,01,26,01026,BAGE-LE-CHATEL,AIN,4,AIN
...,...,...,...,...,...,...,...
17287011,97,407,97407,LE PORT,REUNION,26,REUNION
17287012,97,407,97407,LE PORT,REUNION,26,REUNION
17287013,97,407,97407,LE PORT,REUNION,26,REUNION
17287014,97,407,97407,LE PORT,REUNION,26,REUNION


In [34]:
df_communes_filan = pd.DataFrame(df4, columns=["id_commune","id_departement","id_region","Code INSEE", "libelle_commune"])

In [35]:
df_communes_filan = df_communes_filan.drop_duplicates(["id_commune","Code INSEE", "libelle_commune", "id_departement"])

In [37]:
df_communes_filan.reset_index()

Unnamed: 0,index,id_commune,id_departement,id_region,Code INSEE,libelle_commune
0,0,26,01,4,01026,BAGE-LE-CHATEL
1,419,27,01,4,01027,BALAN
2,838,8,01,4,01008,AMBUTRIX
3,1257,452,01,4,01452,VIRIEU-LE-GRAND
4,1676,29,01,4,01029,BEAUPONT
...,...,...,...,...,...,...
36737,17286371,608,97,23,97608,DZAOUDZI
36738,17286500,209,97,23,97209,FORT-DE-FRANCE
36739,17286629,403,97,23,97403,ENTRE-DEUX
36740,17286758,416,97,23,97416,SAINT-PIERRE


In [38]:
df_communes_filan.to_csv("./SI_Park_Data/communes.csv", index=False)

### Create fake event data

In [407]:
from datetime import datetime, timedelta

In [470]:
event_type = ["ev_C", "ev_K", "ev_N"]
equipement_type = ["C","K"]

import random
def fake_evenements_generation(records, date):
    fake = Faker('fr_FR')
    events = []
    init_date = date
    for i in range(records):
        type_evt = random.choices(event_type, cum_weights=[30,38,41], k=1)[0]
        type_equi = "K" if type_evt[-1]!="C"   else "C"
        id_equi = random.randint(1, 20) if type_equi == "C" else random.randint(1, 5)
        #remplacement des capteurs 16 à 20 par les capteurs 21 à 25 le 01/01/2020
        id_equi = random.randint(21,25) if (date > datetime(2020,1,1) and id_equi>15 and id_equi<21) else id_equi       
        events.append({
            "id_evt": date.strftime("%Y%m%d%H%M%S") ,
            "date_occur_evt": date,
            "date_system_evt": init_date + timedelta(hours = 12) if date<init_date + timedelta(hours = 12) else init_date + timedelta(hours = 23, minutes=59, seconds=59),
            "type_evt": type_evt,
            "type_equipement": type_equi,
            "id_equipement": id_equi,
            "info_div":None
                })
        # 1 capteur parmis l'ensemble des capteurs emet une ligne et cela toutes les 10 secondes
        date = date+timedelta(seconds = 10)
        
    return events


event_df = pd.DataFrame(fake_evenements_generation(8640, datetime(2020,3,1)))

In [487]:
# for i in range(1462):
#     date = datetime(2017,1,1)+timedelta(days=i)
#     pd.DataFrame(fake_evenements_generation(8640, date)).to_csv(f"./events/event_{date.strftime('%Y%m%d')}.csv", index=False, )