In [None]:
# %pip installs into the running kernel's environment; the version is pinned to the release this notebook was last run with.
%pip install faker==37.1.0

Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.1.0


In [None]:
import pandas as pd
import numpy as np
import random
from faker import Faker

In [None]:
# Mount Google Drive at /content/drive; the CSV exports in this notebook are written beneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Initial Data Generation

In [None]:
# Seed every random source used by this cell (Faker, the `random` module and
# NumPy's global state, which pandas' DataFrame.sample falls back to) so that
# re-running the cell reproduces the same dataset.
fake = Faker()
Faker.seed(42)
random.seed(42)
np.random.seed(42)

NUM_PEOPLE = 50
NUM_MISSIONS = 150

# Administrative site (full address) -> city of that site.
admin_sites = {
    "Université de Franche-Comté, Besançon": "Besançon",
    "Université de Franche-Comté, Montbéliard": "Montbéliard",
    "Université de Franche-Comté, Belfort": "Belfort"
}

# Candidate mission cities. Kept as an ordered list instead of list(set(...)):
# the iteration order of a set of strings changes between interpreter sessions
# (string hash randomization), which made random.choice pick different cities
# even with a fixed seed. The entries are already unique.
cities = [
    "Besançon", "Montbéliard", "Belfort", "Lyon", "Paris",
    "Dijon", "Strasbourg", "Marseille", "Nice"
]

car_types = ["electric", "diesel", "hybrid", "gasoline"]
events = ["Conference", "Teaching", "Training", "Meeting"]
travel_types = ["car", "train", "plane", "carpool"]
train_types = ["TER", "TGV"]


def generate_person_data(num_people=50):
    """Build a DataFrame describing `num_people` synthetic people.

    Every row carries a sequential `person_id` (starting at 1), a fake name and
    home address, a randomly chosen administrative site together with its city,
    and a `has_car` flag. `car_type` and `fiscal_hp` are only filled in for
    people who have a car; otherwise they are None.
    """
    site_names = list(admin_sites.keys())
    records = []
    for person_id in range(1, num_people + 1):
        site = random.choice(site_names)
        owns_car = random.choice([True, False])
        record = {
            "person_id": person_id,
            "first_name": fake.first_name(),
            "last_name": fake.last_name(),
            # Faker returns multi-line addresses; flatten them to a single line.
            "home_address": fake.address().replace("\n", ", "),
            "admin_address": site,
            "admin_city": admin_sites[site],
            "has_car": owns_car,
            "car_type": None,
            "fiscal_hp": None,
        }
        if owns_car:
            record["car_type"] = random.choice(car_types)
            record["fiscal_hp"] = random.randint(4, 10)
        records.append(record)
    return pd.DataFrame(records)


def generate_mission_data(person_df, num_missions=150):
    """Build a DataFrame of `num_missions` synthetic missions.

    Args:
        person_df: people table; each mission samples one row from it and reads
            its `person_id` and `admin_city` columns.
        num_missions: number of missions (rows) to generate.

    Returns:
        DataFrame with one row per mission: ids, start/end city and date,
        event, travel and vehicle type, distance in `km`, the individual cost
        components and their sum in `total_cost`.
    """
    missions = []
    for i in range(1, num_missions + 1):
        person = person_df.sample(1).iloc[0]
        # A mission starts from the traveller's administrative city and ends
        # in any other known city.
        start_city = person["admin_city"]
        end_city = random.choice([c for c in cities if c != start_city])
        start_date = fake.date_between(start_date='-1y', end_date='today')
        duration = random.choice([0, 1, 2])
        end_date = start_date + pd.Timedelta(days=duration)
        travel_type = random.choice(travel_types)

        # The vehicle type depends on the travel type: a train category, a car
        # fuel type, the generic carpool "average", or "N/A" (plane).
        if travel_type == "train":
            vehicle_type = random.choice(train_types)
        elif travel_type == "car":
            vehicle_type = random.choice(car_types)
        elif travel_type == "carpool":
            vehicle_type = "average"
        else:
            vehicle_type = "N/A"

        is_return_trip = random.choice([True, False])
        km = random.randint(50, 900)
        parking_cost = round(random.uniform(0, 20), 2) if travel_type in ["car", "carpool"] else 0
        hotel_cost = round(random.uniform(30, 120), 2)
        plane_cost = round(random.uniform(80, 180), 2) if travel_type == "plane" else 0
        reimbursement = round(random.uniform(50, 180), 2)
        # Round the sum as well: adding two-decimal floats can leave binary
        # noise such as 238.07999999999998 in the exported CSV.
        total_cost = round(parking_cost + hotel_cost + plane_cost + reimbursement, 2)

        missions.append({
            "mission_id": i,
            "person_id": person["person_id"],
            "start_city": start_city,
            "end_city": end_city,
            "start_date": start_date,
            "end_date": end_date,
            "event": random.choice(events),
            "real_move": random.choice([True, False]),
            "travel_type": travel_type,
            "vehicle_type": vehicle_type,
            "is_return_trip": is_return_trip,
            "km": km,
            "parking_cost": parking_cost,
            "hotel_cost": hotel_cost,
            "plane_cost": plane_cost,
            "reimbursement": reimbursement,
            "total_cost": total_cost
        })
    return pd.DataFrame(missions)


def generate_co2_data():
    """Return the emission lookup table as a DataFrame.

    One row per (travel_type, vehicle_type) pair with its `co2_per_km` value.
    `energy_per_km` is only provided for the electric car; it is missing for
    every other row.
    """
    columns = ("travel_type", "vehicle_type", "co2_per_km", "energy_per_km")
    factors = [
        ("car", "gasoline", 0.192, None),
        ("car", "diesel", 0.171, None),
        ("car", "hybrid", 0.131, None),
        ("car", "electric", 0.030, 0.15),
        ("train", "TER", 0.012, None),
        ("train", "TGV", 0.003, None),
        ("plane", "N/A", 0.255, None),
        ("carpool", "average", 0.085, None),
    ]
    # Build the same list-of-dicts input as before so column inference is unchanged.
    return pd.DataFrame([dict(zip(columns, row)) for row in factors])

# Generate the three tables; each mission samples its traveller from person_df.
person_df = generate_person_data(NUM_PEOPLE)
mission_df = generate_mission_data(person_df, NUM_MISSIONS)
co2_df = generate_co2_data()

# Preview: first rows of the people and mission tables plus the full CO2 table.
(person_df.head(), mission_df.head(), co2_df)


(   person_id   first_name  last_name  \
 0          1        Paige     Dennis   
 1          2       Rachel     Lucero   
 2          3      Carolyn    Barrera   
 3          4  Christopher  Daugherty   
 4          5          Amy   Faulkner   
 
                                         home_address  \
 0            133 Ashley Fields, Robinburgh, MO 37553   
 1  335 Trujillo Spurs Suite 520, Reesechester, AS...   
 2      87790 Diaz Lock Apt. 019, Smithport, ND 93682   
 3     99909 Kim Loop Suite 588, Haneymouth, LA 13659   
 4  31259 Jeffrey Meadows Apt. 999, Lake Glennton,...   
 
                               admin_address   admin_city  has_car  car_type  \
 0      Université de Franche-Comté, Belfort      Belfort     True  electric   
 1  Université de Franche-Comté, Montbéliard  Montbéliard     True    diesel   
 2      Université de Franche-Comté, Belfort      Belfort     True  electric   
 3  Université de Franche-Comté, Montbéliard  Montbéliard     True  electric   
 4     U

In [None]:
output_dir = "/content/drive/MyDrive/Master IoT/Tutor project"

# (table, destination) pairs; each table is written without its index column.
exports = [
    (person_df, f"{output_dir}/person.csv"),
    (mission_df, f"{output_dir}/mission.csv"),
    (co2_df, f"{output_dir}/co2.csv"),
]
for frame, target in exports:
    frame.to_csv(target, index=False)

print("CSV files saved to your Google Drive folder:", output_dir)

CSV files saved to your Google Drive folder: /content/drive/MyDrive/Master IoT/Tutor project


# Generating Data with Airports, French Addresses, and a New Column for the Nearest Airport


In [None]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import timedelta

fake = Faker('fr_FR')  # fr_FR locale: generated names and addresses are French
# Seed Faker too (not only `random` and NumPy); otherwise names, addresses and
# the mission date pool change on every run.
Faker.seed(42)
random.seed(42)
np.random.seed(42)

NUM_PEOPLE = 60
NUM_MISSIONS = 300

# Administrative site (full address) -> city of that site.
admin_sites = {
    "Université de Franche-Comté, Besançon": "Besançon",
    "Université de Franche-Comté, Montbéliard": "Montbéliard",
    "Université de Franche-Comté, Belfort": "Belfort"
}

# Candidate mission cities; every city has an entry in `airports` below.
cities = ["Besançon", "Montbéliard", "Belfort", "Lyon", "Paris", "Strasbourg", "Marseille", "Nice", "Toulouse", "Nantes", "Lille", "Barcelona"]
# City -> name of the airport recorded as "nearest_airport" for plane trips.
airports = {
    "Besançon": "Dole",
    "Montbéliard": "EuroAirport Basel",
    "Belfort": "EuroAirport Basel",
    "Lyon": "Lyon-Saint Exupéry",
    "Paris": "Charles de Gaulle",
    "Strasbourg": "Strasbourg Airport",
    "Marseille": "Marseille Provence",
    "Nice": "Nice Côte d'Azur",
    "Toulouse": "Toulouse-Blagnac",
    "Nantes": "Nantes Atlantique",
    "Lille": "Lille Airport",
    "Barcelona": "Barcelona-El Prat"
}

car_types = ["electric", "diesel", "hybrid", "gasoline"]
events = ["Conference", "Teaching", "Training", "Meeting"]
travel_types = ["car", "train", "plane", "carpool"]

def generate_person_data(num_people=NUM_PEOPLE):
    """Build a DataFrame describing `num_people` synthetic people.

    Each row has a sequential `person_id` (starting at 1), a fake French name,
    a home address in one of the three local areas, a randomly chosen
    administrative site and a `has_car` flag. `car_type` and `fiscal_hp` are
    only set for people who have a car and are None otherwise, so the table
    never describes a car for someone flagged as having none.
    """
    areas = ["Besançon", "Montbéliard", "Belfort"]
    site_names = list(admin_sites.keys())
    people = []
    for i in range(1, num_people + 1):
        home_city = random.choice(areas)
        first_name = fake.first_name()
        last_name = fake.last_name()
        home_address = fake.street_address() + f", {home_city}"
        admin_address = random.choice(site_names)
        has_car = random.choice([True, False])
        person = {
            "person_id": i,
            "first_name": first_name,
            "last_name": last_name,
            "home_address": home_address,
            "admin_address": admin_address,
            "has_car": has_car,
            "car_type": random.choice(car_types) if has_car else None,
            "fiscal_hp": random.randint(4, 10) if has_car else None
        }
        people.append(person)
    return pd.DataFrame(people)

def generate_mission_data(person_df, num_missions=NUM_MISSIONS):
    """Build a DataFrame of `num_missions` synthetic missions.

    Args:
        person_df: people table; one row is sampled per mission. `person_id`
            is required. `admin_address`, `has_car` and `car_type` are used
            when present.
        num_missions: number of missions (rows) to generate.

    Returns:
        DataFrame with one row per mission: ids, start/end city and date,
        event, travel and vehicle type, distance in `km`, the individual cost
        components, their sum in `total_cost`, and `nearest_airport` (filled
        for plane trips, "N/A" otherwise).
    """
    missions = []
    site_cities = list(admin_sites.values())
    # Start dates are drawn from a pool of 30 dates in the last six months and
    # shifted by -1, 0 or +1 day.
    date_pool = [fake.date_between(start_date='-6M', end_date='today') for _ in range(30)]
    for i in range(1, num_missions + 1):
        person = person_df.sample(1).iloc[0]

        # A mission starts from the traveller's own administrative site. Fall
        # back to a random site only when the person row does not provide a
        # known `admin_address`.
        start_city = admin_sites.get(person.get("admin_address"))
        if start_city is None:
            start_city = random.choice(site_cities)
        end_city = random.choice([c for c in cities if c != start_city])

        start_date = random.choice(date_pool) + timedelta(days=random.choice([-1, 0, 1]))
        duration = random.choice([0, 1, 2])
        end_date = start_date + timedelta(days=duration)
        travel_type = random.choice(travel_types)
        is_return = random.choice([True, False])
        km = random.randint(100, 800)
        parking = round(random.uniform(0, 25), 2) if travel_type in ["car", "carpool"] else 0
        hotel = round(random.uniform(40, 150), 2)
        plane_cost = round(random.uniform(90, 200), 2) if travel_type == "plane" else 0
        reimbursement = round(random.uniform(50, 180), 2)
        # Round the sum as well: adding two-decimal floats can leave binary
        # noise in the exported CSV.
        total_cost = round(parking + hotel + plane_cost + reimbursement, 2)

        # Car trips use the traveller's own car type when they have one;
        # carpools use the generic "average"; everything else is "N/A".
        if travel_type == "car":
            car_type = person.get("car_type")
            owns_car = bool(person.get("has_car", True))
            vehicle_type = car_type if owns_car and pd.notna(car_type) else "N/A"
        elif travel_type == "carpool":
            vehicle_type = "average"
        else:
            vehicle_type = "N/A"

        missions.append({
            "mission_id": i,
            "person_id": person["person_id"],
            "start_city": start_city,
            "end_city": end_city,
            "start_date": start_date,
            "end_date": end_date,
            "event": random.choice(events),
            "real_move": True,
            "travel_type": travel_type,
            "vehicle_type": vehicle_type,
            "is_return_trip": is_return,
            "km": km,
            "parking_cost": parking,
            "hotel_cost": hotel,
            "plane_cost": plane_cost,
            "reimbursement": reimbursement,
            "total_cost": total_cost,
            "nearest_airport": airports[start_city] if travel_type == "plane" else "N/A"
        })
    return pd.DataFrame(missions)

# Build the extended tables with the generators' default sizes (NUM_PEOPLE / NUM_MISSIONS).
person_df = generate_person_data()
mission_df = generate_mission_data(person_df)

# Export both tables to the Drive project folder, without the DataFrame index.
output_path = "/content/drive/MyDrive/Master IoT/Tutor project"
person_df.to_csv(f"{output_path}/person_extended.csv", index=False)
mission_df.to_csv(f"{output_path}/mission_extended.csv", index=False)
print("Files saved!")


Files saved!


# Generating data with addresses restricted to France. We also add car models, and generate the data first for personsnew.csv and then for missionsnew.csv.

In [None]:
import pandas as pd
import random
import numpy as np
from faker import Faker
from datetime import datetime, timedelta

fake = Faker("fr_FR")
# Seed both random sources used by this cell. Without this the generated
# people and missions depend on whatever random state earlier cells left
# behind and cannot be reproduced.
Faker.seed(42)
random.seed(42)

admin_cities = ["Montbéliard", "Besançon", "Belfort"]
# Administrative city -> airport recorded as "nearest_airport" for plane trips.
airports = {
    "Montbéliard": "Basel-Mulhouse",
    "Besançon": "Dole-Jura",
    "Belfort": "Basel-Mulhouse"
}

# (model name, fuel type, capacity) for each car a person can own.
car_models_data = [
    ("Audi A4", "gasoline", 5),
    ("Peugeot 208", "diesel", 5),
    ("Renault Clio", "diesel", 5),
    ("Tesla Model 3", "electric", 5),
    ("Toyota Prius", "hybrid", 5),
    ("Fiat Panda", "gasoline", 4)
]
# Fiscal horsepower is drawn uniformly from 4..10 inclusive.
fiscal_hp_values = list(range(4, 11))

# Generate 300 people; roughly 70% of them own one of the cars in car_models_data.
people = []
for pid in range(1, 301):
    admin_city = random.choice(admin_cities)
    has_car = random.random() < 0.7

    # People without a car keep empty strings in all four car columns.
    car_model = car_type = car_capacity = fiscal_hp = ""
    if has_car:
        car_model, car_type, car_capacity = random.choice(car_models_data)
        fiscal_hp = random.choice(fiscal_hp_values)

    people.append({
        "person_id": pid,
        "first_name": fake.first_name(),
        "last_name": fake.last_name(),
        # Faker returns multi-line addresses; flatten them to a single line.
        "home_address": fake.address().replace("\n", ", "),
        "admin_address": f"Université de Franche-Comté, {admin_city}",
        "admin_city": admin_city,
        "has_car": has_car,
        "car_type": car_type,
        "car_model": car_model,
        "car_capacity": car_capacity,
        "fiscal_hp": fiscal_hp,
    })

person_df = pd.DataFrame(people)

# Generate exactly one mission per person, starting from their administrative city.
missions = []
base_date = datetime(2025, 5, 1)
end_cities = ["Paris", "Lyon", "Marseille", "Nice", "Strasbourg", "Toulouse", "Dijon"]

for traveller in person_df.itertuples():
    # Departure falls within the 11 days starting at base_date; trips last 1-3 days.
    departure = base_date + timedelta(days=random.randint(0, 10))
    n_days = random.randint(1, 3)
    arrival_back = departure + timedelta(days=n_days)

    origin = traveller.admin_city
    destination = random.choice([c for c in end_cities if c != origin])
    mode = random.choice(["car", "plane"])
    by_car = mode == "car"
    by_plane = mode == "plane"

    distance_km = random.randint(100, 800)
    # Flat rates: 100 per day of hotel, 10 per day of parking (car only),
    # 120 for a plane ticket.
    hotel = n_days * 100
    parking = 10 * n_days if by_car else 0
    ticket = 120 if by_plane else 0
    refund = random.randint(50, 150)

    missions.append({
        "mission_id": traveller.Index + 1,
        "person_id": traveller.person_id,
        "start_city": origin,
        "end_city": destination,
        "start_date": departure.strftime("%Y-%m-%d"),
        "end_date": arrival_back.strftime("%Y-%m-%d"),
        "event": random.choice(["Meeting", "Conference", "Training"]),
        "real_move": True,
        "travel_type": mode,
        # The traveller's own car model, only for car trips by car owners.
        "vehicle_type": traveller.car_model if by_car and traveller.has_car else "",
        "is_return_trip": random.choice([True, False]),
        "km": distance_km,
        "parking_cost": parking,
        "hotel_cost": hotel,
        "plane_cost": ticket,
        "reimbursement": refund,
        "total_cost": hotel + parking + ticket + refund,
        "nearest_airport": airports[origin] if by_plane else "",
    })

mission_df = pd.DataFrame(missions)

# Export both tables to the Drive project folder, without the DataFrame index.
person_df.to_csv("/content/drive/MyDrive/Master IoT/Tutor project/personsnew.csv", index=False)
mission_df.to_csv("/content/drive/MyDrive/Master IoT/Tutor project/missionsnew.csv", index=False)

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

missions = pd.read_csv("/content/drive/MyDrive/Master IoT/Tutor project/missionsnew.csv")

# The dates are stored as text in the CSV; parse both columns into datetimes.
for date_col in ("start_date", "end_date"):
    missions[date_col] = pd.to_datetime(missions[date_col])

def random_time(date):
    """Return `date` shifted by a random time of day.

    The offset is a whole hour between 6 and 20 (inclusive) plus 0, 15, 30 or
    45 minutes, drawn from the global `random` generator.
    """
    offset = timedelta(
        hours=random.randint(6, 20),
        minutes=random.choice([0, 15, 30, 45]),
    )
    return date + offset

# Add a random time of day to every start date first, then to every end date.
for date_col in ("start_date", "end_date"):
    missions[date_col] = missions[date_col].apply(random_time)

missions.to_csv("/content/drive/MyDrive/Master IoT/Tutor project/allmissions.csv", index=False)

In [None]:
# NOTE(review): person_df is reloaded here but not used further in this cell.
person_df = pd.read_csv("/content/drive/MyDrive/Master IoT/Tutor project/personsnew.csv")
# By default read_csv treats the literal "N/A" as a missing value, which would
# turn the plane row's vehicle_type into NaN and write it back as an empty
# field. Only empty fields are treated as missing here, so "N/A" survives.
co2_df = pd.read_csv(
    "/content/drive/MyDrive/Master IoT/Tutor project/co2.csv",
    keep_default_na=False,
    na_values=[""],
)

# co2_per_km value for each car model.
model_co2 = {
    "Audi A4": 0.185,
    "Peugeot 208": 0.155,
    "Renault Clio": 0.160,
    "Tesla Model 3": 0.030,
    "Toyota Prius": 0.100,
    "Fiat Panda": 0.170
}

# One extra "car" row per model, keyed by the model name in vehicle_type;
# energy_per_km is left missing for these rows.
model_rows = [
    {"travel_type": "car", "vehicle_type": model, "co2_per_km": co2, "energy_per_km": np.nan}
    for model, co2 in model_co2.items()
]
model_df = pd.DataFrame(model_rows)

# Append the per-model rows to the generic table and export the result.
co2_extended = pd.concat([co2_df, model_df], ignore_index=True)
co2_extended.to_csv("/content/drive/MyDrive/Master IoT/Tutor project/allco2d.csv", index=False)

In [None]:
import pandas as pd

# Load the time-stamped missions, the people table and the extended CO₂ table.
missions_df = pd.read_csv("/content/drive/MyDrive/Master IoT/Tutor project/allmissions.csv")
persons_df = pd.read_csv("/content/drive/MyDrive/Master IoT/Tutor project/personsnew.csv")
co2_df = pd.read_csv("/content/drive/MyDrive/Master IoT/Tutor project/allco2d.csv")
print("Available columns in persons_df:", persons_df.columns)

# read_csv parses both empty fields and the literal "N/A" as NaN, so the
# lookup row without a vehicle (plane) arrives with a NaN key. Restore the
# "N/A" sentinel so it can match the "N/A" keys built for the missions below.
co2_df["vehicle_type"] = co2_df["vehicle_type"].fillna("N/A")

# vehicle_type is rebuilt below from the traveller's car_type, so drop the
# column stored with the missions.
if "vehicle_type" in missions_df.columns:
    missions_df = missions_df.drop(columns=["vehicle_type"])

# Merge with the person data to get each traveller's car_type.
merged_df = missions_df.merge(persons_df[["person_id", "car_type"]], on="person_id", how="left")

# Only car trips use the traveller's own car. Any other travel type must be
# looked up with "N/A"; previously a plane trip by a car owner was looked up
# as e.g. ("plane", "diesel") and never matched a CO₂ row. Car trips by
# people without a car also end up as "N/A" and stay unmatched.
is_car_trip = merged_df["travel_type"] == "car"
merged_df["car_type"] = merged_df["car_type"].where(is_car_trip).fillna("N/A")

merged_df = merged_df.rename(columns={"car_type": "vehicle_type"})

# Merge with the CO₂ values; missions without a matching row keep NaN.
merged_with_co2 = merged_df.merge(co2_df, on=["travel_type", "vehicle_type"], how="left")

merged_with_co2.to_csv("/content/drive/MyDrive/Master IoT/Tutor project/alldata.csv", index=False)
# Report the file that was actually written.
print("CO₂-enriched mission data saved as: alldata.csv")

In [None]:
import pandas as pd

# CO₂ lookup table with one row per car model or transport mode:
# [car_model, car_type, co2_per_km, energy_per_km].
# NOTE(review): the non-car rows reuse the car columns (e.g. "TER" is stored
# under car_model and "train" under car_type) — confirm consumers expect this.
data = [
    ["Audi A4", "gasoline", 0.185, None],
    ["Peugeot 208", "diesel", 0.155, None],
    ["Renault Clio", "diesel", 0.160, None],
    ["Tesla Model 3", "electric", 0.030, 0.15],
    ["Toyota Prius", "hybrid", 0.100, None],
    ["Fiat Panda", "gasoline", 0.170, None],
    ["average_carpool", "average", 0.085, None],
    ["TER", "train", 0.012, None],
    ["TGV", "train", 0.003, None],
    ["plane", "plane", 0.255, None],
]

co2_df = pd.DataFrame(data, columns=["car_model", "car_type", "co2_per_km", "energy_per_km"])

# Export the table to the Drive project folder, without the DataFrame index.
co2_df.to_csv("/content/drive/MyDrive/Master IoT/Tutor project/co22.csv", index=False)
