#### Importing librairies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

#### Importing and processing incidents

In [2]:
start_incidents = pd.read_csv("start_incidents_2022_01_01_2022_12_01.csv")
start_incidents = start_incidents[['UserName','Timestamp', 'Embedded_text','Emojis','Tweet URL']]
start_incidents.insert(0, 'start/end', 'start')
start_incidents['Timestamp'] = pd.to_datetime(start_incidents['Timestamp']) + datetime.timedelta(hours=1) # Add one hour to Timestamp in order to convert UTC to Paris time
start_incidents.head()

Unnamed: 0,start/end,UserName,Timestamp,Embedded_text,Emojis,Tweet URL
0,start,@Ligne9_RATP,2022-02-17 07:08:05+00:00,[ Incident technique] 06h55 : Une rame station...,⚠,https://twitter.com/Ligne9_RATP/status/1494191...
1,start,@Ligne9_RATP,2022-02-15 16:05:58+00:00,Accident Grave de Personne (Strasbourg Saint D...,⚠,https://twitter.com/Ligne9_RATP/status/1493602...
2,start,@Ligne9_RATP,2022-02-15 15:16:09+00:00,"Accident grave de voyageur, (Strasbourg Saint ...",❌,https://twitter.com/Ligne9_RATP/status/1493589...
3,start,@Ligne9_RATP,2022-02-15 14:28:24+00:00,"Accident grave de voyageur, (Strasbourg Saint ...",❌,https://twitter.com/Ligne9_RATP/status/1493577...
4,start,@Ligne9_RATP,2022-02-15 14:22:02+00:00,Accident Grave de Personne (Strasbourg Saint D...,⚠,https://twitter.com/Ligne9_RATP/status/1493576...


In [3]:
end_incidents = pd.read_csv("end_incidents_2022_01_01_2022_12_01.csv")
end_incidents = end_incidents[['UserName','Timestamp', 'Embedded_text','Emojis','Tweet URL']]
end_incidents.insert(0, 'start/end', 'end')
end_incidents['Timestamp'] = pd.to_datetime(end_incidents['Timestamp']) + datetime.timedelta(hours=1) # Add one hour to Timestamp in order to convert UTC to Paris time
end_incidents.head()

Unnamed: 0,start/end,UserName,Timestamp,Embedded_text,Emojis,Tweet URL
0,end,@Ligne9_RATP,2022-02-18 19:16:10+00:00,L’incident lié à des personnes sur les voies ...,✅,https://twitter.com/Ligne9_RATP/status/1494737...
1,end,@Ligne9_RATP,2022-02-18 19:15:31+00:00,En répercussion de personnes sur les voies à ...,⚠,https://twitter.com/Ligne9_RATP/status/1494737...
2,end,@Ligne9_RATP,2022-02-17 07:28:00+00:00,"[ Incident technique] 07h25 : Fin d'incident,...",✅,https://twitter.com/Ligne9_RATP/status/1494196...
3,end,@Ligne9_RATP,2022-02-16 12:00:01+00:00,Fin d'incident. Le trafic reprend progressivem...,,https://twitter.com/Ligne9_RATP/status/1493902...
4,end,@Ligne9_RATP,2022-02-15 19:13:52+00:00,Accident Grave de Personne (Strasbourg Saint ...,✅,https://twitter.com/Ligne9_RATP/status/1493649...


#### Merging start and end incidents

In [4]:
incidents = pd.concat([start_incidents, end_incidents])
incidents["Timestamp"] = incidents["Timestamp"].apply(lambda x: str(x))
incidents.sort_values(by=['Timestamp'], inplace=True)
incidents.reset_index(inplace=True)
incidents.drop(columns=['index'], inplace=True)
incidents["date"] = incidents["Timestamp"].apply(lambda x: x.split(" ")[0])
incidents.head()

Unnamed: 0,start/end,UserName,Timestamp,Embedded_text,Emojis,Tweet URL,date
0,start,@Ligne9_RATP,2022-01-01 08:29:08+00:00,[Malaise voyageur] 08h28 : Une rame stationne ...,⚠,https://twitter.com/Ligne9_RATP/status/1477180...,2022-01-01
1,start,@Ligne9_RATP,2022-01-02 18:30:02+00:00,"en raison d'un bagage oublié, le trafic est in...",❌,https://twitter.com/Ligne9_RATP/status/1477693...,2022-01-02
2,end,@Ligne9_RATP,2022-01-02 18:59:11+00:00,"fin d'incident (bagage oublié), le trafic repr...",⚠,https://twitter.com/Ligne9_RATP/status/1477701...,2022-01-02
3,end,@Ligne9_RATP,2022-01-02 19:14:04+00:00,Retour à un trafic régulier sur l'ensemble de...,✅,https://twitter.com/Ligne9_RATP/status/1477704...,2022-01-02
4,end,@Ligne9_RATP,2022-01-04 17:03:52+00:00,"fin d'incident (bagage oublié), le trafic repr...",⚠,https://twitter.com/Ligne9_RATP/status/1478396...,2022-01-04


In [5]:
def timestamp_to_time_slot(timestamp : str) :
    """
    Return the time slot of a given timestamp.
    """
    time_slot = int(timestamp[11:13])//2

    time_slots = ["0h-2h","2h-4h","4h-6h","6h-8h","8h-10h","10h-12h","12h-14h","14h-16h","16h-18h","18h-20h","20h-22h","22h-24h"]
    time_slots_dict = {i : time_slots[i] for i in range(len(time_slots))}    
    
    return time_slots_dict[time_slot]

In [6]:
incidents["time_slot"] = incidents["Timestamp"].apply(timestamp_to_time_slot)
incidents.head()

Unnamed: 0,start/end,UserName,Timestamp,Embedded_text,Emojis,Tweet URL,date,time_slot
0,start,@Ligne9_RATP,2022-01-01 08:29:08+00:00,[Malaise voyageur] 08h28 : Une rame stationne ...,⚠,https://twitter.com/Ligne9_RATP/status/1477180...,2022-01-01,8h-10h
1,start,@Ligne9_RATP,2022-01-02 18:30:02+00:00,"en raison d'un bagage oublié, le trafic est in...",❌,https://twitter.com/Ligne9_RATP/status/1477693...,2022-01-02,18h-20h
2,end,@Ligne9_RATP,2022-01-02 18:59:11+00:00,"fin d'incident (bagage oublié), le trafic repr...",⚠,https://twitter.com/Ligne9_RATP/status/1477701...,2022-01-02,18h-20h
3,end,@Ligne9_RATP,2022-01-02 19:14:04+00:00,Retour à un trafic régulier sur l'ensemble de...,✅,https://twitter.com/Ligne9_RATP/status/1477704...,2022-01-02,18h-20h
4,end,@Ligne9_RATP,2022-01-04 17:03:52+00:00,"fin d'incident (bagage oublié), le trafic repr...",⚠,https://twitter.com/Ligne9_RATP/status/1478396...,2022-01-04,16h-18h


#### Initialise negatif_df

In [7]:
columns = ["month_number","week_number","monthday_number","weekday_number","time_slot","ongoing_incident"]

In [8]:
# Pourrait être remplacé par : pandas.date_range() ???

def generate_time_stamps(start_day : str, end_day: str,time_ressolution = 2) :
    """
    Generate a list of time stamps between <start_day> and <end_day> (all day include) with a given time resolution.
    """
    start_day = datetime.datetime.strptime(start_day, '%Y-%m-%d')
    end_day = datetime.datetime.strptime(end_day, '%Y-%m-%d') + datetime.timedelta(days=1)
    time_stamps = []
    
    time_stamp = start_day

    while time_stamp < end_day :
        time_stamps.append(time_stamp)
        time_stamp += datetime.timedelta(hours=time_ressolution)


    return time_stamps

time_stamps = [datetime.datetime.strftime(time_stamp, '%Y-%m-%d %H:%M:%S') for time_stamp in generate_time_stamps("2022-01-01", "2022-12-01")]

In [9]:
negatif_df = pd.DataFrame(time_stamps, columns=["timestamp"])
negatif_df["month_number"] = negatif_df["timestamp"].apply(lambda x : datetime.datetime.strptime(x[:10], '%Y-%m-%d').month)
negatif_df["week_number"] = negatif_df["timestamp"].apply(lambda x : datetime.datetime.strptime(x[:10], '%Y-%m-%d').isocalendar().week)
negatif_df["monthday_number"] = negatif_df["timestamp"].apply(lambda x : datetime.datetime.strptime(x[:10], '%Y-%m-%d').day)
negatif_df["weekday_number"] = negatif_df["timestamp"].apply(lambda x : datetime.datetime.strptime(x[:10], '%Y-%m-%d').weekday())
negatif_df["time_slot"] = negatif_df["timestamp"].apply(timestamp_to_time_slot)
negatif_df.insert(1, 'date', negatif_df["timestamp"].apply(lambda x : x[:10]))
negatif_df["ongoing_incident"] = False

negatif_df.head()

Unnamed: 0,timestamp,date,month_number,week_number,monthday_number,weekday_number,time_slot,ongoing_incident
0,2022-01-01 00:00:00,2022-01-01,1,52,1,5,0h-2h,False
1,2022-01-01 02:00:00,2022-01-01,1,52,1,5,2h-4h,False
2,2022-01-01 04:00:00,2022-01-01,1,52,1,5,4h-6h,False
3,2022-01-01 06:00:00,2022-01-01,1,52,1,5,6h-8h,False
4,2022-01-01 08:00:00,2022-01-01,1,52,1,5,8h-10h,False


#### Pass line with incident to True in negatif_df

In [10]:
complet_df = pd.merge(negatif_df, incidents[["date","time_slot","Embedded_text", "Tweet URL"]], how="left", on=["date","time_slot"])
complet_df.drop_duplicates(subset=["timestamp","time_slot"], inplace=True, keep="first")
complet_df["ongoing_incident"] = complet_df["Embedded_text"].apply(lambda x : False if pd.isna(x) else True) # if "Embedded_text" is not NaN, set "ongoing_incident" to True in coreesponding row
complet_df.head()# ATTENTION : il peut y avoir plusieurs lignes pour un même incident et pour un même time_slot
complet_df.shape

(4020, 10)

#### Merging complet_df with holidays and public holiday

In [11]:
holiday_df = pd.read_csv("school_holiday_calendar.csv")
complet_df = complet_df.merge(holiday_df, on = "date", how = "left")

In [12]:
public_holidays = pd.read_csv("jours_feries_metropole.csv")
public_holidays = public_holidays[["date","nom_jour_ferie"]]
public_holidays["date"] = pd.to_datetime(public_holidays["date"], format="%Y-%m-%d")
public_holidays.rename(columns={"nom_jour_ferie":"public_holiday"}, inplace=True)
public_holidays["public_holiday"] = True
public_holidays["date"] = public_holidays["date"].astype(object)
public_holidays.fillna(False, inplace=True)
complet_df = complet_df.merge(public_holidays, on = "date", how = "left", )

Fill NaN

In [13]:
columns = public_holidays.columns.values.tolist() + complet_df.columns.values.tolist()
columns = list(set(columns))
columns.remove("date")
complet_df[columns] = complet_df[columns].fillna(value=False)

In [None]:
#Convertire pd.parse_date