#### Importing librairies

In [1]:
import pandas as pd
import datetime

#### Importing and processing incidents

In [2]:
start_incidents = pd.read_csv("Incidents data/start_incidents_2018_01_01_2023_08_28.csv")
start_incidents = start_incidents[['UserName','Timestamp', 'Embedded_text','Emojis','Tweet URL']]
start_incidents.insert(0, 'start/end', 'start')
start_incidents['Timestamp'] = pd.to_datetime(start_incidents['Timestamp']) + datetime.timedelta(hours=1) # Add one hour to Timestamp in order to convert UTC to Paris time
start_incidents.head()

Unnamed: 0,start/end,UserName,Timestamp,Embedded_text,Emojis,Tweet URL
0,start,@Ligne9_RATP,2018-01-01 02:56:08+00:00,"02:53, la rame stationne à Nation en dir. de M...",,https://twitter.com/Ligne9_RATP/status/9476475...
1,start,@Ligne9_RATP,2018-01-01 19:14:08+00:00,"19:11, la rame stationne à Cx Chavaux en dir. ...",,https://twitter.com/Ligne9_RATP/status/9478937...
2,start,@Ligne9_RATP,2018-01-02 12:06:09+00:00,"12:02, le trafic est interrompu entre Rue des ...",,https://twitter.com/Ligne9_RATP/status/9481484...
3,start,@Ligne9_RATP,2018-01-02 18:05:08+00:00,"18:03, le trafic est perturbé sur la ligne (in...",,https://twitter.com/Ligne9_RATP/status/9482387...
4,start,@Ligne9_RATP,2018-01-02 23:38:08+00:00,"23:35, le trafic est interrompu entre Trocader...",,https://twitter.com/Ligne9_RATP/status/9483225...


In [3]:
end_incidents = pd.read_csv("Incidents data/end_incidents_2018_01_01_2023_08_28.csv")
end_incidents = end_incidents[['UserName','Timestamp', 'Embedded_text','Emojis','Tweet URL']]
end_incidents.insert(0, 'start/end', 'end')
end_incidents['Timestamp'] = pd.to_datetime(end_incidents['Timestamp']) + datetime.timedelta(hours=1) # Add one hour to Timestamp in order to convert UTC to Paris time
end_incidents.head()

Unnamed: 0,start/end,UserName,Timestamp,Embedded_text,Emojis,Tweet URL
0,end,@Ligne9_RATP,2018-01-01 02:22:11+00:00,"Incident terminé (personne sur les voies), ret...",,https://twitter.com/Ligne9_RATP/status/9476390...
1,end,@Ligne9_RATP,2018-01-01 02:46:17+00:00,Retour à un trafic régulier sur l'ensemble de ...,,https://twitter.com/Ligne9_RATP/status/9476451...
2,end,@Ligne9_RATP,2018-01-02 12:11:08+00:00,"12:09, le trafic reprend progressivement (pann...",,https://twitter.com/Ligne9_RATP/status/9481496...
3,end,@Ligne9_RATP,2018-01-02 12:24:06+00:00,Retour à un trafic régulier sur l'ensemble de ...,,https://twitter.com/Ligne9_RATP/status/9481529...
4,end,@Ligne9_RATP,2018-01-02 18:26:07+00:00,Incident terminé.Retour à un trafic normal sur...,,https://twitter.com/Ligne9_RATP/status/9482440...


#### Merging start and end incidents

In [4]:
incidents = pd.concat([start_incidents, end_incidents])
incidents["Timestamp"] = incidents["Timestamp"].apply(lambda x: str(x))
incidents.sort_values(by=['Timestamp'], inplace=True)
incidents.reset_index(inplace=True)
incidents.drop(columns=['index'], inplace=True)
incidents["date"] = incidents["Timestamp"].apply(lambda x: x.split(" ")[0])
incidents.head()

Unnamed: 0,start/end,UserName,Timestamp,Embedded_text,Emojis,Tweet URL,date
0,end,@Ligne9_RATP,2018-01-01 02:22:11+00:00,"Incident terminé (personne sur les voies), ret...",,https://twitter.com/Ligne9_RATP/status/9476390...,2018-01-01
1,end,@Ligne9_RATP,2018-01-01 02:46:17+00:00,Retour à un trafic régulier sur l'ensemble de ...,,https://twitter.com/Ligne9_RATP/status/9476451...,2018-01-01
2,start,@Ligne9_RATP,2018-01-01 02:56:08+00:00,"02:53, la rame stationne à Nation en dir. de M...",,https://twitter.com/Ligne9_RATP/status/9476475...,2018-01-01
3,start,@Ligne9_RATP,2018-01-01 19:14:08+00:00,"19:11, la rame stationne à Cx Chavaux en dir. ...",,https://twitter.com/Ligne9_RATP/status/9478937...,2018-01-01
4,start,@Ligne9_RATP,2018-01-02 12:06:09+00:00,"12:02, le trafic est interrompu entre Rue des ...",,https://twitter.com/Ligne9_RATP/status/9481484...,2018-01-02


In [5]:
def timestamp_to_time_slot(timestamp : str) :
    """
    Return the time slot of a given timestamp.
    """
    time_slot = int(timestamp[11:13])//2

    time_slots = ["0h-2h","2h-4h","4h-6h","6h-8h","8h-10h","10h-12h","12h-14h","14h-16h","16h-18h","18h-20h","20h-22h","22h-24h"]
    time_slots_dict = {i : time_slots[i] for i in range(len(time_slots))}    
    
    return time_slots_dict[time_slot]

In [6]:
incidents["time_slot"] = incidents["Timestamp"].apply(timestamp_to_time_slot)
incidents.head()

Unnamed: 0,start/end,UserName,Timestamp,Embedded_text,Emojis,Tweet URL,date,time_slot
0,end,@Ligne9_RATP,2018-01-01 02:22:11+00:00,"Incident terminé (personne sur les voies), ret...",,https://twitter.com/Ligne9_RATP/status/9476390...,2018-01-01,2h-4h
1,end,@Ligne9_RATP,2018-01-01 02:46:17+00:00,Retour à un trafic régulier sur l'ensemble de ...,,https://twitter.com/Ligne9_RATP/status/9476451...,2018-01-01,2h-4h
2,start,@Ligne9_RATP,2018-01-01 02:56:08+00:00,"02:53, la rame stationne à Nation en dir. de M...",,https://twitter.com/Ligne9_RATP/status/9476475...,2018-01-01,2h-4h
3,start,@Ligne9_RATP,2018-01-01 19:14:08+00:00,"19:11, la rame stationne à Cx Chavaux en dir. ...",,https://twitter.com/Ligne9_RATP/status/9478937...,2018-01-01,18h-20h
4,start,@Ligne9_RATP,2018-01-02 12:06:09+00:00,"12:02, le trafic est interrompu entre Rue des ...",,https://twitter.com/Ligne9_RATP/status/9481484...,2018-01-02,12h-14h


#### Initialise negatif_df

In [7]:
columns = ["month_number","week_number","monthday_number","weekday_number","time_slot","ongoing_incident"]

In [8]:
# Pourrait être remplacé par : pandas.date_range() ???

def generate_time_stamps(start_day : str, end_day: str,time_ressolution = 2) :
    """
    Generate a list of time stamps between <start_day> and <end_day> (all day include) with a given time resolution.
    """
    start_day = datetime.datetime.strptime(start_day, '%Y-%m-%d')
    end_day = datetime.datetime.strptime(end_day, '%Y-%m-%d') + datetime.timedelta(days=1)
    time_stamps = []
    
    time_stamp = start_day

    while time_stamp < end_day :
        time_stamps.append(time_stamp)
        time_stamp += datetime.timedelta(hours=time_ressolution)


    return time_stamps

time_stamps = [datetime.datetime.strftime(time_stamp, '%Y-%m-%d %H:%M:%S') for time_stamp in generate_time_stamps("2018-01-01", "2023-08-28")]

In [9]:
negatif_df = pd.DataFrame(time_stamps, columns=["timestamp"])
negatif_df["month_number"] = negatif_df["timestamp"].apply(lambda x : datetime.datetime.strptime(x[:10], '%Y-%m-%d').month)
negatif_df["week_number"] = negatif_df["timestamp"].apply(lambda x : datetime.datetime.strptime(x[:10], '%Y-%m-%d').isocalendar().week)
negatif_df["monthday_number"] = negatif_df["timestamp"].apply(lambda x : datetime.datetime.strptime(x[:10], '%Y-%m-%d').day)
negatif_df["weekday_number"] = negatif_df["timestamp"].apply(lambda x : datetime.datetime.strptime(x[:10], '%Y-%m-%d').weekday())
negatif_df["time_slot"] = negatif_df["timestamp"].apply(timestamp_to_time_slot)
negatif_df.insert(1, 'date', negatif_df["timestamp"].apply(lambda x : x[:10]))
negatif_df["ongoing_incident"] = False

negatif_df.head()

Unnamed: 0,timestamp,date,month_number,week_number,monthday_number,weekday_number,time_slot,ongoing_incident
0,2018-01-01 00:00:00,2018-01-01,1,1,1,0,0h-2h,False
1,2018-01-01 02:00:00,2018-01-01,1,1,1,0,2h-4h,False
2,2018-01-01 04:00:00,2018-01-01,1,1,1,0,4h-6h,False
3,2018-01-01 06:00:00,2018-01-01,1,1,1,0,6h-8h,False
4,2018-01-01 08:00:00,2018-01-01,1,1,1,0,8h-10h,False


#### Pass line with incident to True in negatif_df

In [10]:
complet_df = pd.merge(negatif_df, incidents[["date","time_slot","Embedded_text", "Tweet URL"]], how="left", on=["date","time_slot"])
complet_df.drop_duplicates(subset=["timestamp","time_slot"], inplace=True, keep="first")
complet_df["ongoing_incident"] = complet_df["Embedded_text"].apply(lambda x : False if pd.isna(x) else True) # if "Embedded_text" is not NaN, set "ongoing_incident" to True in coreesponding row
complet_df.head()# ATTENTION : il peut y avoir plusieurs lignes pour un même incident et pour un même time_slot
complet_df.shape

(24792, 10)

#### Merging complet_df with holidays and public holiday

In [11]:
holiday_df = pd.read_csv("school_holiday_calendar.csv")
complet_df = complet_df.merge(holiday_df, on = "date", how = "left")

In [12]:
public_holidays = pd.read_csv("jours_feries_metropole.csv")
public_holidays = public_holidays[["date","nom_jour_ferie"]]
public_holidays["date"] = pd.to_datetime(public_holidays["date"], format="%Y-%m-%d")
public_holidays.rename(columns={"nom_jour_ferie":"public_holiday"}, inplace=True)
public_holidays["public_holiday"] = True
public_holidays["date"] = public_holidays["date"].astype(object)
public_holidays.fillna(False, inplace=True)
complet_df = complet_df.merge(public_holidays, on = "date", how = "left", )

Fill NaN

In [13]:
columns = public_holidays.columns.values.tolist() + complet_df.columns.values.tolist()
columns = list(set(columns))
columns.remove("date")
complet_df[columns] = complet_df[columns].fillna(value=False)

#### Adding weather data

In [14]:
weather = pd.read_csv("weather.csv")
weather.head()

Unnamed: 0,forecast dt iso,slice dt iso,temperature,dew_point,pressure,ground_pressure,humidity,clouds,wind_speed,wind_deg,rain,snow,ice,fr_rain,convective,snow_depth,accumulated,rate,time_slot,date
0,2017-10-07 18:00:00,2017-10-08 01:00:00,13.595,11.56,1017.63,1007.52,87.465,80.5,4.04,268.91,0.0,0.0,0.0,0.0,0.084,0.0,0.0,1.1e-05,0h-2h,2017-10-08
1,2017-10-07 18:00:00,2017-10-08 03:00:00,13.845,11.12,1017.63,1007.495,83.725,78.0,3.195,291.86,0.0,0.0,0.0,0.0,0.105,0.0,0.0,2.2e-05,2h-4h,2017-10-08
2,2017-10-07 18:00:00,2017-10-08 05:00:00,13.545,10.705,1017.97,1007.84,82.965,83.5,2.355,308.715,0.0,0.0,0.0,0.0,0.126,0.0,0.0,1.9e-05,4h-6h,2017-10-08
3,2017-10-07 18:00:00,2017-10-08 07:00:00,13.265,10.65,1018.7,1008.635,83.985,91.0,1.805,314.19,0.0,0.0,0.0,0.0,0.208,0.0,0.0,6e-06,6h-8h,2017-10-08
4,2017-10-07 18:00:00,2017-10-08 09:00:00,13.86,10.59,1019.58,1009.625,80.565,97.0,1.735,312.2,0.0,0.0,0.0,0.0,0.125,0.0,0.0,6e-06,8h-10h,2017-10-08


In [15]:
complet_df = complet_df.merge(weather, on = ["date","time_slot"], how = "left")

#### Saving complet_df as csv file

In [16]:
drop_columns = ['timestamp', 'date','Embedded_text', 'Tweet URL','forecast dt iso','slice dt iso']
complet_df.drop(columns=drop_columns, inplace=True)
complet_df = complet_df[[col for col in complet_df if col not in ['ongoing_incident']] + ['ongoing_incident']] # Move "ongoing_incident" column to the end of the dataframe

In [17]:
complet_df.columns

Index(['month_number', 'week_number', 'monthday_number', 'weekday_number',
       'time_slot', 'holiday_departure_zone_A', 'holiday_departure_zone_B',
       'holiday_departure_zone_C', 'first_day_holidays_zone_A',
       'first_day_holidays_zone_B', 'first_day_holidays_zone_C',
       'holiday_day_zone_A', 'holiday_day_zone_B', 'holiday_day_zone_C',
       'last_day_holidays_zone_A', 'last_day_holidays_zone_B',
       'last_day_holidays_zone_C', 'public_holiday', 'temperature',
       'dew_point', 'pressure', 'ground_pressure', 'humidity', 'clouds',
       'wind_speed', 'wind_deg', 'rain', 'snow', 'ice', 'fr_rain',
       'convective', 'snow_depth', 'accumulated', 'rate', 'ongoing_incident'],
      dtype='object')

In [18]:
complet_df.to_csv("complet_df.csv", index=False)