In [12]:
import pandas as pd

In [13]:
# Location of the raw que-faire-a-paris export, relative to the working directory.
# A forward slash is used as separator because it resolves on Windows, macOS and
# Linux alike, whereas the backslash form only works on Windows.
url = "Raw Data/que-faire-a-paris-.csv"
# The file is read with ';' as the field delimiter.
events = pd.read_csv(url, sep=";")

In [14]:
# Remove columns that have more than 1000 null values

def drop_col_null_values(df: pd.DataFrame, max_nulls: int = 1000) -> pd.DataFrame:
    """Return a copy of ``df`` without its sparsely populated columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    max_nulls : int, default 1000
        Largest number of null values a column may contain and still be kept.
        Columns with strictly more nulls than this are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the columns with at most ``max_nulls`` nulls.
    """
    # Count nulls for every column in one vectorised pass
    null_counts = df.isnull().sum()
    sparse_cols = null_counts[null_counts > max_nulls].index
    return df.drop(columns=sparse_cols)


In [15]:
# Discard the columns that drop_col_null_values considers too sparse
events = events.pipe(drop_col_null_values)

In [16]:
# Of the remaining columns, drop the ones that are not needed:
# - title_event is a duplicate of title
# - ID is not necessary because event_id is already present
# `columns=` already designates the column axis, so the redundant `axis=1` is
# omitted; the result is rebound rather than mutating the frame in place.
events = events.drop(
    columns=[
        'ID',
        'title_event',
        'contact_organisation_name',
        "Crédit de l'image",
        'group',
        'locale',
        'rank',
        'weight',
        'event_pets_allowed',
        "URL de l'image",
    ]
)


In [17]:
# Count fully duplicated rows; the recorded output only lists False, i.e. no duplicates.
events.duplicated().value_counts()

False    2729
Name: count, dtype: int64

In [18]:
# Keep only the rows that have a value in 'Coordonnées géographiques'.
# The filtered frame is rebound instead of being modified in place.
events = events.dropna(subset=['Coordonnées géographiques'])

In [19]:
# Set the location-related columns aside, ordered by 'Adresse du lieu'
localisation = events[['locations', 'Adresse du lieu', 'Coordonnées géographiques']].sort_values('Adresse du lieu')
# `columns=` already designates the column axis (no `axis=1` needed); rebinding
# replaces the in-place mutation.
events = events.drop(columns=['locations'])

In [20]:
# Keep only the events whose 'Date de fin' is greater than '2025-06-01'.
# NOTE(review): the comparison is made against a string literal, so it presumably
# relies on the column holding ISO-8601 text that sorts chronologically - confirm.
# The copy is taken after the boolean filter so that `events` is an independent
# frame rather than a selection of the unfiltered one.
events = events[events['Date de fin'] > '2025-06-01'].copy()

In [22]:
# Break the ';'-separated tags into one column per tag, labelled theme1, theme2, ...
split_themes = events['qfap_tags'].str.split(';', expand=True)
split_themes = split_themes.set_axis(
    [f'theme{n}' for n in range(1, split_themes.shape[1] + 1)],
    axis=1,
)
# Append the theme columns to the right of the existing ones
events = pd.concat([events, split_themes], axis=1)

In [24]:
# Check the frame's dimensions (rows, columns) once the theme columns are appended
events.shape

(2632, 23)

In [25]:
# The raw tag column is no longer needed once it has been split into theme columns.
# The result is rebound instead of mutating the frame in place.
events = events.drop(columns='qfap_tags')

In [26]:
# Split the comma-separated coordinates into latitude / longitude and store them
# as floats instead of text: surrounding whitespace is stripped and any value
# that cannot be parsed becomes NaN rather than staying an unusable string.
events[['latitude', 'longitude']] = (
    events['Coordonnées géographiques']
    .str.split(',', expand=True)
    .apply(lambda part: pd.to_numeric(part.str.strip(), errors='coerce'))
)

In [None]:
# The combined coordinates column is redundant once latitude / longitude exist.
# The result is rebound instead of mutating the frame in place.
events = events.drop(columns='Coordonnées géographiques')

In [27]:
import unicodedata

def clean_column_name(col):
    """Turn a raw column header into an ASCII snake_case identifier.

    Accented letters lose their diacritics, the text is lower-cased,
    apostrophes and hyphens become underscores, any other punctuation is
    removed and each run of whitespace collapses to a single underscore.
    """
    # Decompose accented characters, then discard whatever is not ASCII
    ascii_bytes = unicodedata.normalize('NFKD', col).encode('ascii', 'ignore')
    name = ascii_bytes.decode('utf-8').lower()
    # Apostrophes and hyphens are both mapped to "_" in a single pass
    name = name.translate(str.maketrans("'-", "__"))
    name = re.sub(r'[^\w\s]', '', name)   # strip the remaining punctuation
    return re.sub(r'\s+', '_', name)      # whitespace runs -> "_"

import re
# Normalise every column label with clean_column_name
events.columns = events.columns.map(clean_column_name)

In [28]:
# Inspect the record whose start date lies in the year 2530 (presumably a data-entry error)
events[events['date_de_debut'] == '2530-02-27T19:00:00+01:00']

Unnamed: 0,event_id,url,titre,chapeau,description,date_de_debut,date_de_fin,occurrences,description_de_la_date,nom_du_lieu,...,type_de_prix,type_d_acces,date_de_mise_a_jour,audience,event_indoor,theme1,theme2,theme3,latitude,longitude
332,65667,https://www.paris.fr/evenements/venez-rencontr...,Venez rencontrer les Louves du Polar à la Fnac...,Les autrices francophones de romans noirs sont...,<h2>SANDRINE COHEN</h2><p><strong>Sandrine Coh...,2530-02-27T19:00:00+01:00,2530-02-27T19:30:00+01:00,2530-02-27T18:00:00+01:00_2530-02-27T18:30:00+...,Le lundi 27 février 2530<br />de 18h00 à 18h30...,Fnac Montparnasse,...,gratuit,non,2025-05-20T19:14:13+02:00,Public jeunes et adultes.,1,Littérature,,,48.8459520129701,2.3257039591324


In [29]:
# Exclude the record whose start date is set in the year 2530
events = events.loc[events['date_de_debut'].ne('2530-02-27T19:00:00+01:00')]

In [None]:
# events = events.copy()
# events['date_de_debut'] = pd.to_datetime(events['date_de_debut'], errors='coerce', utc=True)
# events['date_de_fin'] = pd.to_datetime(events['date_de_fin'], errors='coerce', utc=True)


In [30]:
# Number of ';'-separated entries per event (a missing value counts as one empty entry)
lengths = events['occurrences'].fillna('').str.count(';') + 1
# Discard the events that list more than 150 occurrences
events = events.loc[lengths <= 150].copy()

In [31]:
# NOTE(review): this exact split is recomputed by the following cell (In [33]),
# which makes this cell redundant.
new_cols = events['occurrences'].str.split(';', expand=True)

In [33]:
# Work on an independent copy of the frame
events = events.copy()

# One column per ';'-separated occurrence
new_cols = events['occurrences'].str.split(';', expand=True)

# Labels date1 .. dateN, N being the number of columns produced by the split
col_names = [f'date{n}' for n in range(1, new_cols.shape[1] + 1)]

# Attach the occurrence columns to the frame under those labels
events[col_names] = new_cols


In [34]:
# Select the occurrence columns by name pattern (date1, date2, ...) instead of
# assuming there are exactly 150 of them: the split only creates as many columns
# as the longest occurrence list, so a hard-coded range(1, 151) raises KeyError
# whenever that maximum is below 150. Labels such as date_de_debut do not match.
date_cols = list(events.filter(regex=r'^date\d+$').columns)

# Keep only the text before the first '_' of each occurrence
# (presumably its start timestamp - confirm against the raw data)
for col in date_cols:
    events[col] = events[col].str.split('_').str[0]

In [35]:
# Preview the first three rows, including the new date columns
events.head(3)

Unnamed: 0,event_id,url,titre,chapeau,description,date_de_debut,date_de_fin,occurrences,description_de_la_date,nom_du_lieu,...,date141,date142,date143,date144,date145,date146,date147,date148,date149,date150
2,4283,https://www.paris.fr/evenements/la-folle-et-in...,"La folle et inconvenante histoire des femmes, ...","Une immersion drôle, détonante et décalée dans...",<p>Ce spectacle met en scène une jeune femme q...,2023-10-24T00:00:00+02:00,2026-08-07T01:15:00+02:00,2026-08-06T22:00:00+02:00_2026-08-06T23:15:00+...,Le jeudi 06 août 2026<br />de 21h00 à 22h15<br />,Le funambule montmartre,...,,,,,,,,,,
3,51498,https://www.paris.fr/evenements/signatures-ave...,Signatures avec Dominique et Alexandra Duvivier,Encore un nouveau spectacle du duo mythique de...,<p>Peut-être tout simplement la magie de vous ...,2024-10-12T00:00:00+02:00,2025-07-12T01:15:00+02:00,2025-06-27T22:00:00+02:00_2025-06-27T23:15:00+...,Le vendredi 11 juillet 2025<br />de 21h00 à 22...,Théâtre Le Double Fond,...,,,,,,,,,,
4,51918,https://www.paris.fr/evenements/le-milieu-des-...,"« Le Milieu des choses », une œuvre de Javier ...","En juin 2024, Black Swan Real Estate Capital, ...","<p><strong>Avec cette réalisation, l’approche ...",2024-06-01T02:00:00+02:00,2030-06-02T01:59:59+02:00,,Du samedi 01 juin 2024 au samedi 01 juin 2030 ...,Bureaux Panache,...,,,,,,,,,,


In [36]:
# Check the frame's dimensions (rows, columns) now that the date columns are present
events.shape

(2592, 174)

In [None]:
# events.to_csv('events_in_paris.csv', index=False)