### Data Survey

In [50]:
import pandas as pd
import numpy as np

df = pd.read_csv(r"C:\Users\PC\Desktop\Estudio\Analisis de Datos\Proyectos\Festival Purchase Behavior Analysis\Datasets\festival_dataset_dirty.csv")

# This part of the code provides an overview of the dataset, spaced, so each function is easier to read.

print(df.shape)
print("\n")
print(df.isnull().sum())
print("\n")
print(df.duplicated().sum())
print("\n")
print(df.nunique())
print("\n")

# I'm selecting columns of type 'object' (text) or 'category' to analyze unique values in those columns.
text_columns = df.select_dtypes(include=['object', 'category']).columns

# Displaying unique values for each text column
# With it, we can see the unique values in each text column, which helps us understand the dataset better
# And detect typos or inconsistencies in the data.
for col in text_columns:
    print(f"\nUnique values of '{col}':")
    print(df[col].unique())

(14000, 25)


ticket_id                0
ticket_type            280
ticket_price             0
purchase_date            0
attendance_date          0
entry_time               0
was_present              0
attendee_id              0
age                      0
gender                 140
origin_city              0
transport_used           0
group_size               0
food_expense             0
drink_expense            0
merch_expense            0
payment_method           0
favourite_genre          0
stages_visited           0
top_artist_seen          0
hours_spent              0
satisfaction_score       0
security_rating          0
cleanliness_rating       0
recommend_to_friend      0
dtype: int64


0


ticket_id              9319
ticket_type               3
ticket_price              3
purchase_date            89
attendance_date           3
entry_time              540
was_present               1
attendee_id            9319
age                      42
gender                    3
origin_city 

With this first glimpse we recognise:

1. Columns "gender" and "ticket_type" have null values.
2. No duplicated rows
3. ticket_id (which will be the main ID) is repeated along the dataset.
4. Typos in columns "payment_method", "favourite_genre", "recommend_to_friend"

### Null values management

In [51]:
# --- Gender column ---
# Null values management

# Count the number of unique values in each column
# Used to understand the dataset better
print(df['gender'].value_counts(dropna=False))
print("\n")

# gender_dist will store the normalized distribution
gender_dist = df["gender"].value_counts(normalize=True)

# mask will be used to locate the null values in the dataset
mask = df["gender"].isnull()

# Adds all the null
n_nulls = mask.sum()

# Fill the null values with random choices based on the distribution
# This will ensure that the null values are filled in a way that reflects the original distribution
df.loc[mask, "gender"] = np.random.choice(
    gender_dist.index,
    size=n_nulls,
    p=gender_dist.values
)

# --- Ticket Type column ---
# Same steps to clean "ticket_type" column as we followed for "gender" column
print(df["ticket_type"].value_counts(dropna=False))
print("---------------------------------")

type_dist = df["ticket_type"].value_counts(normalize=True)
mask = df["ticket_type"].isnull()
n_nulls = mask.sum()
df.loc[mask, "ticket_type"] = np.random.choice(
    type_dist.index,
    size = n_nulls,
    p=type_dist.values
)


print(df["gender"].value_counts(dropna=False))
print("\n")
print(df["ticket_type"].value_counts(dropna=False))

gender
Female    4709
Other     4609
Male      4542
NaN        140
Name: count, dtype: int64


ticket_type
3-day Pass    8158
1-day Pass    2782
VIP           2780
NaN            280
Name: count, dtype: int64
---------------------------------
gender
Female    4757
Other     4656
Male      4587
Name: count, dtype: int64


ticket_type
3-day Pass    8334
1-day Pass    2842
VIP           2824
Name: count, dtype: int64


### Typos cleaning

In [52]:
# --- Typos cleaning ---

df["payment_method"] = df["payment_method"].replace({"cash ": "Cash"})
df["favourite_genre"] = df["favourite_genre"].replace("hiphop", "Hip-Hop")
df["favourite_genre"] = df["favourite_genre"].replace("Regueton", "Reggaeton")
df["recommend_to_friend"] = df["recommend_to_friend"].replace({"nO": "No"})

### Trimming values with spaces

In [53]:
# Strips leading and trailing whitespace from all string columns
for col in df.columns:
    if df[col].dtype == 'object':
      # It will convert to string, then strip whitespace
        df[col] = df[col].str.strip()
      # It will replace multiple spaces with a single space
        df[col] = df[col].str.replace(r'\s+', ' ', regex=True)

### Removing non-essential fields

In [54]:
df.drop(["ticket_id",
         "attendee_id",
         "entry_time", 
         "purchase_date", 
         "was_present", 
         "transport_used",
         "top_artist_seen", 
         "origin_city"], axis=1)

Unnamed: 0,ticket_type,ticket_price,attendance_date,age,gender,group_size,food_expense,drink_expense,merch_expense,payment_method,favourite_genre,stages_visited,hours_spent,satisfaction_score,security_rating,cleanliness_rating,recommend_to_friend
0,3-day Pass,200,2025-05-02,20,Male,2,15.38,50.74,78.35,Cash,Pop,3,8.6,6.5,8.0,9.5,Yes
1,3-day Pass,150,2025-05-03,33,Female,3,27.45,57.04,48.70,Festival App,Pop,1,7.3,8.0,6.5,6.5,Yes
2,VIP,200,2025-05-03,35,Male,5,12.38,45.05,56.57,Festival App,Rock,1,5.6,9.5,5.0,8.0,Yes
3,3-day Pass,200,2025-05-01,50,Other,3,47.83,53.21,10.78,Card,Hip-Hop,1,5.9,8.0,8.0,6.5,No
4,3-day Pass,80,2025-05-01,27,Female,5,37.65,18.95,11.05,Festival App,Hip-Hop,4,8.0,8.0,8.0,9.5,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13995,1-day Pass,80,2025-05-02,28,Male,4,8.73,34.53,61.05,Card,Pop,1,8.7,8.0,9.5,5.0,Yes
13996,3-day Pass,200,2025-05-01,47,Male,5,6.45,34.89,94.09,Card,Pop,3,3.6,6.5,6.5,8.0,Yes
13997,1-day Pass,150,2025-05-01,28,Male,2,38.53,16.60,64.87,Card,Rock,2,11.8,9.5,9.5,5.0,No
13998,VIP,200,2025-05-02,28,Male,1,27.26,34.59,91.61,Festival App,Reggaeton,2,5.1,8.0,9.5,9.5,Yes


### Type convertion

In [55]:
# With it, we ensure that the data types are appropriate for analysis and optimize memory usage
df = df.astype({
    'ticket_type': 'category',
    'ticket_price': 'int',
    'age': 'int',
    'gender': 'category',
    'group_size': 'int',
    'food_expense': 'float',
    'drink_expense': 'float',
    'merch_expense': 'float',
    'payment_method': 'category',
    'favourite_genre': 'category',
    'stages_visited': 'int',
    'satisfaction_score': 'int',
    'security_rating': 'int',
    'cleanliness_rating': 'int',
    'recommend_to_friend': 'bool'
})

# Date conversion
df['attendance_date'] = pd.to_datetime(df['attendance_date'])


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14000 entries, 0 to 13999
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ticket_id            14000 non-null  object        
 1   ticket_type          14000 non-null  category      
 2   ticket_price         14000 non-null  int64         
 3   purchase_date        14000 non-null  object        
 4   attendance_date      14000 non-null  datetime64[ns]
 5   entry_time           14000 non-null  object        
 6   was_present          14000 non-null  bool          
 7   attendee_id          14000 non-null  object        
 8   age                  14000 non-null  int64         
 9   gender               14000 non-null  category      
 10  origin_city          14000 non-null  object        
 11  transport_used       14000 non-null  object        
 12  group_size           14000 non-null  int64         
 13  food_expense         14000 non-