### Data Survey

In [2]:
import pandas as pd
import numpy as np

# Load the raw festival dataset (the path is kept exactly as used in the original analysis).
df = pd.read_csv(r"C:\Users\PC\Desktop\Estudio\Analisis de Datos\Proyectos\Festival Purchase Behavior Analysis\Datasets\festival_dataset_dirty_modified.csv")

# High-level overview of the dataset. Every summary is followed by an extra
# print("\n") so the individual sections are easy to tell apart in the output.
overview = (
    df.shape,               # (rows, columns)
    df.isnull().sum(),      # missing values per column
    df.duplicated().sum(),  # number of fully duplicated rows
    df.nunique(),           # distinct values per column
)
for summary in overview:
    print(summary)
    print("\n")

# Text-like columns ('object' or 'category' dtype) are inspected value by value.
text_columns = df.select_dtypes(include=['object', 'category']).columns

# Listing the distinct values of each text column makes typos and
# inconsistent spellings visible at a glance.
for col in text_columns:
    print(f"\nUnique values of '{col}':", df[col].unique(), sep="\n")

(14000, 26)


Unnamed: 0               0
ticket_id                0
ticket_type            280
ticket_price             0
purchase_date            0
attendance_date          0
entry_time               0
was_present              0
attendee_id              0
age                      0
gender                  97
origin_city              0
transport_used           0
group_size               0
food_expense             0
drink_expense            0
merch_expense            0
payment_method           0
favourite_genre          0
stages_visited           0
top_artist_seen          0
hours_spent              0
satisfaction_score       0
security_rating          0
cleanliness_rating       0
recommend_to_friend      0
dtype: int64


0


Unnamed: 0             14000
ticket_id               9319
ticket_type                3
ticket_price               3
purchase_date             89
attendance_date            3
entry_time               540
was_present                1
attendee_id             9319
age 

With this first glimpse we recognise:

1. Columns "gender" and "ticket_type" have null values.
2. No duplicated rows.
3. Unnecessary columns.
4. Typos in columns "payment_method", "favourite_genre", "recommend_to_friend"

### Removing non-essential fields

In [3]:
# Fields treated as non-essential for this analysis; they are removed from the frame.
columns_to_drop = [
    "ticket_id",
    "attendee_id",
    "entry_time",
    "purchase_date",
    "was_present",
    "transport_used",
    "top_artist_seen",
    "origin_city",
]
df = df.drop(columns=columns_to_drop)

### Null values handling

In [4]:
# Seeded generator so the random imputation below gives the same result
# on every "Restart Kernel -> Run All".
IMPUTATION_SEED = 42
imputation_rng = np.random.default_rng(IMPUTATION_SEED)


def fill_nulls_from_distribution(frame, column, rng):
    """Fill the null entries of ``frame[column]`` in place by random sampling.

    Replacement values are drawn from the non-null values of the column, with
    probabilities equal to their observed relative frequencies, so the filled
    column keeps (approximately) the original distribution.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that is modified in place.
    column : str
        Name of the column whose nulls are filled.
    rng : numpy.random.Generator
        Random generator used for the sampling.

    Returns
    -------
    int
        Number of null entries that were filled.

    Raises
    ------
    ValueError
        If the column has nulls but no observed values to sample from.
    """
    # Relative frequency of every observed (non-null) value.
    observed_dist = frame[column].value_counts(normalize=True)
    # Boolean mask locating the null entries.
    null_mask = frame[column].isnull()
    n_nulls = int(null_mask.sum())

    if n_nulls == 0:
        return 0
    if observed_dist.empty:
        raise ValueError(
            f"Column '{column}' has no observed values to sample replacements from."
        )

    frame.loc[null_mask, column] = rng.choice(
        observed_dist.index.to_numpy(),
        size=n_nulls,
        p=observed_dist.to_numpy(),
    )
    return n_nulls


# --- Gender column ---

# Frequency of every gender value, nulls included, before imputation.
# NOTE(review): the recorded output of this cell lists 'Cash' / 'cash' among the
# gender values; those look like payment-method labels. Confirm and clean them
# before imputing, otherwise they are sampled as replacement values too.
print(df['gender'].value_counts(dropna=False))
print("\n")

fill_nulls_from_distribution(df, "gender", imputation_rng)

# --- Ticket Type column ---
# Same procedure as for the "gender" column.
print(df["ticket_type"].value_counts(dropna=False))
print("---------------------------------")

fill_nulls_from_distribution(df, "ticket_type", imputation_rng)

# Frequencies after imputation: no NaN entries should remain.
print(df["gender"].value_counts(dropna=False))
print("\n")
print(df["ticket_type"].value_counts(dropna=False))

gender
Cash      3711
Female    3131
Male      3072
Other     3062
cash       927
NaN         97
Name: count, dtype: int64


ticket_type
3-day Pass    8158
1-day Pass    2782
VIP           2780
NaN            280
Name: count, dtype: int64
---------------------------------
gender
Cash      3741
Female    3151
Male      3090
Other     3084
cash       934
Name: count, dtype: int64


ticket_type
3-day Pass    8333
VIP           2835
1-day Pass    2832
Name: count, dtype: int64


### Making *"ticket_price"* consistent

In the original dataset the prices have been assigned inconsistently. For example, some entries labeled as “VIP” have a price of 80, and some “1-Day” tickets show a price of 350. 

This code ensures that, regardless of how they appeared in the raw data, all “1-Day” tickets become 80, all “3-Day” tickets become 210, and all “VIP” tickets become 350.

In [5]:
# Make "ticket_price" consistent with "ticket_type".
#
# The labels in the dataset are "1-day Pass", "3-day Pass" and "VIP", so an exact
# comparison against "1-Day" / "3-Day" never matches and leaves those rows
# uncorrected. Labels are therefore compared after trimming and lower-casing,
# and both the full and the short spellings are accepted.
PRICE_BY_TICKET_TYPE = {
    "1-day pass": 80,
    "1-day": 80,
    "3-day pass": 210,
    "3-day": 210,
    "vip": 350,
}

# Vectorised lookup of the canonical price for every row (NaN for unknown labels).
ticket_label = df["ticket_type"].str.strip().str.lower()
canonical_price = ticket_label.map(PRICE_BY_TICKET_TYPE)

# Only rows with a recognised ticket type are overwritten; any other row keeps
# the price it already had.
has_canonical_price = canonical_price.notna()
df.loc[has_canonical_price, "ticket_price"] = (
    canonical_price[has_canonical_price].astype(int)
)

### Typos cleaning

During my *Data Survey* I found these typos.

In [6]:
# Misspellings spotted during the data survey, grouped by the column they occur in.
typo_fixes = {
    "payment_method": {"cash ": "Cash"},
    "favourite_genre": {"hiphop": "Hip-Hop", "Regueton": "Reggaeton"},
    "recommend_to_friend": {"nO": "No"},
}

# Apply each column's corrections; values without an entry are left unchanged.
for column_name, corrections in typo_fixes.items():
    df[column_name] = df[column_name].replace(corrections)

### Trimming values with spaces

In [7]:
# Normalise whitespace in every column whose dtype is 'object':
# leading/trailing whitespace is stripped first, then each run of whitespace
# characters is collapsed into a single space.
for col in df.columns:
    if df[col].dtype != 'object':
        continue
    df[col] = df[col].str.strip().str.replace(r'\s+', ' ', regex=True)

### Type conversion

In [8]:
# Cast every column to a dtype suited for analysis; the text columns listed
# here are stored as 'category' to reduce memory usage.
column_dtypes = {
    'ticket_type': 'category',
    'ticket_price': 'int',
    'age': 'int',
    'gender': 'category',
    'group_size': 'int',
    'food_expense': 'float',
    'drink_expense': 'float',
    'merch_expense': 'float',
    'payment_method': 'category',
    'favourite_genre': 'category',
    'stages_visited': 'int',
    'satisfaction_score': 'int',
    'security_rating': 'int',
    'cleanliness_rating': 'int',
}

# "recommend_to_friend" holds text labels (the typo clean-up maps "nO" to "No").
# astype('bool') would turn every non-empty string, "No" included, into True,
# so the labels are mapped to booleans explicitly instead.
BOOL_LABELS = {"yes": True, "no": False, "true": True, "false": False}

recommend_raw = df["recommend_to_friend"]
if recommend_raw.dtype == bool:
    # Already boolean (for example when this cell is re-run): nothing to map.
    recommend_flag = recommend_raw
else:
    recommend_flag = (
        recommend_raw.astype(str).str.strip().str.lower().map(BOOL_LABELS)
    )
    # Fail loudly on labels that cannot be interpreted instead of guessing.
    unexpected = recommend_raw[recommend_flag.isna()].unique()
    if len(unexpected) > 0:
        raise ValueError(
            f"Unexpected values in 'recommend_to_friend': {list(unexpected)}"
        )
    recommend_flag = recommend_flag.astype(bool)

# assign() overwrites the existing column, so the column order is unchanged.
df = df.astype(column_dtypes).assign(recommend_to_friend=recommend_flag)

# Date conversion
df['attendance_date'] = pd.to_datetime(df['attendance_date'])


In [9]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14000 entries, 0 to 13999
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Unnamed: 0           14000 non-null  int64         
 1   ticket_type          14000 non-null  category      
 2   ticket_price         14000 non-null  int64         
 3   attendance_date      14000 non-null  datetime64[ns]
 4   age                  14000 non-null  int64         
 5   gender               14000 non-null  category      
 6   group_size           14000 non-null  int64         
 7   food_expense         14000 non-null  float64       
 8   drink_expense        14000 non-null  float64       
 9   merch_expense        14000 non-null  float64       
 10  payment_method       14000 non-null  category      
 11  favourite_genre      14000 non-null  category      
 12  stages_visited       14000 non-null  int64         
 13  hours_spent          14000 non-