In [None]:
#import all libraries to be used in the exercise
import numpy as np
import pandas as pd
import os
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from IPython.display import display, Markdown, Image
import warnings
# Silence FutureWarning messages so library deprecation notices do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)


In [None]:
#List the files in the current working directory to confirm that the CSV files to load are present.
print(os.listdir()) 

In [None]:
#Load the two source CSV files into dataframes (comma is already pandas' default separator)
flights_csv = 'Customer Flight Activity.csv'
loyalty_csv = 'Customer Loyalty History.csv'
df_flights = pd.read_csv(flights_csv)
df_loyalty = pd.read_csv(loyalty_csv)

In [None]:
#Show the structure of each dataframe: columns, dtypes, non-null counts and memory usage.
#DataFrame.info() writes its report straight to stdout and returns None, so it is called
#bare; wrapping it in print() only appended a stray "None" line after each report.
df_flights.info()
df_loyalty.info()

In [None]:
#Check whether each dataframe uses a custom index or the default numeric one
print(df_flights.index)
print(df_loyalty.index)

In [None]:
#Display the first few rows of each dataframe so that its structure can be checked
display(df_flights.head())
display(df_loyalty.head())

In [None]:
#Print the column names of both dataframes to review them before standardising
print(df_flights.columns)
print(df_loyalty.columns)

In [None]:
#Standardise the column names of both dataframes: trim surrounding blanks,
#lower-case everything and turn inner spaces into underscores
def _standardise(labels):
    """Return the column labels stripped, lower-cased and with spaces replaced by underscores."""
    return labels.str.strip().str.lower().str.replace(" ", "_")

for frame in (df_flights, df_loyalty):
    frame.columns = _standardise(frame.columns)

print(df_flights.columns)
print(df_loyalty.columns)

In [None]:
#Report, for each dataframe, how many distinct values every column holds
unique_reports = (
    ("Number of unique values in Customer Flight Analysis:", df_flights),
    ("Number of unique values in Customer Loyalty History:", df_loyalty),
)
for heading, frame in unique_reports:
    print(heading)
    print(frame.nunique())

In [None]:
#List the distinct years and months present in the flight activity data
year_values = df_flights['year'].unique()
month_values = df_flights['month'].unique()
print("unique values in columm year:", year_values)
print("unique values in columm month:", month_values)

In [None]:
#Count the fully duplicated rows (every column identical to an earlier row) in each dataframe
print(df_flights.duplicated().sum())   #number of flight rows that exactly repeat an earlier row (1864 when this note was written)
print(df_loyalty.duplicated().sum())

In [None]:
#Preview the first 50 rows involved in an exact duplication.
#keep=False flags every copy of a repeated row, not only the later repeats.
df_flights[df_flights.duplicated(keep=False)].head(50)

In [None]:
#Drop rows that are exact copies of an earlier row (all columns equal), then
#renumber the index so that no gaps remain where the copies were removed
df_flights = df_flights.drop_duplicates().reset_index(drop=True)


In [None]:
#Re-count exact duplicates to confirm the removal worked: 0 means no row repeats another one.
print(df_flights.duplicated().sum())


In [None]:
#Find partial duplicates: rows that share loyalty_number, year and month
#but differ in other columns (exact copies were removed earlier).
#keep=False flags every row of each repeated group so all versions can be compared.
df_flights[df_flights.duplicated(subset=['loyalty_number', 'year', 'month'], keep=False)]

In [None]:
#Collapse rows that share loyalty_number, year and month into a single row
#by summing every activity column
key_columns = ['loyalty_number', 'year', 'month']
summed_columns = [
    "flights_booked",
    "flights_with_companions",
    "total_flights",
    "distance",
    "points_accumulated",
    "points_redeemed",
    "dollar_cost_points_redeemed",
]
sum_spec = {column_name: "sum" for column_name in summed_columns}
df_flights = df_flights.groupby(key_columns).agg(sum_spec).reset_index()


In [None]:
#Alternative approach (not applied): keep only the row with the most accumulated points for each customer, year and month. The code would be:
#df_flights = df_flights.sort_values(by='points_accumulated', ascending=False).drop_duplicates(subset=['loyalty_number', 'year', 'month'], keep='first')


In [None]:
#Re-check the structure of the flights dataframe after merging the duplicates.
#info() prints its own report and returns None, so it is not wrapped in print()
#(that only added a stray "None" line to the output).
df_flights.info()

In [None]:
#Even though the loyalty history has no duplicated rows, list the distinct values
#of every text column to review the categories it contains.
text_columns = df_loyalty.select_dtypes(include=['object']).columns
for name, values in df_loyalty[text_columns].items():
    print(f"The singles values of the column {name} are:")
    print(values.unique())

In [None]:
#The country column holds a single value, so it adds no information and is removed.
df_loyalty = df_loyalty.drop(columns=['country'])
print(df_loyalty.columns)

In [None]:
#To avoid errors in further analysis, normalise the categorical text columns
#(gender, education, marital_status, loyalty_card and enrollment_type):
#trim surrounding whitespace and lower-case every value.
columns_to_standar = ['gender','education', 'marital_status', 'loyalty_card', 'enrollment_type']
for categorical in columns_to_standar:
    df_loyalty[categorical] = df_loyalty[categorical].str.strip().str.lower()


In [None]:
#Print the distinct values of each normalised column to confirm the clean-up worked
for column in columns_to_standar:
    print(f"The singles values normalise of the column {column} are: {df_loyalty[column].unique()}")

In [None]:
#With duplicates handled, we move on to managing nulls.
#Customer flight activity has no nulls, so we focus on customer loyalty history.
#First, count the nulls per column so we can decide how to deal with each one.
df_loyalty.isnull().sum()


In [None]:
#Percentage of null values (rounded to 2 decimals) in each column that has nulls
columns_with_nulls = ['salary', 'cancellation_year', 'cancellation_month']
null_share = df_loyalty[columns_with_nulls].isnull().mean()
nulls_ratio = null_share.mul(100).round(2)
print(nulls_ratio)

In [None]:
#Bar chart of the percentage of nulls per column
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=nulls_ratio.index, y=nulls_ratio.values, ax=ax)
ax.set_ylabel("% of null values")
ax.set_title("Map of null values in Salary, Cancellation Year and Cancellation Month")
plt.show()

In [None]:
#Basic descriptive statistics of the three columns with nulls, to understand
#how the data is distributed and to help spot problems (e.g. impossible values).
df_loyalty[['salary', 'cancellation_year', 'cancellation_month']].describe()

In [None]:
#Heatmap of the null mask: one row per client, one column per field with nulls
columns_with_nulls = ['salary', 'cancellation_year', 'cancellation_month']
null_mask = df_loyalty[columns_with_nulls].isnull()
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(null_mask, cmap='Blues', cbar=False, yticklabels=False, ax=ax)
ax.set_title("Map of null values in Salary, Cancellation Year and Cancellation Month")
plt.show()

In [None]:
#Hypothesis: clients with nulls in cancellation_year and cancellation_month are not
#data errors; they are clients who are still active in the loyalty programme.
#A client counts as cancelled only when BOTH cancellation fields are filled in.
has_cancelled = df_loyalty[['cancellation_year', 'cancellation_month']].notnull().all(axis=1)
#value_counts() orders its result by frequency, so labelling the bars by position
#could attach a label to the wrong count. Reindexing fixes the order to
#[False, True] = [active, cancelled] and keeps a zero bar if one group is absent.
cancelled_clients = has_cancelled.value_counts().reindex([False, True], fill_value=0)
plt.figure(figsize=(6,4))
plt.bar(["active", "cancelled"], cancelled_clients.values)
plt.xlabel("membership")
plt.ylabel("number of members")
plt.show()



In [None]:
#Compare the CLV distribution of members without a cancellation year against those with one.
#The boolean x values are ordered False, True, so position 0 holds the members that
#have a cancellation year and position 1 the members that do not.
fig, ax = plt.subplots(figsize=(6, 4))
no_cancellation_year = df_loyalty['cancellation_year'].isnull()
sns.boxplot(x=no_cancellation_year, y=df_loyalty['clv'], ax=ax)
plt.xticks([0, 1], ["unsubscrib members", "Active members"])
ax.set_ylabel("Customer Lifetime Value (CLV)")
ax.set_xlabel("Membership Status")
ax.set_title("Distribution of CLV between active and deleted customers")
plt.show()


In [None]:
#Replace the null cancellation values with 0 to avoid problems in further analysis.
#Remember: from here on, 0 means that the client is still active.
for cancellation_column in ('cancellation_year', 'cancellation_month'):
    df_loyalty[cancellation_column] = df_loyalty[cancellation_column].fillna(0)


In [None]:
#Cast both cancellation columns to integers (possible now that the nulls are filled),
#then check the resulting dtypes and that no nulls remain.
cancellation_columns = ['cancellation_year', 'cancellation_month']
for cancellation_column in cancellation_columns:
    df_loyalty[cancellation_column] = df_loyalty[cancellation_column].astype(int)
df_loyalty[cancellation_columns].info()
df_loyalty[cancellation_columns].isnull().sum()

In [None]:
#Count how many clients have a negative salary recorded
is_negative_salary = df_loyalty['salary'].lt(0)
negative_salaries = is_negative_salary.sum()
print(negative_salaries)

In [None]:
#Inspect the rows whose salary is negative
df_loyalty[df_loyalty['salary'] < 0]

In [None]:
#Replace negative salaries with NaN so they are handled like the other missing salaries
df_loyalty['salary'] = df_loyalty['salary'].mask(df_loyalty['salary'] < 0)

In [None]:
#Re-count the negative salaries after the replacement
new_negative_salaries = df_loyalty['salary'].lt(0).sum()
print(new_negative_salaries)

In [None]:

#Histogram of the known salaries; the count axis is logarithmic
fig, ax = plt.subplots(figsize=(8, 5))
known_salaries = df_loyalty['salary'].dropna()
ax.hist(known_salaries, bins=30, edgecolor='black', alpha=0.7, log=True)
ax.set_title("Distribución de Salary")
ax.set_xlabel("Salary")
ax.set_ylabel("Frecuencia")
plt.show()


In [None]:
#Boxplot of salary to look for possible outliers
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x=df_loyalty['salary'], ax=ax)
ax.set_xlabel("Salary")
ax.set_title("Possible outliers in salary distribution")
plt.show()


In [None]:
#To decide how to handle the null salaries, compare salary against other columns
#of the same table to see how they relate and whether they follow a pattern.
#First comparison: salary by education level.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df_loyalty, x='education', y='salary', ax=ax)
plt.xticks(rotation=45)
ax.set_title("Distribution of Salary by Education Level")
plt.show()



In [None]:
#Salary distribution per city
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df_loyalty, x='city', y='salary', ax=ax)
plt.xticks(rotation=90)
ax.set_title("Salary Distribution by City ")
plt.show()


In [None]:
#Salary distribution per postal code (wide figure: one box per postal code)
fig, ax = plt.subplots(figsize=(24, 6))
sns.boxplot(data=df_loyalty, x='postal_code', y='salary', ax=ax)
plt.xticks(rotation=90)
ax.set_title("Salary Distribution by Postal Code ")
plt.show()


In [None]:
#Scatter plot of salary against customer lifetime value
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=df_loyalty, x='salary', y='clv', alpha=0.5, ax=ax)
ax.set_xlabel("Salary")
ax.set_ylabel("CLV")
ax.set_title("Salary and CLV ratios")
plt.show()


In [None]:
#Median salary for each enrollment type
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=df_loyalty, x='enrollment_type', y='salary', estimator=np.median, ax=ax)
ax.set_title("Median Salary by Membership")
plt.show()


In [None]:
#The missing salaries will be filled with IterativeImputer, which only accepts numeric
#input, so the relevant categorical columns are encoded as numbers first.
#Education is ordinal, so it gets an explicit rank. The keys must be lower-case because
#the education column was normalised with .str.lower() earlier in the notebook; a key
#written as 'High School or Below' never matches, so those rows would silently become
#NaN and have their education level invented by the imputer.
education_mapping = {'high school or below': 0,
                     'college': 1,
                     'bachelor': 2,
                     'master': 3,
                     'doctor': 4}
df_loyalty['education_number'] = df_loyalty['education'].map(education_mapping)

#Series.map() returns NaN for values missing from the mapping: stop here if any
#education label was left unmapped instead of letting the imputer guess it.
unmapped_education = df_loyalty.loc[
    df_loyalty['education_number'].isnull() & df_loyalty['education'].notnull(), 'education'
].unique()
assert len(unmapped_education) == 0, f"Education values without a numeric code: {unmapped_education}"

#Postal codes have no natural order; factorize() simply assigns one integer code per distinct value.
df_loyalty['postal_code_number'] = df_loyalty['postal_code'].factorize()[0]

In [None]:
#Compare the original categorical columns with their numeric encodings on the first 10 rows
df_loyalty[['education', 'education_number', 'postal_code', 'postal_code_number']].head(10)

In [None]:
#List the columns to confirm the two encoded columns were added
print(df_loyalty.columns)


In [None]:
#Descriptive statistics of salary and clv BEFORE imputation, to compare against afterwards
df_loyalty[['salary', 'clv']].describe()

In [None]:

#Fill the missing values of the selected numeric columns with IterativeImputer.
#random_state is fixed so that the imputation is reproducible.
colums_imputer = ['salary', 'clv', 'education_number', 'postal_code_number']
imputer = IterativeImputer(max_iter=100, random_state=42)
imputed_values = imputer.fit_transform(df_loyalty[colums_imputer])
df_loyalty[colums_imputer] = imputed_values

In [None]:
#Confirm that no null salaries remain after the imputation
df_loyalty['salary'].isnull().sum()


In [None]:
#Descriptive statistics of salary and clv AFTER imputation, to compare with the earlier summary
df_loyalty[['salary', 'clv']].describe()

In [None]:
#Remove the helper numeric columns created for the imputer, as they are no longer needed.
helper_columns = ['education_number', 'postal_code_number']
df_loyalty = df_loyalty.drop(columns=helper_columns)


In [None]:
#Join both dataframes on the loyalty_number column.
#A left join keeps every client of df_loyalty and attaches the df_flights data to it.
df_merged = pd.merge(df_loyalty, df_flights, how="left", on='loyalty_number')

In [None]:
#Check the structure of the merged dataframe (columns, dtypes, non-null counts)
df_merged.info()

In [None]:
#Look at 20 random rows of the merged dataframe (no seed is set, so the rows differ on each run)
df_merged.sample(20)

In [None]:
#Export the clean merged dataset to a CSV file (without the index column),
#then list the working directory to confirm the file was written.
df_merged.to_csv("airline_loyalty_programme.csv", index=False)
print(os.listdir()) 

In [None]:
#For the visualisation phase the enrollment year and month are more useful combined,
#so they are merged into a single date (first day of the enrollment month).
enrollment_parts = df_merged[['enrollment_year', 'enrollment_month']].astype(str)
enrollment_date = pd.to_datetime(
    enrollment_parts['enrollment_year'] + '-' + enrollment_parts['enrollment_month'] + '-01',
    format="%Y-%m-%d",
)
df_merged = df_merged.drop(columns=['enrollment_year', 'enrollment_month'])
#Place the new column immediately before cancellation_year
df_merged.insert(df_merged.columns.get_loc('cancellation_year'), 'enrollment_date', enrollment_date)
df_merged = df_merged.set_index('loyalty_number')
df_merged.sample(15)



### ❓ Question 1: Distribution of the number of flights booked per month during each year

In [None]:
#Total flights booked for every (year, month) pair
monthly_totals = df_flights.groupby(['year', 'month'])['flights_booked'].sum()
flights_month_and_year = monthly_totals.reset_index()
print(flights_month_and_year)

📈 **Why do we use this chart?**  
A line chart is useful when we want to see trends or fluctuations over time. In this case, we are interested in seeing how the number of flights changes month by month, and a line connecting them will help us identify peaks in specific months.  


🔍  **What does the chart tell us?**

The graph shows the distribution of booked flights per month during the years 2017 and 2018.  
A seasonal trend is evident, with a progressive increase from February to July, followed by a decline from August to November.  
July is the month with the highest number of bookings, indicating high summer demand.  
Another peak occurs in December, likely due to the holiday or year-end season.  
2018 shows a higher number of bookings in all months compared to 2017.  


In [None]:
#Line chart of the flights booked per month, one line per year
monthly_totals = df_flights.groupby(['year', 'month'])['flights_booked'].sum()
flights_month_and_year = monthly_totals.reset_index()
fig, ax = plt.subplots(figsize=(10, 4))
sns.lineplot(
    data=flights_month_and_year,
    x='month',
    y='flights_booked',
    hue='year',
    marker="o",
    linewidth=2,
    markersize=6,
    ax=ax,
)
ax.set_title("Distribution of flights booked per month and year")
#The x axis is numeric here, so one tick per month number is requested explicitly
ax.set_xticks(range(1, 13))
ax.set_xlabel("Month")
ax.set_ylabel("Flights booked")
ax.legend(title='Year')
ax.grid(True, alpha=0.5)


📉 **Why do we use this chart?**  
A bar chart helps us compare the number of flights per month in a structured way. It clearly shows the differences between months.  

🔍  **What does the chart tell us?**

This graph represents the same data but in bar format.  
It confirms the increase in flight bookings during summer and December.  
The bars for 2018 are consistently higher than those for 2017, confirming the overall growth in reservations.  
The difference between the two years is most noticeable in June, July, and December—key months for the airline’s growth.  

In [None]:
#Grouped bar chart of the flights booked per month, one bar per year
flights_month_and_year = df_flights.groupby(['year', 'month'])['flights_booked'].sum().reset_index()
fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(x='month', y='flights_booked', hue='year', data=flights_month_and_year, ax=ax)
ax.set_title("Distribution of flights booked per month and year")
#No manual xticks: barplot treats month as a category, placing the 12 groups at
#positions 0-11 and labelling them 1-12 itself. Forcing ticks at 1-12 moved every
#tick one slot to the right (no tick under January, a stray tick after December).
ax.set_xlabel("Month")
ax.set_ylabel("Flights booked")
ax.legend(title='Year')
ax.grid(True, alpha=0.5)

🔚  **Summing-up**

Both graphs confirm a seasonal booking pattern and the overall growth in 2018 compared to 2017.  

💬  ***Strategy for the Company***

- **Optimize pricing and special offers.**  
- **Introduce discounts or promotions** during low-demand months to stimulate bookings.  
- **Adjust high-demand fares** to maximize revenue.  
- **Increase flight frequency** during peak season to take advantage of demand.  
- **Plan aircraft maintenance** during the low season to ensure operational efficiency in peak months.   

**-------------------------------------------**

### ❓ Question 2: Is there a connection between flight distance and points accumulated?

📈 **Why do we use this chart?**  
A scatter plot helps us visualize how individual points are distributed and whether there is a relationship between them.  

🔍 **What Does the Chart Tell Us?**

The graph shows the relationship between flight distance and accumulated points, differentiating between the three types of loyalty cards (Star, Nova, and Aurora).

We observe a clear relationship: the greater the distance, the more flight points are accumulated.

Different Slopes Observed
- Star (purple points) appears to grant fewer points per distance traveled.
- Aurora (orange points) seems to accumulate more points than Star.
- Nova (pink points) is the card that accumulates the most points per distance, which could indicate that it is the highest-tier card.

There are some outliers, with accumulated points significantly higher than the general distance trend.



In [None]:
#Scatter plot of distance against accumulated points, coloured by loyalty card
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df_merged,
    x="distance",
    y="points_accumulated",
    hue="loyalty_card",
    marker="o",
    palette="rocket",
    ax=ax,
)
ax.set_xlabel("Flight´s distance")
ax.set_ylabel("Accumulated points")
ax.set_title("Ratio between flight distance and accumulated points")
ax.grid(True)


📉 **Why do we use this chart?**  
A linear regression plot adds a regression line to the scatter plot, helping us see if there is a trend and to separate variables to analyze specific patterns.  


🔍 **What Does the Chart Tell Us?**

It is confirmed that there is a direct relationship between miles traveled and accumulated points.
It is also confirmed that the loyalty cards differ by status, with Star accumulating the fewest points, Aurora being an intermediate category, and Nova being the premium category.

In [None]:
#Same relationship with one fitted regression line per loyalty card
scatter_style = {"alpha": 0.3, "s": 50}
line_style = {"linewidth":3}
sns.lmplot(
    data=df_merged,
    x="distance",
    y="points_accumulated",
    hue="loyalty_card",
    markers="o",
    height=6,
    aspect=1.5,
    palette="rocket",
    scatter_kws=scatter_style,
    line_kws=line_style,
)
plt.xlabel("Flight´s distance")
plt.ylabel("Accumulated points")
plt.title("Ratio between flight distance and accumulated points")

🔚 **Summing-up**

Both graphs show that distance directly influences the number of points. Higher-tier customers (Nova) receive greater rewards for the same distance traveled.

💬  **Strategy for the Company**

Optimizing the Loyalty Program:

Evaluate whether the differences between loyalty cards are attractive enough to encourage customers to fly more or upgrade their level.

Introduce double points accumulation bonuses during specific low-season periods.

Create exclusive promotions that allow long-haul flights to accumulate more points, encouraging customers to choose the airline for transoceanic flights.

Implement bonus points campaigns in low-demand months for lower-tier customers.

**---------------------------------------------------------**

### ❓ Question 3: What is the distribution of clients by province and state?

In [None]:
#Number of clients in each province (value_counts sorts from most to fewest clients)
members_province = df_loyalty["province"].value_counts()
print(members_province)


📈 **Why do we use this chart?**  
A bar chart is very clear for comparing the number of clients in each province. 

🔍  **What does the chart tell us?**

We observe the number of customers distributed by province, with a strong concentration in Ontario, BC, and Quebec. On the other hand, provinces like Prince Edward Island, Yukon, and Newfoundland have a significantly lower share compared to the main regions.  
This suggests that the airline has a highly concentrated geographical presence.  

In [None]:
#Bar chart of the number of clients per province
fig, ax = plt.subplots(figsize=(12, 6))
members_province.plot(kind="bar", color="pink", edgecolor="purple", linewidth=3, ax=ax)
ax.set_xlabel("Province")
plt.xticks(rotation=45)
ax.set_ylabel("Number of clients")
ax.set_title("Ratio clients by province")
ax.grid(True, alpha=0.5)


📊 **Why do we use this chart?**  
By flipping the chart and adding percentages, interpretation becomes easier, and it is useful if the number of clients varies significantly between provinces.  

🔍  **What does the chart tell us?**

Representing the relative proportion of customers in each province instead of the absolute number helps us compare it with the total population distribution in Canada (according to the 2021 Census) and assess the loyalty program's penetration in each province.  
Provinces with a negative result, such as Ontario, Quebec, or Alberta, indicate lower program penetration. This could suggest a lack of incentives or lower interest due to strong competition. Additionally, Alberta is one of the wealthiest provinces in Canada, presenting a significant opportunity to attract new customers.  
BC’s high ratio indicates strong success in the region.  
Smaller provinces show a participation level similar to their population proportion.  


In [None]:
#Share of clients per province as a percentage, sorted from smallest to largest
total_members = members_province.sum()
members_percentage = ((members_province / total_members) * 100).sort_values(ascending=True)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=members_percentage, y=members_percentage.index, palette="Purples_r", ax=ax)
#Write the percentage next to each bar
for bar_group in ax.containers:
    ax.bar_label(bar_group, fmt="%.1f%%", padding=5, fontsize=10)
ax.set_xlabel("Percentage of Clients")
ax.set_ylabel("Province")
ax.set_title("Percentage of Clients by Province")
#Leave room on the right so the longest bar's label fits inside the axes
ax.set_xlim(0, max(members_percentage) + 5)
ax.grid(axis="x", linestyle="--", alpha=0.5)



📉 **Why do we use this chart?**  
The variable 'salary' is displayed in a violin plot, which provides a visual representation of the distribution of salaries within each province. The wider the violin is at a given salary, the higher the concentration of clients earning that salary in that province.  

🔍  **What does the chart tell us?**

It is interesting to analyze the salary range according to the population of each province. Ontario, Alberta, and BC have the widest salary distribution. These provinces have a diverse customer base due to their large populations and strong economies.  
Manitoba and New Brunswick have smaller economies, resulting in less salary dispersion.  
Yukon, however, shows a broad salary distribution due to its diversified economy, geographical location, and small population.  

In [None]:
#Violin plot of the salary distribution within each province (all violins share the same width)
fig, ax = plt.subplots(figsize=(12, 10))
sns.violinplot(data=df_loyalty, x="salary", y="province", palette="light:#95A_r", scale="width", ax=ax)
ax.set_xlabel("Salary distribution")
ax.set_ylabel("Province")
ax.set_title("Salary distribution by province")
ax.grid(True, alpha=0.5)
plt.show()

In [None]:
# NOTE(review): these names are already imported in the notebook's first cell; this repeat is redundant.
from IPython.display import display, Markdown, Image

# Render a heading followed by a reference image loaded from the local "Imagen" folder.
# NOTE(review): the path is relative, so the image only loads when the notebook runs next to that folder.
display(Markdown("### Canada Data"))
display(Image("Imagen/2025-03-10.png", width=400, height=300))

🔚  **Summing-up**

Most customers come from three key provinces: Ontario, BC, and Quebec, which together account for 78.3% of the customer base. These regions also exhibit a broader salary distribution, indicating higher purchasing power.  
The fact that the loyalty program's penetration is not proportional to each province’s population suggests opportunities for expansion and the need for province-specific strategies. 


💬  ***Strategy for the Company***

**Expand the loyalty program’s presence** in Ontario, Quebec, and Alberta by offering exclusive benefits at airports or launching targeted communication campaigns in key sectors like business and tourism.  
**Leverage BC’s success** and strengthen customer loyalty. Analyze what has worked well in BC to determine if similar strategies can be implemented in other provinces.  
**Increase representation in underperforming provinces** by introducing personalized incentives such as regional flight discounts, new route development, or collaborations with local governments.  

**----------------------------------------------**

### ❓ Question 4: Compare the average salary by education level

In [None]:
#Average salary per education level, rounded to 2 decimals and sorted from lowest to highest
mean_salary = df_loyalty.groupby("education")["salary"].mean()
salary_by_education = mean_salary.round(2).sort_values()
print(salary_by_education)


📊 **Why do we use this chart?**  
A bar chart helps us clearly compare the average salary for each education level.  

🔍  **What does the chart tell us?**

The chart indicates that, in general, a higher level of education corresponds to a higher average salary. Additionally, the difference between intermediate levels (bachelor's and college) is smaller compared to the gap between a master's and a doctorate.

One important detail is that, in Canada, college and bachelor's refer to two different levels of post-secondary education. College typically includes technical education programs (similar to vocational training or associate degrees in other countries, such as diplomatura or FP in Spain), whereas a bachelor's degree corresponds to a full university degree.

Therefore, this difference in average salaries may be influenced by the technical nature of college programs, in contrast to the more academic training provided by bachelor's degrees.

In [None]:
#Bar chart of the average salary per education level, with the value written above each bar
df_salary = salary_by_education.reset_index()
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df_salary, x="education", y="salary", color="#6C8E68", edgecolor="#8e686e", linewidth=5, ax=ax)
salary_bars = ax.containers[0]
ax.bar_label(salary_bars, fmt="%.2f", fontsize=10, color="black", padding=5)
ax.set_xlabel("Education")
ax.set_ylabel("Average Salary")
ax.set_title("A Comparison of Average Salaries by Educational Level")
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.xticks(rotation=45)


📈 **Why do we use this chart?**  
A boxplot also allows us to see the full distribution of salaries within each education level, as well as outliers.  

🔍  **What does the chart tell us?**

The salary range for college is more uniform, indicating that salaries tend to be consistent across the board with minimal variance.

In contrast, the Bachelor's degree salary range exhibits significant dispersion, likely attributable to the wide range of salaries achievable with university qualifications. These outliers suggest that, in certain cases, these professionals can attain remarkably high salaries.

The high school distribution is relatively concentrated, with minimal income dispersion. Some outliers may be attributed to extended career trajectories or entrepreneurial endeavours.

In [None]:

#Boxplot of salary per education level with custom colours for every box element
box_style = {"facecolor": "#6C8E68", "edgecolor": "#8E686E", "linewidth": 2}
median_style = {"color": "#403033", "linewidth": 2}
whisker_style = {"color": "#8E686E", "linewidth": 2}
cap_style = {"color": "#8E686E", "linewidth": 2}
outlier_style = {"marker": "o", "color": "#8E686E", "alpha": 0.5}
sns.boxplot(
    x=df_loyalty["education"],
    y=df_loyalty["salary"],
    boxprops=box_style,
    medianprops=median_style,
    whiskerprops=whisker_style,
    capprops=cap_style,
    flierprops=outlier_style,
)
plt.xlabel("Educational Attainment")
plt.ylabel("Income")
plt.title("Income and Educational Attainment Structure")
plt.grid(True, alpha=0.5, linestyle="--")
plt.xticks(rotation=45)





In [None]:
#Repeat the boxplot with the college rows only, so that their distribution can be seen properly.
df_college = df_loyalty[df_loyalty["education"] == "college"]
box_style = {"facecolor": "#6C8E68", "edgecolor": "#8E686E", "linewidth": 2}
median_style = {"color": "#403033", "linewidth": 2}
whisker_style = {"color": "#8E686E", "linewidth": 2}  # whiskers
cap_style = {"color": "#8E686E", "linewidth": 2}
outlier_style = {"marker": "o", "markerfacecolor": "#8E686E", "alpha": 0.5}
fig, ax = plt.subplots(figsize=(6, 6))
sns.boxplot(
    y=df_college["salary"],
    boxprops=box_style,
    medianprops=median_style,
    whiskerprops=whisker_style,
    capprops=cap_style,
    flierprops=outlier_style,
    ax=ax,
)
ax.set_ylabel("Income")
ax.set_title("Income Distribution for College Education Level")
ax.grid(True, alpha=0.5, linestyle="--")

📊 **Why do we use this chart?**  
A bar chart is a useful tool for visualising wage data. By breaking down wages into specific ranges and displaying the number of customers in each range, it provides clear insights into the distribution of wages.

🔍  **What does the chart tell us?**

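The chart shows how many clients fall into each salary range. Most of the loyalty programme's clients are concentrated in a single salary range, while only a small number reach the highest ranges.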

In [None]:
#Group the salaries into ranges: 50,000-wide bins up to 300,000 plus a final 300,000-400,000 bin
salary_bins = list(range(0, 300001, 50000)) + [400000]
df_loyalty["salary_range"] = pd.cut(df_loyalty["salary"], bins=salary_bins)
#Count the clients in each range, keeping the ranges in ascending order
clients_per_range = df_loyalty["salary_range"].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(10,6))
clients_per_range.plot(kind="bar", color="#6C8E68", edgecolor="black", alpha=0.7, ax=ax)
ax.set_xlabel("Salary Range")
ax.set_ylabel("Number of Clients")
ax.set_title("Client Distribution by Salary Range")
plt.xticks(rotation=45)
ax.grid(axis="y", alpha=0.5, linestyle="--")
plt.show()


🔚 **Summing-up**

As a general guideline we can define that higher education opens the door to higher salaries, but only a small percentage of people reach extremely high levels. PhDs have the highest salary dispersion which can be interesting for premium strategies. It is very important to note that most of the loyalty programme base is focused on a single salary range.

💬  **Strategy for the Company**

It would be beneficial to create targeted promotions based on educational attainment and salary stability. 
For example, college and bachelor customers could be offered benefits such as points accumulation, promotions on domestic flights, or discounts on work routes. 
For customers with master's and doctoral degrees, we could define premium services, priority access, or other exclusive loyalty programmes.

Given that the majority of customers are in the mid-range, it is essential to focus efforts on benefits that are accessible to the majority of customers, as they are the ones who sustain the programme, with flexible fares and incentives for distance-based loyalty.

**--------------------------------------------------**

### ❓ Question 5: What is the percentage of clients for each type of loyalty card?


In [None]:
#Percentage of clients holding each type of loyalty card
card_share = df_loyalty["loyalty_card"].value_counts(normalize=True)
loyalty_counts = card_share * 100
print(loyalty_counts)


📈 **Why do we use this chart?**  
A pie chart is useful because it shows the proportion of each type of card. It is very intuitive.  

🔍  **What does the chart tell us?**

The STAR card is the most widely used card, making it the most popular among its customer base, as illustrated by graph two. This suggests that it is the basic card, AURORA the intermediate card and NOVA the premium card.

As the entry-level category, the STAR card has the highest percentage of customers. NOVA has a fairly high proportion for the premium category, indicating that a significant number of customers value the exclusive benefits.

The striking fact is the percentage of AURORA customers, since if it is the transition category it should have more customers than NOVA. This may be because many customers jump from Star to Nova without going through AURORA.


In [None]:
#Pie chart of the share of clients per loyalty card; each label is upper-cased
#and painted with the colour assigned to its wedge
colors = ["#dbb770", "#db7094", "#db8270", "gold"]
wedge_style = {"edgecolor": "#f1debd", "linewidth": 2.5}
fig, ax = plt.subplots(figsize=(7, 5))
wedges, texts, autotexts = ax.pie(
    loyalty_counts,
    labels=loyalty_counts.index,
    autopct="%1.1f%%",
    colors=colors,
    startangle=140,
    wedgeprops=wedge_style,
)
for label, label_color in zip(texts, colors):
    label.set_text(label.get_text().upper())
    label.set_color(label_color)
    label.set_fontsize(12)
ax.set_title("Customer Distribution by Type of Loyalty Card", pad=30)
ax.axis("equal")

🔚 **Summing-up**

The benefits of STAR and NOVA should be analysed and compared with those of AURORA to see if adjustments are needed.

💬  **Strategy for the Company**

In order to encourage customers to upgrade from STAR to AURORA and then to NOVA, it would be advisable to implement incentives such as welcome bonuses, flight discounts or temporary promotions.

It is also recommended that strategies are customised according to customer profile, using data to identify who stays on STAR and who upgrades fast, and sending them personalised offers based on their flight history and spending patterns.

**------------------------------------------------**

### ❓ Question 6: How are clients distributed according to marital status and gender?

In [None]:

#Count the clients for every marital status / gender pair, with genders as columns
pair_sizes = df_loyalty.groupby(["marital_status", "gender"]).size()
marital_gender_counts = pair_sizes.unstack()
print(marital_gender_counts)


📊 **Why do we use this chart?**  
A bar plot clearly shows how many clients belong to each marital status and their gender distribution.  

🔍  **What does the chart tell us?**


In [None]:
#Grouped bar chart: number of clients per marital status, one bar per gender.
#The axes are created once and handed to DataFrame.plot() through ax=. Calling
#plt.figure() first and then .plot(figsize=...) opened a second figure for the bars
#and left the first one behind as an empty plot in the cell output.
fig, ax = plt.subplots(figsize=(10, 6))
marital_gender_counts.plot(kind="bar", ax=ax, color=["#c7c4ff", "#c4dfff"], edgecolor="#ffc7c4")
ax.set_xlabel("Marital Status")
ax.set_ylabel("Number of clients")
ax.set_title("Distribution of Clients by Marital Status and Gender")
ax.tick_params(axis="x", labelrotation=45)
ax.legend(title="Gender")
ax.grid(axis="y", linestyle="--", alpha=0.7)


📉 **Why do we use this chart?**  
If we remove the gender variable, a pie chart quickly shows the distribution of clients by marital status only.  

🔍  **What does the chart tell us?**

In [None]:
#Pie chart of the clients per marital status, with every wedge slightly separated
marital_counts = df_loyalty["marital_status"].value_counts()
plt.figure(figsize=(6, 4))
explode = [0.05] * len(marital_counts)
wedges, texts, autotexts = plt.pie(marital_counts, labels=marital_counts.index, autopct="%1.1f%%", colors=plt.cm.Pastel2.colors, startangle=140, explode=explode)
#Style each label from its own wedge. The loop previously iterated over a `colors`
#list that is only defined in the loyalty-card cell, so this cell failed with a
#NameError unless that cell had run first, and the labels were painted with a
#palette unrelated to this chart's wedges.
for text, wedge in zip(texts, wedges):
    text.set_text(text.get_text().upper())
    text.set_color(wedge.get_facecolor())
    text.set_fontsize(12)
plt.title("Customers by Marital Status", pad=30)
plt.axis("equal")

📈 **Why do we use this chart?**  
By integrating a numerical variable like `salary`, we can use a violin plot to visualize the salary distribution within each marital status. 
This helps analyze whether the loyalty program presents salary inequalities between men and women within each marital category.  

🔍  **What does the chart tell us?**

In [None]:
#Violin plot of salary per marital status, split by gender; violin widths are scaled by group size
gender_palette = ["#c7c4ff", "#c4dfff"]
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(data=df_loyalty, x="marital_status", y="salary", hue="gender", palette=gender_palette, scale="count", ax=ax)
ax.set_xlabel("Marital Status")
ax.set_ylabel("Salary Distribution")
ax.set_title("Salary distribution by marital status and gender")
plt.show()

🔚 **Summing-up**

💬  **Strategy for the Company**

**-------------------------------------------------**