In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### **Importing Necessary Libraries**

In [15]:
import pandas as pd
import numpy as np
import datetime
import math

### **Loading Given Data**

In [4]:
data = pd.read_csv("/content/drive/MyDrive/esewa_airlines.csv")

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,account_id,last_modified_date,travelled_flight_name,total_amount
0,0,1.55947e+19,10/11/2023,BUDDHA AIR,6400.0
1,1,2.69308e+18,10/11/2023,BUDDHA AIR,5200.0
2,2,7.49454e+18,10/11/2023,SHREE AIRLINES,8800.0
3,3,1.49332e+19,10/11/2023,BUDDHA AIR,9001.0
4,4,1.68207e+19,10/11/2023,YETI AIRLINES,23200.0


In [6]:
data.columns

Index(['Unnamed: 0', 'account_id', 'last_modified_date',
       'travelled_flight_name', 'total_amount'],
      dtype='object')

In [7]:
# Removing Extra Columns Came Due To Import
del data['Unnamed: 0']

In [8]:
# Checking columns datatypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   account_id             1048575 non-null  float64
 1   last_modified_date     1048575 non-null  object 
 2   travelled_flight_name  1048575 non-null  object 
 3   total_amount           1048575 non-null  float64
dtypes: float64(2), object(2)
memory usage: 32.0+ MB


### **Converting Date of type object to datetime**

In [10]:
data.last_modified_date = pd.to_datetime(data.last_modified_date,format='%m/%d/%Y')

In [11]:
# Analyzing top 5 data of dataframe
data.head()

Unnamed: 0,account_id,last_modified_date,travelled_flight_name,total_amount
0,1.55947e+19,2023-10-11,BUDDHA AIR,6400.0
1,2.69308e+18,2023-10-11,BUDDHA AIR,5200.0
2,7.49454e+18,2023-10-11,SHREE AIRLINES,8800.0
3,1.49332e+19,2023-10-11,BUDDHA AIR,9001.0
4,1.68207e+19,2023-10-11,YETI AIRLINES,23200.0


In [12]:
'''
Checking the unique rows in each columns
'''
data.nunique()

account_id               211509
last_modified_date          479
travelled_flight_name         9
total_amount               4690
dtype: int64

### **Recency**

In RFM (Recency, Frequency, Monetary) analysis, **"Recency"** holds significant importance. It encapsulates the essence of how recently a customer has interacted with a product or service.

- *Recency* essentially measures the freshness of engagement, portraying the time elapsed since the last purchase or interaction. It's a pivotal metric in understanding customer behavior and preferences.
---




In [13]:
'''
Grouping Data by column "account_id" in "last_modified_date"
to get latest login date from each "account_id"
'''
data2 = data.groupby('account_id')['last_modified_date'].max().reset_index()

In [14]:
data2.head()

Unnamed: 0,account_id,last_modified_date
0,6630750000000.0,2024-04-22
1,9564960000000.0,2024-03-21
2,25648600000000.0,2023-02-15
3,67858600000000.0,2024-04-14
4,74415500000000.0,2023-05-08


In [16]:
'''
Finally the last login date is subtracted from the todays date and
result is stored in "regency columns" in days format
'''
data2['regency']  = data2['last_modified_date'].apply(lambda x : math.floor((datetime.datetime.now()-x).days))

In [17]:
data2.head()

Unnamed: 0,account_id,last_modified_date,regency
0,6630750000000.0,2024-04-22,17
1,9564960000000.0,2024-03-21,49
2,25648600000000.0,2023-02-15,449
3,67858600000000.0,2024-04-14,25
4,74415500000000.0,2023-05-08,367


### **Frequency**

**"Frequency"** stands as a pillar of customer engagement assessment. It encapsulates the rhythm of customer interactions with a product or service.

- *Frequency* serves as a beacon illuminating the consistency and regularity of customer engagement. It sheds light on how often a customer interacts, purchases, or engages with the business over a specific period.
---

In [18]:
'''
Grouping Data by column "account_id" in "last_modified_date"
to get count of login from each "account_id"
'''
data_frequency = data.groupby('account_id')['last_modified_date'].count().reset_index()

In [19]:
data_frequency.head()

Unnamed: 0,account_id,last_modified_date
0,6630750000000.0,2
1,9564960000000.0,1
2,25648600000000.0,2
3,67858600000000.0,1
4,74415500000000.0,1


In [20]:
data_frequency.columns = ["account_number","frequency"]

In [21]:
data_frequency.describe()

Unnamed: 0,account_number,frequency
count,211509.0,211509.0
mean,7.872588e+18,4.95759
std,5.115411e+18,10.52478
min,6630750000000.0,1.0
25%,3.58148e+18,1.0
50%,7.26788e+18,2.0
75%,1.17356e+19,6.0
max,1.84467e+19,1595.0


In [24]:
'''
Showing the account_number whose frequency of login is greater than 80
'''
data_freq_80 = data_frequency[data_frequency['frequency']>80]

In [25]:
data_freq_80.shape

(332, 2)

### **Monetary**

 **"Monetary"** emerges as a cornerstone in understanding customer worth. It embodies the financial dimension of customer engagement, quantifying the value each interaction brings to the business.

- *Monetary* delves into the economic aspect of customer behavior, portraying the monetary significance of each transaction or engagement. It not only considers the amount spent but also discerns patterns of spending behavior, purchase preferences, and potential for future revenue generation.
---

In [26]:
'''
Grouping Data by column "account_id" in "total_amount"
to get total transaction done by "account_id"
'''
data_monetary = data.groupby('account_id')['total_amount'].sum().reset_index()

In [27]:
data_monetary.head()

Unnamed: 0,account_id,total_amount
0,6630750000000.0,8200.0
1,9564960000000.0,4400.0
2,25648600000000.0,7790.0
3,67858600000000.0,7900.0
4,74415500000000.0,7400.0


### **Creating Final DataFrame with all RFM values**

In [28]:
data2['frequency'] = data_frequency['frequency']
data2['Monetary'] = data_monetary['total_amount']

In [29]:
data2.head()

Unnamed: 0,account_id,last_modified_date,regency,frequency,Monetary
0,6630750000000.0,2024-04-22,17,2,8200.0
1,9564960000000.0,2024-03-21,49,1,4400.0
2,25648600000000.0,2023-02-15,449,2,7790.0
3,67858600000000.0,2024-04-14,25,1,7900.0
4,74415500000000.0,2023-05-08,367,1,7400.0


### **Finding Valuable Customer From the Given Data**

In [30]:
'''
Getting Customer Who Used Esewa to purchase ticket recently and has used esewa most of the time to purchase tickets.

Note: Threshold assumption is random for both 'regency' and 'frequency'
'''
data_valuable_customer = data2[(data2['regency']<16) & (data2['frequency']>100)]

In [31]:
data_valuable_customer

Unnamed: 0,account_id,last_modified_date,regency,frequency,Monetary
1875,1.213950e+17,2024-04-25,14,101,692884.64
5820,3.700730e+17,2024-05-02,7,104,828376.42
6634,4.193420e+17,2024-05-04,5,122,976079.51
8932,5.723650e+17,2024-05-06,3,1595,11034386.67
10838,6.974150e+17,2024-05-01,8,175,1242944.38
...,...,...,...,...,...
207022,1.787820e+19,2024-04-25,14,110,800552.84
208138,1.801840e+19,2024-05-05,4,105,699211.66
208322,1.804170e+19,2024-05-06,3,114,923748.08
210128,1.827050e+19,2024-05-05,4,338,2837596.70


In [None]:
'''
Getting Most Valuable Customer
'''
data2.iloc[8932:8933]

Unnamed: 0,account_id,last_modified_date,regency,frequency,Monetary
8932,5.72365e+17,2024-05-06,2,1595,11034386.67


In [None]:
data['travelled_flight_name'].value_counts()

travelled_flight_name
BUDDHA AIR                        629618
SHREE AIRLINES                    200980
YETI AIRLINES                     170943
SAURYA AIRLINES                    42472
Summit Air                          1917
GUNA AIRLINES                       1388
Nepal Airlines-Domestic Direct       867
TARA AIR                             382
Tara Air                               8
Name: count, dtype: int64