In [1]:
# Libraries 
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np
# import sys
# import os
# sys.path.append(os.path.abspath(os.path.join('..', 'src'))) 


# Configuration
# -----------------------------------------------------------------------
# Lift the column cap so wide dataframes are rendered without hidden columns.
pd.options.display.max_columns = None


### Phase 1: Exploration and Cleanup

In [2]:
# Pseudo code
# Exploratory Data Analysis
    # create dataframes DONE
    # at dataframe level 
        # print df name DONE
        # visual exploration with .head(), .tail() and sample() DONE
        # dataframe overview with .info() DONE
        # check duplicates with .duplicated().sum DONE
            # create separate dataframe with duplicates if needed DONE
        # check null values .isnull().sum() DONE
        # explore main statistical metrics for numerical and non numerical columns with .describe() and .describe(include = "object")
    # at column level
        # review columns names with .columns DONE
        # review unique values with .unique() DONE
        # review value count with .value_counts() DONE
        # check null values with .isnull().sum() DONE
        # atypical values
    # create function/s for the EDA
# Transformation
# Dataframe union
# Save data into a csv file


In [3]:
# 📂 Open data 

# Each CSV is read into its own dataframe.
df_customer_flights = pd.read_csv("data/customer_flight_activity.csv")
df_customer_loyalty = pd.read_csv("data/customer_loyalty_history.csv")

# Label -> dataframe mapping consumed by the exploration helpers below;
# the labels are what those helpers print as the dataframe name.
dataframes_dict = {
    "df_customer_flight✈️": df_customer_flights,
    "df_customer_loyalty💞": df_customer_loyalty,
}

In [4]:
# ✍️ Exploratory Data Analysis functions definition

def explore_dataframes (df_dict):
    """
    Print and display a dataframe-level Exploratory Data Analysis report.

    For every dataframe in ``df_dict`` the function shows, in order: the
    ``.info()`` summary, the first five rows, the last five rows, a random
    sample of up to five rows, the count and percentage of fully duplicated
    rows (plus the first ten of them when there are any), the number of rows
    and of columns that are entirely null, and the ``.describe()`` metrics
    for numerical columns and for object (categorical) columns.

    Note: ``display`` is not imported in this cell; the function relies on it
    being available in the session, as it is when running under
    IPython/Jupyter.

    Args:
        df_dict (dict): Maps a dataframe name (str) to a pandas DataFrame.
            The name is used to label the output and to build the keys of
            the returned dict.

    Returns:
        dict: One entry per dataframe that contains duplicated rows. The key
        is the original name plus ``"_duplicates"`` and the value is a
        dataframe holding every occurrence of the duplicated rows
        (``keep=False``). Empty when no dataframe has duplicates.
    """
    duplicate_dataframes_dict = {}
    for name, df in df_dict.items():
        df_name = f"'{name.upper().replace('_',' ')}'"
        total_rows = df.shape[0]
        print(f" \n\n----------- DATAFRAME NAME: {df_name} -----------")
        print(f"\n{df_name} ---> Dataframe INFO:\n")
        # info() prints its own report and returns None, so wrapping it in
        # display() only adds a stray "None" to the output.
        df.info()
        print(f"\n{df_name} ---> FIRST FIVE (5) ROWS:")
        display(df.head())
        print(f"\n{df_name} ---> LAST FIVE (5) ROWS:")
        display(df.tail())
        print(f"\n{df_name} ---> SAMPLE (5) ROWS:")
        # sample(5) raises on dataframes with fewer than five rows.
        display(df.sample(min(5, total_rows)))
        # Count the duplicates once instead of recomputing them for every use.
        duplicate_count = int(df.duplicated().sum())
        # Guard the percentage against a dataframe with no rows.
        duplicate_pct = round(duplicate_count / total_rows * 100, 2) if total_rows else 0.0
        print(f" \n{df_name} ---> DUPLICATES COUNT IS: {duplicate_count}, {duplicate_pct}% OVER TOTAL ROWS\n")
        if duplicate_count > 0:
            duplicates_df_name = name + "_duplicates"
            # keep=False keeps every occurrence, not only the repeated ones.
            duplicates_df = df[df.duplicated(keep=False)]
            print(f"{df_name} ---> DATAFRAME WITH DUPLICATED ROWS (INCLUDING ALL APPEARANCES):\n")
            display(duplicates_df.head(10))
            duplicate_dataframes_dict[duplicates_df_name] = duplicates_df
        print(f"\n{df_name} --> COUNT OF ROWS WITH ALL NULL VALUES IS: {df.isnull().all(axis=1).sum()}\n")
        print(f"\n{df_name} --> COUNT OF COLUMNS WITH ALL NULL VALUES IS: {df.isnull().all().sum()}\n")
        print(F"\n{df_name} --> STATISTICAL METRICS FOR NUMERICAL COLUMNS:")
        display(df.describe().T)
        print(F"\n{df_name} --> STATISTICAL METRICS FOR CATEGORICAL COLUMNS:")
        try:
            categorical_metrics = df.describe(include="object").T
        except ValueError:
            # pandas raises ValueError when no column matches the requested
            # dtype; any other error is left to propagate instead of being
            # silently swallowed.
            print("\nUPS... IT SEEMS LIKE THERE ARE NO COLUMNS WITH CATEGORICATL DATA")
        else:
            display(categorical_metrics)
    return duplicate_dataframes_dict

def explore_columns (df_dict):
    """
    Print a column-level Exploratory Data Analysis report.

    For every column of every dataframe in ``df_dict`` the function prints
    the unique values, the value counts, the number of repeated values, the
    number of nulls and the ``.describe()`` summary, labelled as numeric or
    categorical according to the column dtype.

    Note: ``display`` is not imported in this cell; the function relies on it
    being available in the session, as it is when running under
    IPython/Jupyter.

    Args:
        df_dict (dict): Maps a dataframe name (str) to a pandas DataFrame.
            The name is only used to label the output.

    Returns:
        None
    """
    for name, df in df_dict.items():
        df_name = f"'{name.upper().replace('_',' ')}'"
        print(f" \n\n----------- DATAFRAME NAME: {df_name} -----------")
        for index, column in enumerate(df.columns):
            column_data = df[column]
            # str() keeps the header working for non-string column labels.
            print (f"\n{index}) Column {str(column).upper()} (from {(df_name)} dataframe):")
            print ("\n>>> UNIQUE VALUES:")
            print (column_data.unique())
            print ("\n>>> VALUES COUNT:")
            print (column_data.value_counts())
            print ("\n>>> COUNT OF DUPLICATES IN THE COLUMN:")
            print (df.duplicated(subset=[column]).sum())
            print ("\n>>> COUNT OF NULL VALUES IN THE COLUMN:")
            print (column_data.isnull().sum())
            # Recognise every numeric dtype (int32, float32, nullable Int64, ...)
            # rather than only 'int64'/'float64'. Booleans are excluded because
            # describe() summarises them like a categorical column.
            is_numeric = (
                pd.api.types.is_numeric_dtype(column_data)
                and not pd.api.types.is_bool_dtype(column_data)
            )
            if is_numeric:
                print("\nSTATISTICAL DESCRIPTION (NUMERIC):")
            else:
                print("\nSTATISTICAL DESCRIPTION (CATEGORICAL):")
            # Series.describe() picks the summary from the dtype on its own;
            # the `include` argument is ignored for a Series.
            display(column_data.describe())
            print ("--------")

In [5]:
# ▶️ Exploratory Data Analysis code execution

# Keep the returned dict: it holds, per dataframe, the rows that appear more
# than once, so they stay available for inspection after the report is printed.
duplicate_dataframes_dict = explore_dataframes(dataframes_dict)

explore_columns(dataframes_dict)


 

----------- DATAFRAME NAME: 'DF CUSTOMER FLIGHT✈️' -----------

'DF CUSTOMER FLIGHT✈️' ---> Dataframe INFO:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405624 entries, 0 to 405623
Data columns (total 10 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Loyalty Number               405624 non-null  int64  
 1   Year                         405624 non-null  int64  
 2   Month                        405624 non-null  int64  
 3   Flights Booked               405624 non-null  int64  
 4   Flights with Companions      405624 non-null  int64  
 5   Total Flights                405624 non-null  int64  
 6   Distance                     405624 non-null  int64  
 7   Points Accumulated           405624 non-null  float64
 8   Points Redeemed              405624 non-null  int64  
 9   Dollar Cost Points Redeemed  405624 non-null  int64  
dtypes: float64(1), int64(9)
memory usage: 30.9 MB


None


'DF CUSTOMER FLIGHT✈️' ---> FIRST FIVE (5) ROWS:


Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
0,100018,2017,1,3,0,3,1521,152.0,0,0
1,100102,2017,1,10,4,14,2030,203.0,0,0
2,100140,2017,1,6,0,6,1200,120.0,0,0
3,100214,2017,1,0,0,0,0,0.0,0,0
4,100272,2017,1,0,0,0,0,0.0,0,0



'DF CUSTOMER FLIGHT✈️' ---> LAST FIVE (5) ROWS:


Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
405619,999902,2018,12,0,0,0,0,0.0,0,0
405620,999911,2018,12,0,0,0,0,0.0,0,0
405621,999940,2018,12,3,0,3,1233,123.0,0,0
405622,999982,2018,12,0,0,0,0,0.0,0,0
405623,999986,2018,12,0,0,0,0,0.0,0,0



'DF CUSTOMER FLIGHT✈️' ---> SAMPLE (5) ROWS:


Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
124792,445496,2017,8,0,0,0,0,0.0,0,0
309869,403282,2018,7,12,5,17,3060,306.0,0,0
77393,619537,2017,5,0,0,0,0,0.0,0,0
58399,509965,2017,4,0,0,0,0,0.0,0,0
137344,213086,2017,9,0,0,0,0,0.0,0,0


 
'DF CUSTOMER FLIGHT✈️' ---> DUPLICATES COUNT IS: 1864, 0.46% OVER TOTAL ROWS

'DF CUSTOMER FLIGHT✈️' ---> DATAFRAME WITH DUPLICATED ROWS (INCLUDING ALL APPEARANCES):



Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
41,101902,2017,1,0,0,0,0,0.0,0,0
42,101902,2017,1,0,0,0,0,0.0,0,0
226,112142,2017,1,0,0,0,0,0.0,0,0
227,112142,2017,1,0,0,0,0,0.0,0,0
477,126100,2017,1,0,0,0,0,0.0,0,0
478,126100,2017,1,0,0,0,0,0.0,0,0
566,130331,2017,1,0,0,0,0,0.0,0,0
567,130331,2017,1,0,0,0,0,0.0,0,0
659,135421,2017,1,0,0,0,0,0.0,0,0
660,135421,2017,1,0,0,0,0,0.0,0,0



'DF CUSTOMER FLIGHT✈️' --> COUNT OF ROWS WITH ALL NULL VALUES IS: 0


'DF CUSTOMER FLIGHT✈️' --> COUNT OF COLUMNS WITH ALL NULL VALUES IS: 0


'DF CUSTOMER FLIGHT✈️' --> STATISTICAL METRICS FOR NUMERICAL COLUMNS:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Loyalty Number,405624.0,550037.873084,258935.286969,100018.0,326961.0,550834.0,772194.0,999986.0
Year,405624.0,2017.5,0.500001,2017.0,2017.0,2017.5,2018.0,2018.0
Month,405624.0,6.5,3.452057,1.0,3.75,6.5,9.25,12.0
Flights Booked,405624.0,4.115052,5.225518,0.0,0.0,1.0,8.0,21.0
Flights with Companions,405624.0,1.031805,2.076869,0.0,0.0,0.0,1.0,11.0
Total Flights,405624.0,5.146858,6.521227,0.0,0.0,1.0,10.0,32.0
Distance,405624.0,1208.880059,1433.15532,0.0,0.0,488.0,2336.0,6293.0
Points Accumulated,405624.0,123.692721,146.599831,0.0,0.0,50.0,239.0,676.5
Points Redeemed,405624.0,30.696872,125.486049,0.0,0.0,0.0,0.0,876.0
Dollar Cost Points Redeemed,405624.0,2.484503,10.150038,0.0,0.0,0.0,0.0,71.0



'DF CUSTOMER FLIGHT✈️' --> STATISTICAL METRICS FOR CATEGORICAL COLUMNS:

UPS... IT SEEMS LIKE THERE ARE NO COLUMNS WITH CATEGORICATL DATA
 

----------- DATAFRAME NAME: 'DF CUSTOMER LOYALTY💞' -----------

'DF CUSTOMER LOYALTY💞' ---> Dataframe INFO:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16737 entries, 0 to 16736
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Loyalty Number      16737 non-null  int64  
 1   Country             16737 non-null  object 
 2   Province            16737 non-null  object 
 3   City                16737 non-null  object 
 4   Postal Code         16737 non-null  object 
 5   Gender              16737 non-null  object 
 6   Education           16737 non-null  object 
 7   Salary              12499 non-null  float64
 8   Marital Status      16737 non-null  object 
 9   Loyalty Card        16737 non-null  object 
 10  CLV                 16737 non-null  float64
 11

None


'DF CUSTOMER LOYALTY💞' ---> FIRST FIVE (5) ROWS:


Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
0,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,
1,549612,Canada,Alberta,Edmonton,T3G 6Y6,Male,College,,Divorced,Star,3839.61,Standard,2016,3,,
2,429460,Canada,British Columbia,Vancouver,V6E 3D9,Male,College,,Single,Star,3839.75,Standard,2014,7,2018.0,1.0
3,608370,Canada,Ontario,Toronto,P1W 1K4,Male,College,,Single,Star,3839.75,Standard,2013,2,,
4,530508,Canada,Quebec,Hull,J8Y 3Z5,Male,Bachelor,103495.0,Married,Star,3842.79,Standard,2014,10,,



'DF CUSTOMER LOYALTY💞' ---> LAST FIVE (5) ROWS:


Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
16732,823768,Canada,British Columbia,Vancouver,V6E 3Z3,Female,College,,Married,Star,61850.19,Standard,2012,12,,
16733,680886,Canada,Saskatchewan,Regina,S1J 3C5,Female,Bachelor,89210.0,Married,Star,67907.27,Standard,2014,9,,
16734,776187,Canada,British Columbia,Vancouver,V5R 1W3,Male,College,,Single,Star,74228.52,Standard,2014,3,,
16735,906428,Canada,Yukon,Whitehorse,Y2K 6R0,Male,Bachelor,-57297.0,Married,Star,10018.66,2018 Promotion,2018,4,,
16736,652627,Canada,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,75049.0,Married,Star,83325.38,Standard,2015,12,2016.0,8.0



'DF CUSTOMER LOYALTY💞' ---> SAMPLE (5) ROWS:


Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
10913,643705,Canada,British Columbia,Dawson Creek,U5I 4F1,Female,Bachelor,97363.0,Single,Star,2530.71,Standard,2015,2,,
10718,530517,Canada,Quebec,Montreal,H2Y 2W2,Female,Bachelor,67467.0,Single,Star,2488.24,Standard,2017,5,,
11625,801832,Canada,British Columbia,Vancouver,V5R 1W3,Female,College,,Married,Star,2693.76,Standard,2018,10,,
4599,744135,Canada,British Columbia,Vancouver,V6E 3D9,Female,Bachelor,85944.0,Married,Nova,2935.99,Standard,2016,12,,
10883,421744,Canada,Quebec,Hull,J8Y 3Z5,Female,College,,Married,Star,2522.26,Standard,2015,10,,


 
'DF CUSTOMER LOYALTY💞' ---> DUPLICATES COUNT IS: 0, 0.0% OVER TOTAL ROWS


'DF CUSTOMER LOYALTY💞' --> COUNT OF ROWS WITH ALL NULL VALUES IS: 0


'DF CUSTOMER LOYALTY💞' --> COUNT OF COLUMNS WITH ALL NULL VALUES IS: 0


'DF CUSTOMER LOYALTY💞' --> STATISTICAL METRICS FOR NUMERICAL COLUMNS:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Loyalty Number,16737.0,549735.880445,258912.132453,100018.0,326603.0,550434.0,772019.0,999986.0
Salary,12499.0,79245.609409,35008.297285,-58486.0,59246.5,73455.0,88517.5,407228.0
CLV,16737.0,7988.896536,6860.98228,1898.01,3980.84,5780.18,8940.58,83325.38
Enrollment Year,16737.0,2015.253211,1.979111,2012.0,2014.0,2015.0,2017.0,2018.0
Enrollment Month,16737.0,6.669116,3.398958,1.0,4.0,7.0,10.0,12.0
Cancellation Year,2067.0,2016.503145,1.380743,2013.0,2016.0,2017.0,2018.0,2018.0
Cancellation Month,2067.0,6.962748,3.455297,1.0,4.0,7.0,10.0,12.0



'DF CUSTOMER LOYALTY💞' --> STATISTICAL METRICS FOR CATEGORICAL COLUMNS:


Unnamed: 0,count,unique,top,freq
Country,16737,1,Canada,16737
Province,16737,11,Ontario,5404
City,16737,29,Toronto,3351
Postal Code,16737,55,V6E 3D9,911
Gender,16737,2,Female,8410
Education,16737,5,Bachelor,10475
Marital Status,16737,3,Married,9735
Loyalty Card,16737,3,Star,7637
Enrollment Type,16737,2,Standard,15766


 

----------- DATAFRAME NAME: 'DF CUSTOMER FLIGHT✈️' -----------

0) Column LOYALTY NUMBER (from 'DF CUSTOMER FLIGHT✈️' dataframe):

>>> UNIQUE VALUES:
[100018 100102 100140 ... 999731 999788 999891]

>>> VALUES COUNT:
678205    72
989528    48
373638    48
684889    48
684881    48
          ..
428526    24
428536    24
428565    24
428590    24
999891    24
Name: Loyalty Number, Length: 16737, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
388887

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (NUMERIC):


count    405624.000000
mean     550037.873084
std      258935.286969
min      100018.000000
25%      326961.000000
50%      550834.000000
75%      772194.000000
max      999986.000000
Name: Loyalty Number, dtype: float64

--------

1) Column YEAR (from 'DF CUSTOMER FLIGHT✈️' dataframe):

>>> UNIQUE VALUES:
[2017 2018]

>>> VALUES COUNT:
2017    202812
2018    202812
Name: Year, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
405622

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (NUMERIC):


count    405624.000000
mean       2017.500000
std           0.500001
min        2017.000000
25%        2017.000000
50%        2017.500000
75%        2018.000000
max        2018.000000
Name: Year, dtype: float64

--------

2) Column MONTH (from 'DF CUSTOMER FLIGHT✈️' dataframe):

>>> UNIQUE VALUES:
[ 1  9  2  3 11  4  5  7  6  8 10 12]

>>> VALUES COUNT:
1     33802
9     33802
2     33802
3     33802
11    33802
4     33802
5     33802
7     33802
6     33802
8     33802
10    33802
12    33802
Name: Month, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
405612

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (NUMERIC):


count    405624.000000
mean          6.500000
std           3.452057
min           1.000000
25%           3.750000
50%           6.500000
75%           9.250000
max          12.000000
Name: Month, dtype: float64

--------

3) Column FLIGHTS BOOKED (from 'DF CUSTOMER FLIGHT✈️' dataframe):

>>> UNIQUE VALUES:
[ 3 10  6  0  8 11  9  4  7  5  2  1 12 13 14 16 15 17 18 19 20 21]

>>> VALUES COUNT:
0     197992
3      18228
11     15705
5      15084
7      14561
8      14357
9      14164
6      13847
2      13641
10     13363
4      12821
1      12667
13     10782
12     10382
14      7057
15      6582
16      4354
17      3899
18      2958
19      1496
20      1156
21       528
Name: Flights Booked, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
405602

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (NUMERIC):


count    405624.000000
mean          4.115052
std           5.225518
min           0.000000
25%           0.000000
50%           1.000000
75%           8.000000
max          21.000000
Name: Flights Booked, dtype: float64

--------

4) Column FLIGHTS WITH COMPANIONS (from 'DF CUSTOMER FLIGHT✈️' dataframe):

>>> UNIQUE VALUES:
[ 0  4  7  1  6  3  5  2 10  8  9 11]

>>> VALUES COUNT:
0     296887
2      19272
3      19015
1      17905
4      13665
5      13424
6       9911
7       7089
8       3965
9       2944
10      1071
11       476
Name: Flights with Companions, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
405612

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (NUMERIC):


count    405624.000000
mean          1.031805
std           2.076869
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          11.000000
Name: Flights with Companions, dtype: float64

--------

5) Column TOTAL FLIGHTS (from 'DF CUSTOMER FLIGHT✈️' dataframe):

>>> UNIQUE VALUES:
[ 3 14  6  0 15 11 12 10  8  9  7  5 16  2  1 17 13 22  4 19 18 21 26 20
 23 25 27 24 28 30 29 31 32]

>>> VALUES COUNT:
0     197992
6      15273
10     14393
8      14056
4      13656
2      13032
12     12409
11     12300
9      10688
13     10499
14     10310
7       9873
3       9783
5       9309
15      8544
16      7727
17      6198
1       6106
18      5757
19      4002
20      3417
21      2610
22      1981
23      1616
24      1209
25       900
26       722
27       504
28       306
29       214
30       150
31        61
32        27
Name: Total Flights, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
405591

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (NUMERIC):


count    405624.000000
mean          5.146858
std           6.521227
min           0.000000
25%           0.000000
50%           1.000000
75%          10.000000
max          32.000000
Name: Total Flights, dtype: float64

--------

6) Column DISTANCE (from 'DF CUSTOMER FLIGHT✈️' dataframe):

>>> UNIQUE VALUES:
[1521 2030 1200 ... 1217  617 4135]

>>> VALUES COUNT:
0       197992
2520       410
2880       401
1680       389
2160       365
         ...  
1333         1
3722         1
3985         1
4779         1
4135         1
Name: Distance, Length: 4746, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
400878

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (NUMERIC):


count    405624.000000
mean       1208.880059
std        1433.155320
min           0.000000
25%           0.000000
50%         488.000000
75%        2336.000000
max        6293.000000
Name: Distance, dtype: float64

--------

7) Column POINTS ACCUMULATED (from 'DF CUSTOMER FLIGHT✈️' dataframe):

>>> UNIQUE VALUES:
[152.   203.   120.   ...  18.75 601.   626.  ]

>>> VALUES COUNT:
0.00      197992
180.00       763
270.00       734
288.00       717
189.00       709
           ...  
658.50         1
556.25         1
10.80          1
565.50         1
626.00         1
Name: Points Accumulated, Length: 1549, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
404075

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (NUMERIC):


count    405624.000000
mean        123.692721
std         146.599831
min           0.000000
25%           0.000000
50%          50.000000
75%         239.000000
max         676.500000
Name: Points Accumulated, dtype: float64

--------

8) Column POINTS REDEEMED (from 'DF CUSTOMER FLIGHT✈️' dataframe):

>>> UNIQUE VALUES:
[  0 341 364 310 445 312 343 366 389 292 447 324 456 409 436 327 322 291
 323 300 290 309 325 386 321 363 340 670 443 517 444 328 344 367 313 333
 293 449 297 455 372 356 405 381 466 419 369 352 482 335 329 305 415 396
 317 348 314 334 350 330 318 298 420 336 471 680 441 353 484 301 374 417
 501 299 398 307 368 306 347 439 395 481 337 382 426 373 399 424 326 392
 438 467 480 448 308 400 376 375 460 339 385 611 431 320 362 404 442 410
 361 319 435 414 464 477 315 485 370 421 349 371 416 496 510 667 465 434
 346 487 408 500 360 378 345 358 479 380 411 491 505 446 425 476 393 418
 332 401 454 303 594 506 355 302 403 379 437 561 483 597 391 562 342 407
 490 468 488 457 365 357 463 388 413 351 462 440 493 507 338 377 428 525
 390 473 359 423 519 453 522 429 450 383 469 402 354 422 458 384 656 461
 427 478 523 719 538 515 394 472 433 387 540 530 498 512 452 311 470 541
 331 570 527 658 494 638 48

count    405624.000000
mean         30.696872
std         125.486049
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max         876.000000
Name: Points Redeemed, dtype: float64

--------

9) Column DOLLAR COST POINTS REDEEMED (from 'DF CUSTOMER FLIGHT✈️' dataframe):

>>> UNIQUE VALUES:
[ 0 28 30 25 36 32 24 26 37 33 35 27 31 54 42 29 38 34 39 55 41 49 40 48
 45 53 58 44 43 46 52 47 63 57 62 51 50 64 56 61 65 60 68 59 66 69 67 71
 70]

>>> VALUES COUNT:
0     381443
36      1037
38      1001
40       993
42       983
39       943
44       923
41       913
37       887
43       877
35       852
34       844
32       817
33       800
45       779
30       772
46       753
31       722
47       675
48       638
28       598
29       596
50       595
49       568
52       482
51       435
27       421
54       402
26       379
53       377
56       365
55       343
57       274
58       265
60       260
59       219
25       209
61       175
63       175
62       155
64       152
24       110
66       106
65        90
68        85
67        68
70        38
69        28
71         2
Name: Dollar Cost Points Redeemed, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLU

count    405624.000000
mean          2.484503
std          10.150038
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          71.000000
Name: Dollar Cost Points Redeemed, dtype: float64

--------
 

----------- DATAFRAME NAME: 'DF CUSTOMER LOYALTY💞' -----------

0) Column LOYALTY NUMBER (from 'DF CUSTOMER LOYALTY💞' dataframe):

>>> UNIQUE VALUES:
[480934 549612 429460 ... 776187 906428 652627]

>>> VALUES COUNT:
480934    1
208485    1
455896    1
469781    1
533488    1
         ..
922188    1
780529    1
721438    1
761015    1
652627    1
Name: Loyalty Number, Length: 16737, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
0

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (NUMERIC):


count     16737.000000
mean     549735.880445
std      258912.132453
min      100018.000000
25%      326603.000000
50%      550434.000000
75%      772019.000000
max      999986.000000
Name: Loyalty Number, dtype: float64

--------

1) Column COUNTRY (from 'DF CUSTOMER LOYALTY💞' dataframe):

>>> UNIQUE VALUES:
['Canada']

>>> VALUES COUNT:
Canada    16737
Name: Country, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
16736

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (CATEGORICAL):


count      16737
unique         1
top       Canada
freq       16737
Name: Country, dtype: object

--------

2) Column PROVINCE (from 'DF CUSTOMER LOYALTY💞' dataframe):

>>> UNIQUE VALUES:
['Ontario' 'Alberta' 'British Columbia' 'Quebec' 'Yukon' 'New Brunswick'
 'Manitoba' 'Nova Scotia' 'Saskatchewan' 'Newfoundland'
 'Prince Edward Island']

>>> VALUES COUNT:
Ontario                 5404
British Columbia        4409
Quebec                  3300
Alberta                  969
Manitoba                 658
New Brunswick            636
Nova Scotia              518
Saskatchewan             409
Newfoundland             258
Yukon                    110
Prince Edward Island      66
Name: Province, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
16726

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (CATEGORICAL):


count       16737
unique         11
top       Ontario
freq         5404
Name: Province, dtype: object

--------

3) Column CITY (from 'DF CUSTOMER LOYALTY💞' dataframe):

>>> UNIQUE VALUES:
['Toronto' 'Edmonton' 'Vancouver' 'Hull' 'Whitehorse' 'Trenton' 'Montreal'
 'Dawson Creek' 'Quebec City' 'Fredericton' 'Ottawa' 'Tremblant' 'Calgary'
 'Thunder Bay' 'Whistler' 'Peace River' 'Winnipeg' 'Sudbury'
 'West Vancouver' 'Halifax' 'London' 'Regina' 'Kelowna' "St. John's"
 'Victoria' 'Kingston' 'Banff' 'Moncton' 'Charlottetown']

>>> VALUES COUNT:
Toronto           3351
Vancouver         2582
Montreal          2059
Winnipeg           658
Whistler           582
Halifax            518
Ottawa             509
Trenton            486
Edmonton           486
Quebec City        485
Dawson Creek       444
Fredericton        425
Regina             409
Kingston           401
Tremblant          398
Victoria           389
Hull               358
West Vancouver     324
St. John's         258
Thunder Bay        256
Sudbury            227
Moncton            211
Calgary            191
Banff              179
Londo

count       16737
unique         29
top       Toronto
freq         3351
Name: City, dtype: object

--------

4) Column POSTAL CODE (from 'DF CUSTOMER LOYALTY💞' dataframe):

>>> UNIQUE VALUES:
['M2Z 4K1' 'T3G 6Y6' 'V6E 3D9' 'P1W 1K4' 'J8Y 3Z5' 'Y2K 6R0' 'P5S 6R4'
 'K8V 4B2' 'H2Y 2W2' 'M8Y 4K8' 'U5I 4F1' 'G1B 3L5' 'H4G 3T4' 'M2M 7K8'
 'M2M 6J7' 'E3B 2H2' 'M1R 4K3' 'T9G 1W3' 'H2Y 4R4' 'V5R 1W3' 'P1L 8X8'
 'K1F 2R2' 'H5Y 2S9' 'V1E 4R6' 'H2T 2J6' 'T3E 2V9' 'H2T 9K8' 'K8T 5M5'
 'V6T 1Y8' 'P2T 6G3' 'T9O 2W2' 'V6E 3Z3' 'R6Y 4T5' 'M5V 1G5' 'V6V 8Z3'
 'B3J 9S2' 'M5B 3E4' 'R2C 0M5' 'S6J 3G0' 'M2P 4F6' 'P1J 8T7' 'V09 2E9'
 'A1C 6H9' 'V10 6T5' 'B3C 2M8' 'M9K 2P4' 'T4V 1D4' 'R3R 3T4' 'S1J 3C5'
 'E1A 2A7' 'K1G 4Z0' 'H3T 8L4' 'C1A 6E8' 'H3J 5I6' 'M3R 4K8']

>>> VALUES COUNT:
V6E 3D9    911
V5R 1W3    684
V6T 1Y8    582
V6E 3Z3    544
M2M 7K8    534
P1J 8T7    500
H2T 9K8    499
K8V 4B2    486
G1B 3L5    485
H2T 2J6    446
U5I 4F1    444
V1E 4R6    443
E3B 2H2    425
R2C 0M5    415
M9K 2P4    401
H5Y 2S9    398
V10 6T5    389
K1F 2R2    389
H2Y 2W2    365
J8Y 3Z5    358
M8Y 4K8    340
H4G 3T4    338

count       16737
unique         55
top       V6E 3D9
freq          911
Name: Postal Code, dtype: object

--------

5) Column GENDER (from 'DF CUSTOMER LOYALTY💞' dataframe):

>>> UNIQUE VALUES:
['Female' 'Male']

>>> VALUES COUNT:
Female    8410
Male      8327
Name: Gender, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
16735

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (CATEGORICAL):


count      16737
unique         2
top       Female
freq        8410
Name: Gender, dtype: object

--------

6) Column EDUCATION (from 'DF CUSTOMER LOYALTY💞' dataframe):

>>> UNIQUE VALUES:
['Bachelor' 'College' 'Master' 'High School or Below' 'Doctor']

>>> VALUES COUNT:
Bachelor                10475
College                  4238
High School or Below      782
Doctor                    734
Master                    508
Name: Education, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
16732

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (CATEGORICAL):


count        16737
unique           5
top       Bachelor
freq         10475
Name: Education, dtype: object

--------

7) Column SALARY (from 'DF CUSTOMER LOYALTY💞' dataframe):

>>> UNIQUE VALUES:
[ 83236.     nan 103495. ...  76178.  91970. -57297.]

>>> VALUES COUNT:
 101933.0    23
 62283.0     14
 61809.0     14
 51573.0     14
 64001.0     13
             ..
 95907.0      1
 72440.0      1
 88633.0      1
 100572.0     1
-57297.0      1
Name: Salary, Length: 5890, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
10846

>>> COUNT OF NULL VALUES IN THE COLUMN:
4238

STATISTICAL DESCRIPTION (NUMERIC):


count     12499.000000
mean      79245.609409
std       35008.297285
min      -58486.000000
25%       59246.500000
50%       73455.000000
75%       88517.500000
max      407228.000000
Name: Salary, dtype: float64

--------

8) Column MARITAL STATUS (from 'DF CUSTOMER LOYALTY💞' dataframe):

>>> UNIQUE VALUES:
['Married' 'Divorced' 'Single']

>>> VALUES COUNT:
Married     9735
Single      4484
Divorced    2518
Name: Marital Status, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
16734

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (CATEGORICAL):


count       16737
unique          3
top       Married
freq         9735
Name: Marital Status, dtype: object

--------

9) Column LOYALTY CARD (from 'DF CUSTOMER LOYALTY💞' dataframe):

>>> UNIQUE VALUES:
['Star' 'Aurora' 'Nova']

>>> VALUES COUNT:
Star      7637
Nova      5671
Aurora    3429
Name: Loyalty Card, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
16734

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (CATEGORICAL):


count     16737
unique        3
top        Star
freq       7637
Name: Loyalty Card, dtype: object

--------

10) Column CLV (from 'DF CUSTOMER LOYALTY💞' dataframe):

>>> UNIQUE VALUES:
[ 3839.14  3839.61  3839.75 ... 44771.3  50568.26 61134.68]

>>> VALUES COUNT:
8564.77     13
13024.13    12
3808.12     12
2359.42     12
3731.50     12
            ..
16152.90     1
5204.65      1
5620.59      1
5205.02      1
61134.68     1
Name: CLV, Length: 7984, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
8753

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (NUMERIC):


count    16737.000000
mean      7988.896536
std       6860.982280
min       1898.010000
25%       3980.840000
50%       5780.180000
75%       8940.580000
max      83325.380000
Name: CLV, dtype: float64

--------

11) Column ENROLLMENT TYPE (from 'DF CUSTOMER LOYALTY💞' dataframe):

>>> UNIQUE VALUES:
['Standard' '2018 Promotion']

>>> VALUES COUNT:
Standard          15766
2018 Promotion      971
Name: Enrollment Type, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
16735

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (CATEGORICAL):


count        16737
unique           2
top       Standard
freq         15766
Name: Enrollment Type, dtype: object

--------

12) Column ENROLLMENT YEAR (from 'DF CUSTOMER LOYALTY💞' dataframe):

>>> UNIQUE VALUES:
[2016 2014 2013 2012 2015 2018 2017]

>>> VALUES COUNT:
2018    3010
2017    2487
2016    2456
2013    2397
2014    2370
2015    2331
2012    1686
Name: Enrollment Year, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
16730

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (NUMERIC):


count    16737.000000
mean      2015.253211
std          1.979111
min       2012.000000
25%       2014.000000
50%       2015.000000
75%       2017.000000
max       2018.000000
Name: Enrollment Year, dtype: float64

--------

13) Column ENROLLMENT MONTH (from 'DF CUSTOMER LOYALTY💞' dataframe):

>>> UNIQUE VALUES:
[ 2  3  7 10  5  6 12  1 11  8  4  9]

>>> VALUES COUNT:
5     1503
12    1480
7     1473
11    1446
10    1444
8     1430
6     1412
9     1391
4     1388
3     1358
2     1220
1     1192
Name: Enrollment Month, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
16725

>>> COUNT OF NULL VALUES IN THE COLUMN:
0

STATISTICAL DESCRIPTION (NUMERIC):


count    16737.000000
mean         6.669116
std          3.398958
min          1.000000
25%          4.000000
50%          7.000000
75%         10.000000
max         12.000000
Name: Enrollment Month, dtype: float64

--------

14) Column CANCELLATION YEAR (from 'DF CUSTOMER LOYALTY💞' dataframe):

>>> UNIQUE VALUES:
[  nan 2018. 2015. 2017. 2014. 2016. 2013.]

>>> VALUES COUNT:
2018.0    645
2017.0    506
2016.0    427
2015.0    265
2014.0    181
2013.0     43
Name: Cancellation Year, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
16730

>>> COUNT OF NULL VALUES IN THE COLUMN:
14670

STATISTICAL DESCRIPTION (NUMERIC):


count    2067.000000
mean     2016.503145
std         1.380743
min      2013.000000
25%      2016.000000
50%      2017.000000
75%      2018.000000
max      2018.000000
Name: Cancellation Year, dtype: float64

--------

15) Column CANCELLATION MONTH (from 'DF CUSTOMER LOYALTY💞' dataframe):

>>> UNIQUE VALUES:
[nan  1. 12.  4.  2.  7. 11.  5.  6. 10.  8.  9.  3.]

>>> VALUES COUNT:
12.0    213
11.0    212
8.0     208
7.0     186
10.0    180
9.0     176
6.0     165
1.0     155
3.0     149
5.0     148
2.0     139
4.0     136
Name: Cancellation Month, dtype: int64

>>> COUNT OF DUPLICATES IN THE COLUMN:
16724

>>> COUNT OF NULL VALUES IN THE COLUMN:
14670

STATISTICAL DESCRIPTION (NUMERIC):


count    2067.000000
mean        6.962748
std         3.455297
min         1.000000
25%         4.000000
50%         7.000000
75%        10.000000
max        12.000000
Name: Cancellation Month, dtype: float64

--------


In [6]:
# ✍️ Data transformation (cleaning and union) functions definition

def columns_to_snake_case (df_dict):
    """
    Rename the columns of every dataframe in a dict to snake_case.

    Column names are lower-cased and their spaces replaced by underscores.
    Each dataframe is modified in place and its new columns are printed.

    Args:
        df_dict (dict): maps a display name (str) to a pandas DataFrame.

    Returns:
        None
    """
    for key in df_dict:
        frame = df_dict[key]
        # Human readable label, e.g. "df_customer" -> "'DF CUSTOMER'"
        label = "'" + key.upper().replace('_', ' ') + "'"
        lowered = frame.columns.str.lower()
        frame.columns = lowered.str.replace(' ', '_')
        print(f"\n{label} ---> Dataframe COLUMNS:\n")
        print(frame.columns)

def impute_nulls_as_special_category(df, column_list, category_name):
    """
    Replace null values with a fixed category label in the given columns.

    The dataframe is modified in place. For every processed column the
    resulting unique values are printed; columns that are not present in
    the dataframe are reported and skipped.

    Args:
        df (pd.DataFrame): dataframe to modify.
        column_list (list): names of the columns to impute.
        category_name: value written wherever a null is found.

    Returns:
        None
    """
    for col_name in column_list:
        # Guard clause: report unknown columns and move on
        if col_name not in df.columns:
            print(f"❌ The column '{col_name}' does not exist in the DataFrame.")
            continue
        imputed = df[col_name].fillna(category_name)
        df[col_name] = imputed
        print (f"\nNull values imputed ✅ in column {col_name}.")
        print ("UNIQUE VALUES:")
        print (df[col_name].unique())

def transform_negative_values(df,column_list):
    """
    Replace the values of the given columns with their absolute value.

    The dataframe is modified in place. For every processed column the
    resulting unique values are printed; columns that are not present in
    the dataframe are reported and skipped.

    Args:
        df (pd.DataFrame): dataframe to modify.
        column_list (list): names of the numeric columns to transform.

    Returns:
        None
    """
    for col_name in column_list:
        # Guard clause: report unknown columns and move on
        if col_name not in df.columns:
            print(f"❌ The column '{col_name}' does not exist in the DataFrame.")
            continue
        df[col_name] = df[col_name].abs()
        print (f"\nNegative values transformed ✅ in column {col_name}.")
        print ("UNIQUE VALUES:")
        print (df[col_name].unique())

def impute_nulls_as_median(df, column_list):
    """
    Replace null values with the column median in the given columns.

    The dataframe is modified in place. For every processed column the
    resulting unique values are printed; columns that are not present in
    the dataframe are reported and skipped.

    Args:
        df (pd.DataFrame): dataframe to modify.
        column_list (list): names of the numeric columns to impute.

    Returns:
        None
    """
    for col_name in column_list:
        # Guard clause: report unknown columns and move on
        if col_name not in df.columns:
            print(f"❌ The column '{col_name}' does not exist in the DataFrame.")
            continue
        # The median is computed on the column as it is before imputing
        df[col_name] = df[col_name].fillna(df[col_name].median())
        print (f"\nNull values imputed ✅ in column {col_name}.")
        print ("UNIQUE VALUES:")
        print (df[col_name].unique())

def dfs_left_union (df_left, df_right, on='loyalty_number'):
    """
    Left-join two dataframes and report the shapes involved.

    Every row of ``df_left`` is kept; matching columns from ``df_right`` are
    appended, and rows without a match get nulls in those columns.

    Args:
        df_left (pd.DataFrame): dataframe whose rows are all preserved.
        df_right (pd.DataFrame): dataframe providing the additional columns.
        on (str or list): column name(s) to join on. Must be present in both
            dataframes. Defaults to 'loyalty_number'.

    Returns:
        pd.DataFrame: the merged dataframe.
    """
    print (f"\nDataframe on the left's SHAPE:{df_left.shape}")
    print (f"\nDataframe on the right's SHAPE:{df_right.shape}")
    # Merge the dataframes received as arguments (not the notebook globals),
    # so the function works with any pair of dataframes.
    df_final = df_left.merge(df_right, how='left', on=on)
    print("\nDatafrems were joined succesfully ✅")
    print (f"\nDataframe Final's SHAPE:{df_final.shape}")
    print (f"\nDataframe Final's COLUMNS are:{df_final.columns}")
    return df_final

In [7]:
# ▶️ Data Transformation code execution
# NOTE: the steps below modify the dataframes in place and depend on each other,
# so they must run in this order.

# Rename the columns of every dataframe in dataframes_dict to snake_case format (in place)
columns_to_snake_case(dataframes_dict)

# Delete records from df_customer_flights that are duplicated in all columns, keeping only the first occurrence
df_customer_flights.drop_duplicates(inplace=True)

# Sanity check: after dropping, the count of fully duplicated rows should be 0
print(f"DUPLICATES COUNT IS: {df_customer_flights.duplicated().sum()}")

# In columns "cancellation_year" and "cancellation_month" replace NaN with "Not Cancelled"
# (after this step those columns hold a mix of numeric values and the string label)
columns = ["cancellation_year","cancellation_month"]
category_name = "Not Cancelled"
impute_nulls_as_special_category(df_customer_loyalty,columns,category_name)

# In the salary column, first turn negative values into positive ones and then impute
# nulls with the median, so the median is computed on the corrected values
columns = ["salary"]
transform_negative_values(df_customer_loyalty,columns)
impute_nulls_as_median(df_customer_loyalty, columns)

# Join dataframes: left join that keeps every flight-activity row and adds the loyalty columns
df_final = dfs_left_union(df_customer_flights,df_customer_loyalty)

# Save data into a csv
# NOTE(review): to_csv also writes the row index as an extra unnamed column by default;
# confirm this is intended, otherwise pass index=False
df_final.to_csv('data/customer_data_transformed.csv')


'DF CUSTOMER FLIGHT✈️' ---> Dataframe COLUMNS:

Index(['loyalty_number', 'year', 'month', 'flights_booked',
       'flights_with_companions', 'total_flights', 'distance',
       'points_accumulated', 'points_redeemed', 'dollar_cost_points_redeemed'],
      dtype='object')

'DF CUSTOMER LOYALTY💞' ---> Dataframe COLUMNS:

Index(['loyalty_number', 'country', 'province', 'city', 'postal_code',
       'gender', 'education', 'salary', 'marital_status', 'loyalty_card',
       'clv', 'enrollment_type', 'enrollment_year', 'enrollment_month',
       'cancellation_year', 'cancellation_month'],
      dtype='object')
DUPLICATES COUNT IS: 0

Null values imputed ✅ in column cancellation_year.
UNIQUE VALUES:
['Not Cancelled' 2018.0 2015.0 2017.0 2014.0 2016.0 2013.0]

Null values imputed ✅ in column cancellation_month.
UNIQUE VALUES:
['Not Cancelled' 1.0 12.0 4.0 2.0 7.0 11.0 5.0 6.0 10.0 8.0 9.0 3.0]

Negative values transformed ✅ in column salary.
UNIQUE VALUES:
[ 83236.     nan 103495. ...  76178.

# Loyalty number column analysis

In [10]:


# Count the rows whose loyalty_number already appeared in an earlier row
duplicates = df_customer_flights.duplicated(subset=['loyalty_number']).sum()
print(f"There are {duplicates} duplicates in the column 'loyalty_number', which represent {round(duplicates / len(df_customer_flights) * 100, 2)}% over the total")

# Keep those repeated rows in their own dataframe for visual inspection
df_loyalty_number_duplicates = df_customer_flights.loc[
    lambda flights: flights.duplicated(subset=['loyalty_number'])
]
print("\n>>>DATAFRAME WITH loyalty_number:")
display(df_loyalty_number_duplicates)

# The dataframe is organized as one row per year and per month within a year.
# Since it holds information from 2 years, up to 24 rows per loyalty_number are expected,
# so the analysis focuses only on the loyalty numbers that appear more than 24 times.
# Number of rows for each loyalty_number
loyalty_number_value_counts = df_customer_flights['loyalty_number'].value_counts()
print("\n>>> VALUES COUNT:")
print(loyalty_number_value_counts)

# Loyalty numbers that appear more than 24 times
values_to_keep = loyalty_number_value_counts.loc[lambda counts: counts > 24].index
print("\n>>> VALUES WITH MORE THAN 24 ROWS:")
print(values_to_keep)

There are 387023 duplicates in the column 'loyalty_number', which represent 95.85% over the total

>>>DATAFRAME WITH loyalty_number:


Unnamed: 0,loyalty_number,year,month,flights_booked,flights_with_companions,total_flights,distance,points_accumulated,points_redeemed,dollar_cost_points_redeemed
37,100504,2017,2,0,0,0,0,0.0,0,0
39,100504,2017,3,0,0,0,0,0.0,0,0
88,100504,2017,4,0,0,0,0,0.0,0,0
101,100504,2017,5,0,0,0,0,0.0,0,0
115,106001,2017,1,5,0,5,910,91.0,0,0
...,...,...,...,...,...,...,...,...,...,...
405619,999902,2018,12,0,0,0,0,0.0,0,0
405620,999911,2018,12,0,0,0,0,0.0,0,0
405621,999940,2018,12,3,0,3,1233,123.0,0,0
405622,999982,2018,12,0,0,0,0,0.0,0,0



>>> VALUES COUNT:
974875    48
736504    48
890702    48
617489    48
200519    47
          ..
428402    24
428452    24
428520    24
428523    24
999891    24
Name: loyalty_number, Length: 16737, dtype: int64

>>> VALUES WITH MORE THAN 24 ROWS:
Int64Index([974875, 736504, 890702, 617489, 200519, 528447, 114414, 713132,
            965605, 499874,
            ...
            950304, 750578, 208241, 411030, 106509, 536021, 308480, 831341,
            841296, 746226],
           dtype='int64', length=135)


In [11]:
# Flight-activity rows of the customers with more than 24 rows, ordered so that
# the rows of the same customer and month end up next to each other
filtered_df = (
    df_customer_flights
    .loc[df_customer_flights['loyalty_number'].isin(values_to_keep)]
    .sort_values(by=['loyalty_number', 'year', 'month', 'total_flights'])
)
filtered_df.head(20)

Unnamed: 0,loyalty_number,year,month,flights_booked,flights_with_companions,total_flights,distance,points_accumulated,points_redeemed,dollar_cost_points_redeemed
41,101902,2017,1,0,0,0,0,0.0,0,0
16942,101902,2017,2,0,0,0,0,0.0,0,0
33843,101902,2017,3,0,0,0,0,0.0,0,0
50744,101902,2017,4,4,0,4,1460,146.0,0,0
185796,101902,2017,4,4,4,8,2384,238.0,488,40
67646,101902,2017,5,7,0,7,3318,331.0,0,0
67645,101902,2017,5,9,3,12,2748,274.0,0,0
84547,101902,2017,6,9,0,9,1521,152.0,0,0
84546,101902,2017,6,10,5,15,3015,301.0,0,0
101447,101902,2017,7,0,0,0,0,0.0,0,0


In [12]:
# Loyalty-history rows of the customers with more than 24 flight-activity rows
df_customer_loyalty.loc[df_customer_loyalty['loyalty_number'].isin(values_to_keep)]

Unnamed: 0,loyalty_number,country,province,city,postal_code,gender,education,salary,marital_status,loyalty_card,clv,enrollment_type,enrollment_year,enrollment_month,cancellation_year,cancellation_month
5,193662,Canada,Yukon,Whitehorse,Y2K 6R0,Male,Bachelor,51124.0,Married,Star,3844.57,Standard,2012,5,Not Cancelled,Not Cancelled
123,746226,Canada,British Columbia,Whistler,V6T 1Y8,Female,Bachelor,63501.0,Married,Star,4089.04,Standard,2018,1,Not Cancelled,Not Cancelled
141,279419,Canada,British Columbia,West Vancouver,V6V 8Z3,Female,College,73455.0,Single,Star,4117.37,Standard,2013,7,Not Cancelled,Not Cancelled
161,354438,Canada,Quebec,Montreal,H2T 2J6,Male,College,73455.0,Married,Star,4167.09,Standard,2018,9,Not Cancelled,Not Cancelled
537,411523,Canada,Alberta,Edmonton,T9G 1W3,Male,Bachelor,62761.0,Single,Star,4747.73,Standard,2016,10,Not Cancelled,Not Cancelled
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14233,411030,Canada,Ontario,Toronto,P1J 8T7,Female,College,73455.0,Single,Star,6503.14,Standard,2017,4,Not Cancelled,Not Cancelled
14263,732304,Canada,Quebec,Quebec City,G1B 3L5,Male,Bachelor,82727.0,Married,Nova,5715.79,2018 Promotion,2018,4,Not Cancelled,Not Cancelled
14579,243741,Canada,British Columbia,Vancouver,V6E 3D9,Female,Bachelor,93595.0,Married,Star,7425.85,Standard,2013,4,Not Cancelled,Not Cancelled
14937,897772,Canada,Quebec,Montreal,H2Y 4R4,Male,High School or Below,44490.0,Divorced,Star,8123.96,Standard,2012,9,Not Cancelled,Not Cancelled
