# <a id='toc1_'></a>[Анализ удовлетворённости клиентов авиакомпании с помощью PySpark](#toc0_)

Цель проекта: определить влияние различных факторов на уровень удавлетворённости клиентов.

[Ссылка на датасет](https://www.kaggle.com/datasets/sjleshrac/airlines-customer-satisfaction/data)

## Загрузка данных

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark import  SparkContext
from pyspark.sql.functions import col, lit
from pyspark.sql import functions as F
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

In [2]:
spark = SparkSession.builder\
        .master("local[*]")\
        .appName('PySpark_air')\
        .getOrCreate()

In [3]:
# чтение CSV файла
data = spark.read.csv(
    'invistico_airline.csv',
    sep=',',
    header=True,
    inferSchema=True
)

In [4]:
data.printSchema()

root
 |-- satisfaction: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Customer Type: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Type of Travel: string (nullable = true)
 |-- Class: string (nullable = true)
 |-- Flight Distance: integer (nullable = true)
 |-- Seat comfort: integer (nullable = true)
 |-- Departure/Arrival time convenient: integer (nullable = true)
 |-- Food and drink: integer (nullable = true)
 |-- Gate location: integer (nullable = true)
 |-- Inflight wifi service: integer (nullable = true)
 |-- Inflight entertainment: integer (nullable = true)
 |-- Online support: integer (nullable = true)
 |-- Ease of Online booking: integer (nullable = true)
 |-- On-board service: integer (nullable = true)
 |-- Leg room service: integer (nullable = true)
 |-- Baggage handling: integer (nullable = true)
 |-- Checkin service: integer (nullable = true)
 |-- Cleanliness: integer (nullable = true)
 |-- Online boarding: integer (nullable = true)

## Проверка данных и предобработка

In [5]:
data = data.select([F.col(x).alias(x.lower()) for x in data.columns])
data = data.select([F.col(x).alias(x.replace(' ', '_')) for x in data.columns])
data = data.select([F.col(x).alias(x.replace('/', '_')) for x in data.columns])
data = data.select([F.col(x).alias(x.replace('-', '_')) for x in data.columns])
data.show(2)

+------------+------+--------------+---+---------------+--------+---------------+------------+---------------------------------+--------------+-------------+---------------------+----------------------+--------------+----------------------+----------------+----------------+----------------+---------------+-----------+---------------+--------------------------+------------------------+
|satisfaction|gender| customer_type|age| type_of_travel|   class|flight_distance|seat_comfort|departure_arrival_time_convenient|food_and_drink|gate_location|inflight_wifi_service|inflight_entertainment|online_support|ease_of_online_booking|on_board_service|leg_room_service|baggage_handling|checkin_service|cleanliness|online_boarding|departure_delay_in_minutes|arrival_delay_in_minutes|
+------------+------+--------------+---+---------------+--------+---------------+------------+---------------------------------+--------------+-------------+---------------------+----------------------+--------------+-------

In [6]:
data.head(3)

[Row(satisfaction='satisfied', gender='Female', customer_type='Loyal Customer', age=65, type_of_travel='Personal Travel', class='Eco', flight_distance=265, seat_comfort=0, departure_arrival_time_convenient=0, food_and_drink=0, gate_location=2, inflight_wifi_service=2, inflight_entertainment=4, online_support=2, ease_of_online_booking=3, on_board_service=3, leg_room_service=0, baggage_handling=3, checkin_service=5, cleanliness=3, online_boarding=2, departure_delay_in_minutes=0, arrival_delay_in_minutes=0),
 Row(satisfaction='satisfied', gender='Male', customer_type='Loyal Customer', age=47, type_of_travel='Personal Travel', class='Business', flight_distance=2464, seat_comfort=0, departure_arrival_time_convenient=0, food_and_drink=0, gate_location=3, inflight_wifi_service=0, inflight_entertainment=2, online_support=2, ease_of_online_booking=3, on_board_service=4, leg_room_service=4, baggage_handling=4, checkin_service=2, cleanliness=3, online_boarding=2, departure_delay_in_minutes=310, a

In [7]:
data.describe()

DataFrame[summary: string, satisfaction: string, gender: string, customer_type: string, age: string, type_of_travel: string, class: string, flight_distance: string, seat_comfort: string, departure_arrival_time_convenient: string, food_and_drink: string, gate_location: string, inflight_wifi_service: string, inflight_entertainment: string, online_support: string, ease_of_online_booking: string, on_board_service: string, leg_room_service: string, baggage_handling: string, checkin_service: string, cleanliness: string, online_boarding: string, departure_delay_in_minutes: string, arrival_delay_in_minutes: string]

In [8]:
data.toPandas().nunique()

satisfaction                            2
gender                                  2
customer_type                           2
age                                    75
type_of_travel                          2
class                                   3
flight_distance                      5398
seat_comfort                            6
departure_arrival_time_convenient       6
food_and_drink                          6
gate_location                           6
inflight_wifi_service                   6
inflight_entertainment                  6
online_support                          6
ease_of_online_booking                  6
on_board_service                        6
leg_room_service                        6
baggage_handling                        5
checkin_service                         6
cleanliness                             6
online_boarding                         6
departure_delay_in_minutes            466
arrival_delay_in_minutes              472
dtype: int64

In [9]:
data.toPandas()['satisfaction'].unique()

array(['satisfied', 'dissatisfied'], dtype=object)

In [10]:
data = data.withColumn('total', data.seat_comfort + data.departure_arrival_time_convenient + \
               data.food_and_drink + data.gate_location + data.inflight_wifi_service + \
               data.inflight_entertainment + data.online_support + data.ease_of_online_booking + \
               data.on_board_service + data.leg_room_service + data.baggage_handling + \
               data.checkin_service + data.cleanliness + data.online_boarding)
data.show(5)

+------------+------+--------------+---+---------------+--------+---------------+------------+---------------------------------+--------------+-------------+---------------------+----------------------+--------------+----------------------+----------------+----------------+----------------+---------------+-----------+---------------+--------------------------+------------------------+-----+
|satisfaction|gender| customer_type|age| type_of_travel|   class|flight_distance|seat_comfort|departure_arrival_time_convenient|food_and_drink|gate_location|inflight_wifi_service|inflight_entertainment|online_support|ease_of_online_booking|on_board_service|leg_room_service|baggage_handling|checkin_service|cleanliness|online_boarding|departure_delay_in_minutes|arrival_delay_in_minutes|total|
+------------+------+--------------+---+---------------+--------+---------------+------------+---------------------------------+--------------+-------------+---------------------+----------------------+----------

In [11]:
data.dtypes

[('satisfaction', 'string'),
 ('gender', 'string'),
 ('customer_type', 'string'),
 ('age', 'int'),
 ('type_of_travel', 'string'),
 ('class', 'string'),
 ('flight_distance', 'int'),
 ('seat_comfort', 'int'),
 ('departure_arrival_time_convenient', 'int'),
 ('food_and_drink', 'int'),
 ('gate_location', 'int'),
 ('inflight_wifi_service', 'int'),
 ('inflight_entertainment', 'int'),
 ('online_support', 'int'),
 ('ease_of_online_booking', 'int'),
 ('on_board_service', 'int'),
 ('leg_room_service', 'int'),
 ('baggage_handling', 'int'),
 ('checkin_service', 'int'),
 ('cleanliness', 'int'),
 ('online_boarding', 'int'),
 ('departure_delay_in_minutes', 'int'),
 ('arrival_delay_in_minutes', 'int'),
 ('total', 'int')]

## Обзор совокупного показателя

In [12]:
data.agg({"total": "max"}).collect()[0][0]

70

In [13]:
data.filter(col('total') == lit(70)).show(5)

+------------+------+--------------+---+---------------+--------+---------------+------------+---------------------------------+--------------+-------------+---------------------+----------------------+--------------+----------------------+----------------+----------------+----------------+---------------+-----------+---------------+--------------------------+------------------------+-----+
|satisfaction|gender| customer_type|age| type_of_travel|   class|flight_distance|seat_comfort|departure_arrival_time_convenient|food_and_drink|gate_location|inflight_wifi_service|inflight_entertainment|online_support|ease_of_online_booking|on_board_service|leg_room_service|baggage_handling|checkin_service|cleanliness|online_boarding|departure_delay_in_minutes|arrival_delay_in_minutes|total|
+------------+------+--------------+---+---------------+--------+---------------+------------+---------------------------------+--------------+-------------+---------------------+----------------------+----------

In [14]:
data.select(F.min(F.col("total")).alias("MIN")).limit(1).collect()[0].MIN

15

In [15]:
data.filter(col('total') < lit(18)).show(50)

+------------+------+-----------------+---+---------------+--------+---------------+------------+---------------------------------+--------------+-------------+---------------------+----------------------+--------------+----------------------+----------------+----------------+----------------+---------------+-----------+---------------+--------------------------+------------------------+-----+
|satisfaction|gender|    customer_type|age| type_of_travel|   class|flight_distance|seat_comfort|departure_arrival_time_convenient|food_and_drink|gate_location|inflight_wifi_service|inflight_entertainment|online_support|ease_of_online_booking|on_board_service|leg_room_service|baggage_handling|checkin_service|cleanliness|online_boarding|departure_delay_in_minutes|arrival_delay_in_minutes|total|
+------------+------+-----------------+---+---------------+--------+---------------+------------+---------------------------------+--------------+-------------+---------------------+----------------------+-

## Средние показатели

In [16]:
all = data.groupBy("satisfaction", "gender", "customer_type", "type_of_travel", "class") \
    .agg(F.avg("total").alias("total"),
         F.avg("seat_comfort").alias("seat_comfort"), 
         F.avg("departure_arrival_time_convenient").alias("departure_arrival_time_convenient"), 
         F.avg("food_and_drink").alias("food_and_drink"),
         F.avg("gate_location").alias("gate_location"), 
         F.avg("inflight_wifi_service").alias("inflight_wifi_service"), 
         F.avg("inflight_entertainment").alias("inflight_entertainment"), 
         F.avg("online_support").alias("online_support"), 
         F.avg("ease_of_online_booking").alias("ease_of_online_booking"), 
         F.avg("on_board_service").alias("on_board_service"), 
         F.avg("leg_room_service").alias("leg_room_service"), 
         F.avg("baggage_handling").alias("baggage_handling"), 
         F.avg("checkin_service").alias("checkin_service"), 
         F.avg("cleanliness").alias("cleanliness"), 
         F.avg("online_boarding").alias("online_boarding")
      ).orderBy(
          ["satisfaction", "customer_type", "gender", "type_of_travel", "class"], 
          ascending=False).show(43, truncate=True)

+------------+------+-----------------+---------------+--------+------------------+------------------+---------------------------------+------------------+------------------+---------------------+----------------------+------------------+----------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|satisfaction|gender|    customer_type| type_of_travel|   class|             total|      seat_comfort|departure_arrival_time_convenient|    food_and_drink|     gate_location|inflight_wifi_service|inflight_entertainment|    online_support|ease_of_online_booking|  on_board_service|  leg_room_service|  baggage_handling|   checkin_service|       cleanliness|   online_boarding|
+------------+------+-----------------+---------------+--------+------------------+------------------+---------------------------------+------------------+------------------+---------------------+----------------------+------------------+----------

In [17]:
data.groupBy("satisfaction", "gender", "customer_type") \
    .agg(F.avg("total").alias("total"),
         F.avg("seat_comfort").alias("seat_comfort"), 
         F.avg("departure_arrival_time_convenient").alias("departure_arrival_time_convenient"), 
         F.avg("food_and_drink").alias("food_and_drink"),
         F.avg("gate_location").alias("gate_location"), 
         F.avg("inflight_wifi_service").alias("inflight_wifi_service"), 
         F.avg("inflight_entertainment").alias("inflight_entertainment"), 
         F.avg("online_support").alias("online_support"), 
         F.avg("ease_of_online_booking").alias("ease_of_online_booking"), 
         F.avg("on_board_service").alias("on_board_service"), 
         F.avg("leg_room_service").alias("leg_room_service"), 
         F.avg("baggage_handling").alias("baggage_handling"), 
         F.avg("checkin_service").alias("checkin_service"), 
         F.avg("cleanliness").alias("cleanliness"), 
         F.avg("online_boarding").alias("online_boarding")
      ).orderBy(['satisfaction', 'customer_type', 'gender'], 
                  ascending=True).show(truncate=False)

+------------+------+-----------------+------------------+------------------+---------------------------------+------------------+------------------+---------------------+----------------------+------------------+----------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|satisfaction|gender|customer_type    |total             |seat_comfort      |departure_arrival_time_convenient|food_and_drink    |gate_location     |inflight_wifi_service|inflight_entertainment|online_support    |ease_of_online_booking|on_board_service  |leg_room_service  |baggage_handling  |checkin_service   |cleanliness       |online_boarding   |
+------------+------+-----------------+------------------+------------------+---------------------------------+------------------+------------------+---------------------+----------------------+------------------+----------------------+------------------+------------------+------------------+-----

In [18]:
data.groupBy("satisfaction", "type_of_travel", "class") \
    .agg(F.avg("total").alias("total"),
         F.avg("seat_comfort").alias("seat_comfort"), 
         F.avg("departure_arrival_time_convenient").alias("departure_arrival_time_convenient"), 
         F.avg("food_and_drink").alias("food_and_drink"),
         F.avg("gate_location").alias("gate_location"), 
         F.avg("inflight_wifi_service").alias("inflight_wifi_service"), 
         F.avg("inflight_entertainment").alias("inflight_entertainment"), 
         F.avg("online_support").alias("online_support"), 
         F.avg("ease_of_online_booking").alias("ease_of_online_booking"), 
         F.avg("on_board_service").alias("on_board_service"), 
         F.avg("leg_room_service").alias("leg_room_service"), 
         F.avg("baggage_handling").alias("baggage_handling"), 
         F.avg("checkin_service").alias("checkin_service"), 
         F.avg("cleanliness").alias("cleanliness"), 
         F.avg("online_boarding").alias("on_online_boarding")
      ).orderBy(['satisfaction', 'type_of_travel', 'class'], 
                  ascending=True).show(truncate=False)

+------------+---------------+--------+-----------------+------------------+---------------------------------+------------------+------------------+---------------------+----------------------+------------------+----------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|satisfaction|type_of_travel |class   |total            |seat_comfort      |departure_arrival_time_convenient|food_and_drink    |gate_location     |inflight_wifi_service|inflight_entertainment|online_support    |ease_of_online_booking|on_board_service  |leg_room_service  |baggage_handling  |checkin_service   |cleanliness       |on_online_boarding|
+------------+---------------+--------+-----------------+------------------+---------------------------------+------------------+------------------+---------------------+----------------------+------------------+----------------------+------------------+------------------+------------------+--------

## Матрица корреляций

In [19]:
data_new = data[['seat_comfort', 'departure_arrival_time_convenient', \
               'food_and_drink', 'gate_location', 'inflight_wifi_service', \
               'inflight_entertainment', 'online_support', 'ease_of_online_booking', \
               'on_board_service', 'leg_room_service', 'baggage_handling', \
               'checkin_service', 'cleanliness', 'online_boarding', 'total']]

vector_col = 'satisfaction'
assembler = VectorAssembler(inputCols=data_new.columns, outputCol=vector_col)
df_vector = assembler.transform(data_new).select(vector_col)

corr_matrix = Correlation.corr(df_vector, vector_col)

corr_matrix.collect()[0]['pearson({})'.format(vector_col)].values

array([ 1.00000000e+00,  4.34951201e-01,  7.15997131e-01,  4.05299613e-01,
        1.29130660e-01,  4.26447115e-01,  1.20278266e-01,  2.11530683e-01,
        1.21148837e-01,  1.36135696e-01,  1.18488629e-01,  4.12136522e-02,
        1.08475074e-01,  1.30396484e-01,  5.95781118e-01,  4.34951201e-01,
        1.00000000e+00,  5.27381194e-01,  5.44258524e-01, -1.97370209e-03,
        7.72029136e-02, -5.46247124e-04,  1.75548932e-03,  6.04463693e-02,
        2.94218568e-02,  6.73961551e-02,  6.27937697e-02,  6.71257030e-02,
       -6.23459767e-04,  4.30731949e-01,  7.15997131e-01,  5.27381194e-01,
        1.00000000e+00,  5.23327870e-01,  2.59168699e-02,  3.67669101e-01,
        2.85538946e-02,  4.11888960e-02,  3.86460609e-02,  7.41605296e-02,
        3.63700033e-02,  1.18979052e-02,  3.34159032e-02,  1.35871843e-02,
        5.10543574e-01,  4.05299613e-01,  5.44258524e-01,  5.23327870e-01,
        1.00000000e+00, -4.08628356e-03, -2.80114218e-04,  2.90833737e-03,
        1.44187714e-03, -