In [101]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Booking records; the file is Latin-1 encoded, so the encoding is stated explicitly.
airline = pd.read_csv(
    "./customer_airways_data.csv",
    encoding="iso-8859-1",
)
# Customer reviews, read with pandas' default encoding.
reviews = pd.read_csv("./cleaned-reviews.csv")


#### Descripción del set de datos. Vemos cuáles son variables numéricas y cuáles categóricas

In [102]:
# Dimensions first, then the per-column dtype and non-null summary.
airline_rows, airline_cols = airline.shape
print((airline_rows, airline_cols))
airline.info()

(50000, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   num_passengers         50000 non-null  int64  
 1   sales_channel          50000 non-null  object 
 2   trip_type              50000 non-null  object 
 3   purchase_lead          50000 non-null  int64  
 4   length_of_stay         50000 non-null  int64  
 5   flight_hour            50000 non-null  int64  
 6   flight_day             50000 non-null  object 
 7   route                  50000 non-null  object 
 8   booking_origin         50000 non-null  object 
 9   wants_extra_baggage    50000 non-null  int64  
 10  wants_preferred_seat   50000 non-null  int64  
 11  wants_in_flight_meals  50000 non-null  int64  
 12  flight_duration        50000 non-null  float64
 13  booking_complete       50000 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory 

#### Veremos si tenemos:
 1. Datos faltantes
 2. Registros repetidos
 3. Outliers
 4. Errores tipográficos

#### 1. Datos faltantes

In [104]:
# Number of missing values in each column.
airline.isna().sum()

num_passengers           0
sales_channel            0
trip_type                0
purchase_lead            0
length_of_stay           0
flight_hour              0
flight_day               0
route                    0
booking_origin           0
wants_extra_baggage      0
wants_preferred_seat     0
wants_in_flight_meals    0
flight_duration          0
booking_complete         0
dtype: int64

#### 2. Registros repetidos

In [105]:
# Tally rows that repeat an earlier row (True) versus first occurrences (False).
airline.duplicated(keep="first").value_counts()


False    49281
True       719
dtype: int64

##### 2.1 Elimino las filas repetidas

In [106]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# The result is rebound instead of using inplace=True, so the frame is never
# mutated behind the back of anything that already references it.
airline = airline.drop_duplicates()
airline.shape

(49281, 14)

In [107]:
# Booking origins ranked from most to least frequent.
# The names are taken straight from the value_counts() index rather than from
# columns produced by reset_index(): those column labels differ between
# pandas 1.x ("index"/"booking_origin") and 2.x ("booking_origin"/"count"),
# which made the previous rename-and-select approach fail with a KeyError.
origin_counts = airline["booking_origin"].value_counts()
origins = pd.Series(origin_counts.index, name="Booking_origin")
# Show the 25 most common origins.
origins.head(25)

0          Australia
1           Malaysia
2        South Korea
3              Japan
4              China
5          Indonesia
6             Taiwan
7           Thailand
8              India
9        New Zealand
10         Singapore
11     United States
12           Vietnam
13             Macau
14         Hong Kong
15       Philippines
16    United Kingdom
17            Brunei
18          Cambodia
19         (not set)
20         Sri Lanka
21             Italy
22            France
23            Canada
24           Germany
Name: Booking_origin, dtype: object

In [108]:
# How many bookings carry the placeholder origin "(not set)".
origin_not_set = airline["booking_origin"] == "(not set)"
airline.loc[origin_not_set, "booking_origin"].count()

78

In [109]:
# Keep only the bookings whose origin is something other than "(not set)".
origin_is_set = airline["booking_origin"] != "(not set)"
airline = airline.loc[origin_is_set, :]
airline.shape

(49203, 14)

#### 3. Outliers

In [110]:
# Summary statistics of the numeric columns (quartiles stated explicitly;
# they are the same ones describe() reports by default).
airline.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,num_passengers,purchase_lead,length_of_stay,flight_hour,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
count,49203.0,49203.0,49203.0,49203.0,49203.0,49203.0,49203.0,49203.0,49203.0
mean,1.590005,84.732293,23.061968,9.070951,0.668293,0.295714,0.426783,7.280812,0.14993
std,1.016291,90.437986,33.847217,5.410752,0.470831,0.456368,0.494615,1.496137,0.357006
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.67,0.0
25%,1.0,21.0,5.0,5.0,0.0,0.0,0.0,5.62,0.0
50%,1.0,51.0,17.0,9.0,1.0,0.0,0.0,7.57,0.0
75%,2.0,115.0,28.0,13.0,1.0,1.0,1.0,8.83,0.0
max,9.0,867.0,778.0,23.0,1.0,1.0,1.0,9.5,1.0


In [111]:
# Frequency of each trip type.
airline.trip_type.value_counts()

RoundTrip     48702
OneWay          385
CircleTrip      116
Name: trip_type, dtype: int64

In [112]:
# Frequency of each flight day.
airline.flight_day.value_counts()

Mon    7974
Wed    7548
Tue    7545
Thu    7310
Fri    6674
Sun    6430
Sat    5722
Name: flight_day, dtype: int64

In [113]:
# Dimensions first, then the per-column dtype and non-null summary.
review_rows, review_cols = reviews.shape
print((review_rows, review_cols))
reviews.info()

(3411, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3411 entries, 0 to 3410
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  3411 non-null   int64 
 1   reviews     3411 non-null   object
 2   rates       3411 non-null   int64 
 3   date        3411 non-null   object
 4   country     3411 non-null   object
 5   verified    3411 non-null   bool  
 6   comments    3411 non-null   object
dtypes: bool(1), int64(2), object(4)
memory usage: 163.3+ KB


#### 1. Datos faltantes

In [115]:
# Total number of missing cells across the whole reviews frame.
reviews.isna().sum().sum()

0

#### 2. Registros repetidos

In [117]:
# Tally rows that repeat an earlier row (True) versus first occurrences (False).
reviews.duplicated(keep="first").value_counts()

False    3411
dtype: int64

#### 3. Outliers

In [119]:
# Summary statistics of the numeric columns (quartiles stated explicitly;
# they are the same ones describe() reports by default).
reviews.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0.1,Unnamed: 0,rates
count,3411.0,3411.0
mean,1705.668133,4.821167
std,985.831888,3.145863
min,0.0,1.0
25%,852.5,2.0
50%,1705.0,4.0
75%,2557.5,8.0
max,3417.0,10.0
