# Data Analysis on Airline Dataset

### Importing required packages

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found anywhere under the input directory.
for root, _subdirs, names in os.walk('../input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)


../input/Airline Dataset.csv
../input/Airline Dataset Updated.csv
../input/Airline Dataset Updated - v2.csv


### Reading the Data

In [2]:
# Load the updated airline CSV; "Departure Date" is parsed into datetimes on read.
df = pd.read_csv(
    "../input/Airline Dataset Updated.csv",
    parse_dates=["Departure Date"],
)

In [3]:
# Resize the frame to roughly `intended_df_size_in_MB`, measured with
# DataFrame.memory_usage (index included, default non-deep accounting):
#   * smaller than the target -> stack whole copies of the frame,
#   * larger than the target  -> keep only the leading fraction of rows.
intended_df_size_in_MB = 256
target_bytes = intended_df_size_in_MB * (2**20)
current_bytes = df.memory_usage(index=True).sum()
factor = target_bytes / current_bytes
copies = int(factor)  # whole number of replicas that fit in the target size
if copies > 0:
    df = pd.concat([df] * copies, ignore_index=True)
else:
    rowCount = int(df.shape[0] * factor)
    df = df.iloc[:rowCount]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2169618 entries, 0 to 2169617
Data columns (total 15 columns):
 #   Column                Dtype         
---  ------                -----         
 0   Passenger ID          object        
 1   First Name            object        
 2   Last Name             object        
 3   Gender                object        
 4   Age                   int64         
 5   Nationality           object        
 6   Airport Name          object        
 7   Airport Country Code  object        
 8   Country Name          object        
 9   Airport Continent     object        
 10  Continents            object        
 11  Departure Date        datetime64[ns]
 12  Arrival Airport       object        
 13  Pilot Name            object        
 14  Flight Status         object        
dtypes: datetime64[ns](1), int64(1), object(13)
memory usage: 248.3+ MB


### Overview of the Data

In [4]:
# Preview the first 11 rows of the frame.
df.head(11)

Unnamed: 0,Passenger ID,First Name,Last Name,Gender,Age,Nationality,Airport Name,Airport Country Code,Country Name,Airport Continent,Continents,Departure Date,Arrival Airport,Pilot Name,Flight Status
0,ABVWIg,Edithe,Leggis,Female,62,Japan,Coldfoot Airport,US,United States,NAM,North America,2022-06-28,CXF,Edithe Leggis,On Time
1,jkXXAX,Elwood,Catt,Male,62,Nicaragua,Kugluktuk Airport,CA,Canada,NAM,North America,2022-12-26,YCO,Elwood Catt,On Time
2,CdUz2g,Darby,Felgate,Male,67,Russia,Grenoble-Isère Airport,FR,France,EU,Europe,2022-01-18,GNB,Darby Felgate,On Time
3,BRS38V,Dominica,Pyle,Female,71,China,Ottawa / Gatineau Airport,CA,Canada,NAM,North America,2022-09-16,YND,Dominica Pyle,Delayed
4,9kvTLo,Bay,Pencost,Male,21,China,Gillespie Field,US,United States,NAM,North America,2022-02-25,SEE,Bay Pencost,On Time
5,nMJKVh,Lora,Durbann,Female,55,Brazil,Coronel Horácio de Mattos Airport,BR,Brazil,SAM,South America,2022-06-10,LEC,Lora Durbann,On Time
6,8IPFPE,Rand,Bram,Male,73,Ivory Coast,Duxford Aerodrome,GB,United Kingdom,EU,Europe,2022-10-30,QFO,Rand Bram,Cancelled
7,pqixbY,Perceval,Dallosso,Male,36,Vietnam,Maestro Wilson Fonseca Airport,BR,Brazil,SAM,South America,2022-04-07,STM,Perceval Dallosso,Cancelled
8,QNAs2R,Aleda,Pigram,Female,35,Palestinian Territory,Venice Marco Polo Airport,IT,Italy,EU,Europe,2022-08-20,VCE,Aleda Pigram,On Time
9,3jmudz,Burlie,Schustl,Male,13,Thailand,Vermilion Airport,CA,Canada,NAM,North America,2022-04-06,YVG,Burlie Schustl,On Time


In [5]:
# Summary of column names, dtypes and memory footprint.
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2169618 entries, 0 to 2169617
Data columns (total 15 columns):
 #   Column                Dtype         
---  ------                -----         
 0   Passenger ID          object        
 1   First Name            object        
 2   Last Name             object        
 3   Gender                object        
 4   Age                   int64         
 5   Nationality           object        
 6   Airport Name          object        
 7   Airport Country Code  object        
 8   Country Name          object        
 9   Airport Continent     object        
 10  Continents            object        
 11  Departure Date        datetime64[ns]
 12  Arrival Airport       object        
 13  Pilot Name            object        
 14  Flight Status         object        
dtypes: datetime64[ns](1), int64(1), object(13)
memory usage: 248.3+ MB


In [6]:
# Number of distinct values in each column.
df.nunique()

Passenger ID            98619
First Name               8437
Last Name               41658
Gender                      2
Age                        90
Nationality               240
Airport Name             9062
Airport Country Code      235
Country Name              235
Airport Continent           6
Continents                  6
Departure Date            364
Arrival Airport          9024
Pilot Name              98610
Flight Status               3
dtype: int64

### Data Preparation

In [7]:
df.isnull().sum().sum()  # total number of null cells in the whole frame; 0 means none are present

0

In [8]:
# Store Gender as a categorical column, then display its distinct values.
df['Gender']=df['Gender'].astype('category')
df['Gender'].unique()

['Female', 'Male']
Categories (2, object): ['Female', 'Male']

In [9]:
# Cast Age to the default integer dtype.
# NOTE(review): df.info() already reports Age as int64, so this cast is
# presumably a no-op here — confirm before relying on it.
df['Age']=df['Age'].astype(int)

### Exploratory Data Analysis

In [10]:
# Names of the low-cardinality columns: those with fewer than 10 distinct values,
# listed in the frame's column order.
unique_counts = df.nunique()
columns = unique_counts[unique_counts < 10].index.tolist()

In [11]:
# Number of rows for each Gender value.
count_1 = df['Gender'].value_counts()
count_1

Male      1091156
Female    1078462
Name: Gender, dtype: int64

In [12]:
# Number of rows for each 'Country Name' value.
count_2 = df['Country Name'].value_counts()

In [13]:
# Turn the per-country counts into a two-column frame with a fresh integer index.
# rename_axis / reset_index(name=...) pin both column names explicitly, so the
# schema does not depend on how the installed pandas labels value_counts() output
# (pandas 2.0 changed the Series name to "count" and named the index after the
# source column).
#   "value"        -> the country label
#   "Country Name" -> the row count for that country; this name matches the schema
#                     this cell yields on pandas < 2.0, kept so existing uses of
#                     df_c keep working.
# NOTE(review): the counts column was presumably meant to be called "count" —
# rename it once every later use of df_c has been checked.
df_c = count_2.rename_axis("value").reset_index(name="Country Name")

In [14]:
# Distinct (Country Name, Continents) combinations, one row per pair.
con = df[['Country Name', 'Continents']].drop_duplicates()
# Disabled merge of the country counts with the continent lookup, kept for reference:
#new_df=pd.merge_ordered(df_c,con, fill_method= 'ffill', on= 'Country Name')

In [15]:
# Per-nationality row counts as a two-column frame, most frequent first
# (value_counts() orders by descending count).
# Column names are pinned explicitly so they do not depend on the pandas version:
#   "index"       -> the nationality label
#   "Nationality" -> the number of rows with that nationality
# This is the schema produced on pandas < 2.0; pandas 2.0 changed how
# value_counts() names its result, which would otherwise silently swap the
# meaning of the "Nationality" column.
nationality = (
    df['Nationality']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='Nationality')
)

In [16]:
# Keep only the first 20 rows of the nationality table.
nationality = nationality.head(20)

In [17]:
# Order the table by its 'Nationality' column, largest first; rebinding the name
# gives the same result as sorting in place.
# NOTE(review): 'Nationality' holds the counts only when the frame was built with
# pandas < 2.0 column naming — confirm against the cell that creates it.
nationality = nationality.sort_values(by='Nationality', ascending=False)

In [18]:
# Per-status row counts as a two-column frame, most frequent status first.
# Column names are pinned explicitly so they do not depend on the pandas version:
#   "index"         -> the flight-status label
#   "Flight Status" -> the number of rows with that status
# This is the schema produced on pandas < 2.0; pandas 2.0 changed how
# value_counts() names its result.
fl_stat = (
    df['Flight Status']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='Flight Status')
)