# Test Data

In [1]:
# Import important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the dataset
test_df = pd.read_excel('Test_set.xlsx')

In [3]:
test_df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [4]:
test_df.tail()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
2666,Air India,6/06/2019,Kolkata,Banglore,CCU → DEL → BLR,20:30,20:25 07 Jun,23h 55m,1 stop,No info
2667,IndiGo,27/03/2019,Kolkata,Banglore,CCU → BLR,14:20,16:55,2h 35m,non-stop,No info
2668,Jet Airways,6/03/2019,Delhi,Cochin,DEL → BOM → COK,21:50,04:25 07 Mar,6h 35m,1 stop,No info
2669,Air India,6/03/2019,Delhi,Cochin,DEL → BOM → COK,04:00,19:15,15h 15m,1 stop,No info
2670,Multiple carriers,15/06/2019,Delhi,Cochin,DEL → BOM → COK,04:55,19:15,14h 20m,1 stop,No info


In [5]:
test_df.shape

(2671, 10)

In [6]:
# Preprocessing

print("Test Data Info")
print('-'*75)
print(test_df.info())

print()

print()

print('Null Values :')
print('-'*75)
print(test_df.isna().sum())

# EDA

# Changing Datatypes of 3 columns into datetime format.
def change_into_datetime(col):
    test_df[col] = pd.to_datetime(test_df[col],infer_datetime_format=True)
    
for i in ['Date_of_Journey','Arrival_Time','Dep_Time']:
    change_into_datetime(i)
    
# Date of Journey

# Adding Day and Month column at the end of df
test_df['Day'] = test_df['Date_of_Journey'].dt.day
test_df['Month'] = test_df['Date_of_Journey'].dt.month

# Dropping Date_of_Journey
test_df.drop('Date_of_Journey',axis=1, inplace=True)

# Arrival Time

test_df['Arr_Hour'] = test_df['Arrival_Time'].dt.hour
test_df['Arr_Min'] = test_df['Arrival_Time'].dt.minute

# Dropping the column Arrival Time
test_df.drop('Arrival_Time', axis=1, inplace=True)


# Departure Time

test_df['Dep_Hour'] = test_df['Dep_Time'].dt.hour
test_df['Dep_Min'] = test_df['Dep_Time'].dt.minute

# Dropping the column Arrival Time
test_df.drop('Dep_Time', axis=1, inplace=True)

# Duration

# Assigning New columns to the end of the dataframe
test_df['Duration_Hour'] = test_df['Duration'].str.split(' ').str[0].str.split('h').str[0]
test_df['Duration_Min'] = test_df['Duration'].str.split(' ').str[1].str.split('m').str[0]

# Fill null values with 0
test_df['Duration_Hour'] = test_df['Duration_Hour'].fillna(0)
test_df['Duration_Min'] = test_df['Duration_Min'].fillna(0)

# Fixing the error in the dataset
# test_df[test_df['Duration_Hour']=='5m'] 
test_df.drop(2660, axis=0, inplace=True)

# Changing Data Type to 'int'
test_df['Duration_Hour'] = test_df['Duration_Hour'].astype('int64')
test_df['Duration_Min'] = test_df['Duration_Min'].astype('int64')

test_df.drop('Duration',axis=1, inplace=True)

# Categorical Data

print("Airline")
print("-"*75)
print(test_df["Airline"].value_counts())
Airline = pd.get_dummies(test_df["Airline"], drop_first= True)

print()

print("Source")
print("-"*75)
print(test_df["Source"].value_counts())
Source = pd.get_dummies(test_df["Source"], drop_first= True)

print()

print("Destination")
print("-"*75)
print(test_df["Destination"].value_counts())
Destination = pd.get_dummies(test_df["Destination"], drop_first= True)

# # Additional_Info contains almost 80% no_info
# Route and Total_Stops are related to each other

test_df.drop(['Additional_Info','Route'],axis=1, inplace=True)

# Replace Total Stops 

test_df['Total_Stops'] = test_df['Total_Stops'].map({'non-stop':0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4})


# Concatenate all dfs

final_test_df = pd.concat([test_df, Airline, Source, Destination], axis=1)

final_test_df.drop(['Airline','Source','Destination'],axis=1, inplace=True)

print()
print()

print("Shape of Test Data: ",final_test_df.shape)


Test Data Info
---------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2671 entries, 0 to 2670
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          2671 non-null   object
 1   Date_of_Journey  2671 non-null   object
 2   Source           2671 non-null   object
 3   Destination      2671 non-null   object
 4   Route            2671 non-null   object
 5   Dep_Time         2671 non-null   object
 6   Arrival_Time     2671 non-null   object
 7   Duration         2671 non-null   object
 8   Total_Stops      2671 non-null   object
 9   Additional_Info  2671 non-null   object
dtypes: object(10)
memory usage: 208.8+ KB
None


Null Values :
---------------------------------------------------------------------------
Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time         

  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listl

Airline
---------------------------------------------------------------------------
Jet Airways                          897
IndiGo                               511
Air India                            439
Multiple carriers                    347
SpiceJet                             208
Vistara                              129
Air Asia                              86
GoAir                                 46
Multiple carriers Premium economy      3
Vistara Premium economy                2
Jet Airways Business                   2
Name: Airline, dtype: int64

Source
---------------------------------------------------------------------------
Delhi       1145
Kolkata      710
Banglore     555
Mumbai       185
Chennai       75
Name: Source, dtype: int64

Destination
---------------------------------------------------------------------------
Cochin       1145
Banglore      710
Delhi         317
New Delhi     238
Hyderabad     185
Kolkata        75
Name: Destination, dtype: int64


Shape of T

In [7]:
final_test_df.head()

Unnamed: 0,Total_Stops,Day,Month,Arr_Hour,Arr_Min,Dep_Hour,Dep_Min,Duration_Hour,Duration_Min,Air India,...,Vistara Premium economy,Chennai,Delhi,Kolkata,Mumbai,Cochin,Delhi.1,Hyderabad,Kolkata.1,New Delhi
0,1,6,6,4,25,17,30,10,55,0,...,0,0,1,0,0,1,0,0,0,0
1,1,5,12,10,20,6,20,4,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,21,5,19,0,19,15,23,45,0,...,0,0,1,0,0,1,0,0,0,0
3,1,21,5,21,0,8,0,13,0,0,...,0,0,1,0,0,1,0,0,0,0
4,0,24,6,2,45,23,55,2,50,0,...,0,0,0,0,0,0,1,0,0,0


## Saving the final test df into CSV file

In [8]:
final_test_df.to_csv('final_test_df.csv', index=False)