In [4]:
import pandas as pd

In [5]:
train_df = pd.read_excel('../../Data/Data_Train.xlsx')
train_df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [6]:
import time

In [7]:
def datetime_converter(dj, at, dt):
    
    train_df['Date'] = pd.to_datetime(dj, format='%d/%m/%Y')
    train_df['Dep_T'] = pd.to_datetime(dt, format='%H:%M')

    # Attempt to parse Arrival_Time with both formats
    for i, time_str in enumerate(at):
        try:
            train_df.loc[i, 'Arrival_T'] = pd.to_datetime(time_str, format='%H:%M %d %b')
        except ValueError:
            try:
                train_df.loc[i, 'Arrival_T'] = pd.to_datetime(time_str, format='%H:%M')
            except ValueError:
                raise ValueError(f"Inconsistent format in Arrival_Time column at position {i}: {time_str}")

    train_df['Dep_T'] = pd.to_datetime(train_df['Date'].dt.date.astype(str) + ' ' + train_df['Dep_T'].dt.time.astype(str))
    train_df['Arrival_T'] = pd.to_datetime(train_df['Date'].dt.date.astype(str) + ' ' + train_df['Arrival_T'].dt.time.astype(str))

    return train_df[['Date', 'Arrival_T', 'Dep_T']]

In [8]:
train_df.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price'],
      dtype='object')

In [9]:
datetime_converter(train_df['Date_of_Journey'], train_df['Arrival_Time'], train_df['Dep_Time'])

Unnamed: 0,Date,Arrival_T,Dep_T
0,2019-03-24,2019-03-24 01:10:00,2019-03-24 22:20:00
1,2019-05-01,2019-05-01 13:15:00,2019-05-01 05:50:00
2,2019-06-09,2019-06-09 04:25:00,2019-06-09 09:25:00
3,2019-05-12,2019-05-12 23:30:00,2019-05-12 18:05:00
4,2019-03-01,2019-03-01 21:35:00,2019-03-01 16:50:00
...,...,...,...
10678,2019-04-09,2019-04-09 22:25:00,2019-04-09 19:55:00
10679,2019-04-27,2019-04-27 23:20:00,2019-04-27 20:45:00
10680,2019-04-27,2019-04-27 11:20:00,2019-04-27 08:20:00
10681,2019-03-01,2019-03-01 14:10:00,2019-03-01 11:30:00


In [10]:
train_df.drop(['Dep_Time', 'Arrival_Time', 'Date_of_Journey'], axis=1, inplace=True)


In [11]:
train_df.columns

Index(['Airline', 'Source', 'Destination', 'Route', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price', 'Date', 'Dep_T', 'Arrival_T'],
      dtype='object')

In [12]:
test_df = pd.read_excel('../../Data/Test_set.xlsx')
test_df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [13]:
test_df.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info'],
      dtype='object')

In [16]:
null_values = train_df.isnull().sum()
null_values

Airline               0
Source                0
Destination           0
Route                 1
Duration              0
Total_Stops           1
Additional_Info       0
Price                 0
Date               8012
Dep_T              8012
Arrival_T             0
dtype: int64

In [17]:
train_df.dropna(inplace=True)

In [22]:
def datetime_converter_test(dj, at, dt):
    
    test_df['Date'] = pd.to_datetime(dj, format='%d/%m/%Y')
    test_df['Dep_T'] = pd.to_datetime(dt, format='%H:%M')

    # Attempt to parse Arrival_Time with both formats
    for i, time_str in enumerate(at):
        try:
            test_df.loc[i, 'Arrival_T'] = pd.to_datetime(time_str, format='%H:%M %d %b')
        except ValueError:
            try:
                test_df.loc[i, 'Arrival_T'] = pd.to_datetime(time_str, format='%H:%M')
            except ValueError:
                raise ValueError(f"Inconsistent format in Arrival_Time column at position {i}: {time_str}")

    test_df['Dep_T'] = pd.to_datetime(test_df['Date'].dt.date.astype(str) + ' ' + test_df['Dep_T'].dt.time.astype(str))
    test_df['Arrival_T'] = pd.to_datetime(test_df['Date'].dt.date.astype(str) + ' ' + test_df['Arrival_T'].dt.time.astype(str))

    return test_df[['Date', 'Arrival_T', 'Dep_T']]

In [23]:
datetime_converter_test(test_df['Date_of_Journey'], test_df['Arrival_Time'], test_df['Dep_Time'])

Unnamed: 0,Date,Arrival_T,Dep_T
0,2019-06-06,2019-06-06 04:25:00,2019-06-06 17:30:00
1,2019-05-12,2019-05-12 10:20:00,2019-05-12 06:20:00
2,2019-05-21,2019-05-21 19:00:00,2019-05-21 19:15:00
3,2019-05-21,2019-05-21 21:00:00,2019-05-21 08:00:00
4,2019-06-24,2019-06-24 02:45:00,2019-06-24 23:55:00
...,...,...,...
2666,2019-06-06,2019-06-06 20:25:00,2019-06-06 20:30:00
2667,2019-03-27,2019-03-27 16:55:00,2019-03-27 14:20:00
2668,2019-03-06,2019-03-06 04:25:00,2019-03-06 21:50:00
2669,2019-03-06,2019-03-06 19:15:00,2019-03-06 04:00:00


In [24]:
test_df.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Date', 'Dep_T', 'Arrival_T'],
      dtype='object')

In [26]:
to_select = ['Date', 'Dep_T', 'Arrival_T']
x=train_df.loc[:, to_select]
y=test_df.loc[:, to_select]

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.30, random_state=42)

In [37]:
x_train = X_train[to_select].values
x_test = X_test[to_select].values
x_train, x_test

(array([['2019-06-12T00:00:00.000000000', '2019-06-12T17:00:00.000000000',
         '2019-06-12T19:45:00.000000000'],
        ['2019-04-01T00:00:00.000000000', '2019-04-01T16:30:00.000000000',
         '2019-04-01T09:20:00.000000000'],
        ['2019-03-09T00:00:00.000000000', '2019-03-09T08:50:00.000000000',
         '2019-03-09T23:55:00.000000000'],
        ...,
        ['2019-06-09T00:00:00.000000000', '2019-06-09T10:00:00.000000000',
         '2019-06-09T05:25:00.000000000'],
        ['2019-06-01T00:00:00.000000000', '2019-06-01T20:55:00.000000000',
         '2019-06-01T04:25:00.000000000'],
        ['2019-03-09T00:00:00.000000000', '2019-03-09T17:10:00.000000000',
         '2019-03-09T01:35:00.000000000']], dtype='datetime64[ns]'),
 array([['2019-05-24T00:00:00.000000000', '2019-05-24T16:30:00.000000000',
         '2019-05-24T23:35:00.000000000'],
        ['2019-05-21T00:00:00.000000000', '2019-05-21T16:30:00.000000000',
         '2019-05-21T23:35:00.000000000'],
        ['2019-03

In [33]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [34]:
model.fit(X_train, y_train)

In [36]:
y_hat = model.predict(x_test)




UFuncTypeError: ufunc 'matmul' did not contain a loop with signature matching types (<class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Float64DType'>) -> None