In [2]:
import pandas as pd
import numpy as np
import json
from docx import Document
import os

In [2]:
def extract_text_from_docx(docx_path):
    """Extract all text from a Word document."""
    doc = Document(docx_path)
    full_text = [para.text for para in doc.paragraphs]
    return '\n'.join(full_text)

def text_to_json(text):
    """Convert text to JSON format, assuming the text is valid JSON."""
    try:
        return json.loads(text)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None

In [3]:
directory_path = r"C:\Users\fatim\OneDrive\Desktop\semester 7\deep\Flights Data\Flights Data"
output_file_path = r"C:\Users\fatim\OneDrive\Desktop\semester 7\deep\assignment 1\aggregated.json"

In [4]:
all_json_data = []

for i in range(1, 73):
    if i == 38:
        continue  # Skip this iteration
    
    docx_path = os.path.join(directory_path, f'{i}.docx')
    
    if not os.path.isfile(docx_path):
        print(f"File not found: {docx_path}")
        continue  # Skip to the next iteration if the file does not exist

    print(f"Processing: {docx_path}")

    try:
        # Extract text from the Word document
        text = extract_text_from_docx(docx_path)

        # Convert extracted text to JSON
        json_data = text_to_json(text)

        # Check if JSON data was successfully created
        if json_data is not None:
            all_json_data.append(json_data)  # Append the JSON data to the list
        else:
            print(f"Failed to convert text to JSON for {docx_path}.")
    except Exception as e:
        print(f"An error occurred while processing {docx_path}: {e}")

# Write all JSON data to a single JSON file
with open(output_file_path, 'w') as f:
    json.dump(all_json_data, f, indent=4)

print(f"All data has been written to {output_file_path}.")


Processing: C:\Users\fatim\OneDrive\Desktop\semester 7\deep\Flights Data\Flights Data\1.docx
Processing: C:\Users\fatim\OneDrive\Desktop\semester 7\deep\Flights Data\Flights Data\2.docx


Processing: C:\Users\fatim\OneDrive\Desktop\semester 7\deep\Flights Data\Flights Data\3.docx
Processing: C:\Users\fatim\OneDrive\Desktop\semester 7\deep\Flights Data\Flights Data\4.docx
Processing: C:\Users\fatim\OneDrive\Desktop\semester 7\deep\Flights Data\Flights Data\5.docx
Processing: C:\Users\fatim\OneDrive\Desktop\semester 7\deep\Flights Data\Flights Data\6.docx
Processing: C:\Users\fatim\OneDrive\Desktop\semester 7\deep\Flights Data\Flights Data\7.docx
Processing: C:\Users\fatim\OneDrive\Desktop\semester 7\deep\Flights Data\Flights Data\8.docx
Processing: C:\Users\fatim\OneDrive\Desktop\semester 7\deep\Flights Data\Flights Data\9.docx
Processing: C:\Users\fatim\OneDrive\Desktop\semester 7\deep\Flights Data\Flights Data\10.docx
Processing: C:\Users\fatim\OneDrive\Desktop\semester 7\deep\Flights Data\Flights Data\11.docx
Processing: C:\Users\fatim\OneDrive\Desktop\semester 7\deep\Flights Data\Flights Data\12.docx
Processing: C:\Users\fatim\OneDrive\Desktop\semester 7\deep\Flights

In [3]:
json_file_path = r"C:\Users\fatim\OneDrive\Desktop\semester 7\deep\assignment 1\aggregated.json"

if not os.path.isfile(json_file_path):
    print(f"File not found: {json_file_path}")
else:
    try:
        # Read JSON data from the file
        with open(json_file_path, 'r') as file:
            data = json.load(file)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [4]:
df_list = []

# Loop over each JSON object
for json_obj in data:
    # Normalize the JSON data
    df = pd.DataFrame(json_obj)
    # Append the resulting DataFrame to the list
    df_list.append(df)

# Concatenate all DataFrames into one
df = pd.concat(df_list, ignore_index=True)

df

Unnamed: 0,type,status,departure,arrival,airline,flight,codeshared
0,departure,active,"{'iataCode': 'lhe', 'icaoCode': 'opla', 'termi...","{'iataCode': 'ist', 'icaoCode': 'ltfm', 'sched...","{'name': 'pakistan international airlines', 'i...","{'number': '5715', 'iataNumber': 'pk5715', 'ic...","{'airline': {'name': 'turkish airlines', 'iata..."
1,departure,active,"{'iataCode': 'lhe', 'icaoCode': 'opla', 'termi...","{'iataCode': 'ist', 'icaoCode': 'ltfm', 'sched...","{'name': 'turkish airlines', 'iataCode': 'tk',...","{'number': '715', 'iataNumber': 'tk715', 'icao...",
2,departure,active,"{'iataCode': 'lhe', 'icaoCode': 'opla', 'termi...","{'iataCode': 'jed', 'icaoCode': 'oejn', 'termi...","{'name': 'pakistan international airlines', 'i...","{'number': '859', 'iataNumber': 'pk859', 'icao...",
3,departure,active,"{'iataCode': 'lhe', 'icaoCode': 'opla', 'termi...","{'iataCode': 'cmb', 'icaoCode': 'vcbi', 'sched...","{'name': 'malaysia airlines', 'iataCode': 'mh'...","{'number': '9032', 'iataNumber': 'mh9032', 'ic...","{'airline': {'name': 'srilankan airlines', 'ia..."
4,departure,active,"{'iataCode': 'lhe', 'icaoCode': 'opla', 'termi...","{'iataCode': 'cmb', 'icaoCode': 'vcbi', 'sched...","{'name': 'srilankan airlines', 'iataCode': 'ul...","{'number': '154', 'iataNumber': 'ul154', 'icao...",
...,...,...,...,...,...,...,...
81387,departure,active,"{'iataCode': 'isb', 'icaoCode': 'opis', 'delay...","{'iataCode': 'doh', 'icaoCode': 'othh', 'bagga...","{'name': 'qatar airways', 'iataCode': 'qr', 'i...","{'number': '633', 'iataNumber': 'qr633', 'icao...",
81388,departure,active,"{'iataCode': 'isb', 'icaoCode': 'opis', 'delay...","{'iataCode': 'ruh', 'icaoCode': 'oerk', 'termi...","{'name': 'flynas', 'iataCode': 'xy', 'icaoCode...","{'number': '316', 'iataNumber': 'xy316', 'icao...",
81389,departure,active,"{'iataCode': 'isb', 'icaoCode': 'opis', 'delay...","{'iataCode': 'kwi', 'icaoCode': 'okkk', 'termi...","{'name': 'kuwait airways', 'iataCode': 'ku', '...","{'number': '206', 'iataNumber': 'ku206', 'icao...",
81390,departure,active,"{'iataCode': 'isb', 'icaoCode': 'opis', 'gate'...","{'iataCode': 'auh', 'icaoCode': 'omaa', 'termi...","{'name': 'klm', 'iataCode': 'kl', 'icaoCode': ...","{'number': '3930', 'iataNumber': 'kl3930', 'ic...","{'airline': {'name': 'etihad airways', 'iataCo..."


In [5]:
df['departure_iata'] = df['departure'].apply(lambda x: x.get('iataCode'))
df['departure_icao'] = df['departure'].apply(lambda x: x.get('icaoCode'))
df['departure_terminal'] = df['departure'].apply(lambda x: x.get('terminal'))
df['departure_delay'] = df['departure'].apply(lambda x: x.get('delay'))
df['departure_scheduledTime'] = df['departure'].apply(lambda x: x.get('scheduledTime'))
df['departure_estimatedTime'] = df['departure'].apply(lambda x: x.get('estimatedTime'))

In [6]:
df['arrival_iata'] = df['arrival'].apply(lambda x: x.get('iataCode'))
df['arrival_icao'] = df['arrival'].apply(lambda x: x.get('icaoCode'))
df['arrival_scheduledTime'] = df['arrival'].apply(lambda x: x.get('scheduledTime'))

In [7]:
df['airline_name'] = df['airline'].apply(lambda x: x.get('name'))
df['airline_iata'] = df['airline'].apply(lambda x: x.get('iataCode'))
df['airline_icao'] = df['airline'].apply(lambda x: x.get('icaoCode'))

df['flight_number'] = df['flight'].apply(lambda x: x.get('number'))
df['flight_iataNumber'] = df['flight'].apply(lambda x: x.get('iataNumber'))
df['flight_icaoNumber'] = df['flight'].apply(lambda x: x.get('icaoNumber'))

In [8]:
df['codeshared_airline'] = df['codeshared'].apply(lambda x: x['airline'].get('name') if pd.notna(x) else np.nan)
df['codeshared_iata'] = df['codeshared'].apply(lambda x: x['airline'].get('iataCode') if pd.notna(x) else np.nan)
df['codeshared_flight_number'] = df['codeshared'].apply(lambda x: x['flight'].get('number') if pd.notna(x) else np.nan)

In [9]:
df = df.drop(['departure', 'arrival', 'airline', 'flight', 'codeshared'], axis=1)

In [10]:
df

Unnamed: 0,type,status,departure_iata,departure_icao,departure_terminal,departure_delay,departure_scheduledTime,departure_estimatedTime,arrival_iata,arrival_icao,arrival_scheduledTime,airline_name,airline_iata,airline_icao,flight_number,flight_iataNumber,flight_icaoNumber,codeshared_airline,codeshared_iata,codeshared_flight_number
0,departure,active,lhe,opla,m,10.0,2023-07-13t06:00:00.000,2023-07-13t06:10:00.000,ist,ltfm,2023-07-13t10:45:00.000,pakistan international airlines,pk,pia,5715,pk5715,pia5715,turkish airlines,tk,715
1,departure,active,lhe,opla,m,10.0,2023-07-13t06:00:00.000,2023-07-13t06:10:00.000,ist,ltfm,2023-07-13t10:45:00.000,turkish airlines,tk,thy,715,tk715,thy715,,,
2,departure,active,lhe,opla,m,20.0,2023-07-13t06:50:00.000,2023-07-13t07:10:00.000,jed,oejn,2023-07-13t10:10:00.000,pakistan international airlines,pk,pia,859,pk859,pia859,,,
3,departure,active,lhe,opla,m,,2023-07-13t07:05:00.000,2023-07-13t07:05:00.000,cmb,vcbi,2023-07-13t11:30:00.000,malaysia airlines,mh,mas,9032,mh9032,mas9032,srilankan airlines,ul,154
4,departure,active,lhe,opla,m,,2023-07-13t07:05:00.000,2023-07-13t07:05:00.000,cmb,vcbi,2023-07-13t11:30:00.000,srilankan airlines,ul,alk,154,ul154,alk154,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81387,departure,active,isb,opis,,17.0,2024-07-10t03:35:00.000,2024-07-10t03:35:00.000,doh,othh,2024-07-10t05:15:00.000,qatar airways,qr,qtr,633,qr633,qtr633,,,
81388,departure,active,isb,opis,,23.0,2024-07-10t04:15:00.000,2024-07-10t04:30:00.000,ruh,oerk,2024-07-10t06:30:00.000,flynas,xy,kne,316,xy316,kne316,,,
81389,departure,active,isb,opis,,4.0,2024-07-10t04:25:00.000,2024-07-10t04:35:00.000,kwi,okkk,2024-07-10t06:25:00.000,kuwait airways,ku,kac,206,ku206,kac206,,,
81390,departure,active,isb,opis,,37.0,2024-07-10t04:35:00.000,2024-07-10t05:12:00.000,auh,omaa,2024-07-10t07:00:00.000,klm,kl,klm,3930,kl3930,klm3930,etihad airways,ey,232


In [11]:
df['codeshared_airline'].fillna('No Codeshare', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['codeshared_airline'].fillna('No Codeshare', inplace=True)


In [12]:
df

Unnamed: 0,type,status,departure_iata,departure_icao,departure_terminal,departure_delay,departure_scheduledTime,departure_estimatedTime,arrival_iata,arrival_icao,arrival_scheduledTime,airline_name,airline_iata,airline_icao,flight_number,flight_iataNumber,flight_icaoNumber,codeshared_airline,codeshared_iata,codeshared_flight_number
0,departure,active,lhe,opla,m,10.0,2023-07-13t06:00:00.000,2023-07-13t06:10:00.000,ist,ltfm,2023-07-13t10:45:00.000,pakistan international airlines,pk,pia,5715,pk5715,pia5715,turkish airlines,tk,715
1,departure,active,lhe,opla,m,10.0,2023-07-13t06:00:00.000,2023-07-13t06:10:00.000,ist,ltfm,2023-07-13t10:45:00.000,turkish airlines,tk,thy,715,tk715,thy715,No Codeshare,,
2,departure,active,lhe,opla,m,20.0,2023-07-13t06:50:00.000,2023-07-13t07:10:00.000,jed,oejn,2023-07-13t10:10:00.000,pakistan international airlines,pk,pia,859,pk859,pia859,No Codeshare,,
3,departure,active,lhe,opla,m,,2023-07-13t07:05:00.000,2023-07-13t07:05:00.000,cmb,vcbi,2023-07-13t11:30:00.000,malaysia airlines,mh,mas,9032,mh9032,mas9032,srilankan airlines,ul,154
4,departure,active,lhe,opla,m,,2023-07-13t07:05:00.000,2023-07-13t07:05:00.000,cmb,vcbi,2023-07-13t11:30:00.000,srilankan airlines,ul,alk,154,ul154,alk154,No Codeshare,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81387,departure,active,isb,opis,,17.0,2024-07-10t03:35:00.000,2024-07-10t03:35:00.000,doh,othh,2024-07-10t05:15:00.000,qatar airways,qr,qtr,633,qr633,qtr633,No Codeshare,,
81388,departure,active,isb,opis,,23.0,2024-07-10t04:15:00.000,2024-07-10t04:30:00.000,ruh,oerk,2024-07-10t06:30:00.000,flynas,xy,kne,316,xy316,kne316,No Codeshare,,
81389,departure,active,isb,opis,,4.0,2024-07-10t04:25:00.000,2024-07-10t04:35:00.000,kwi,okkk,2024-07-10t06:25:00.000,kuwait airways,ku,kac,206,ku206,kac206,No Codeshare,,
81390,departure,active,isb,opis,,37.0,2024-07-10t04:35:00.000,2024-07-10t05:12:00.000,auh,omaa,2024-07-10t07:00:00.000,klm,kl,klm,3930,kl3930,klm3930,etihad airways,ey,232


In [13]:
df['departure_scheduledTime'] = pd.to_datetime(df['departure_scheduledTime'])
df['departure_estimatedTime'] = pd.to_datetime(df['departure_estimatedTime'])
df['arrival_scheduledTime'] = pd.to_datetime(df['arrival_scheduledTime'])

In [14]:
df['departure_delay'].fillna(df['departure_delay'].mean(), inplace=True)
df['delay_time'] = (df['departure_estimatedTime'] - df['departure_scheduledTime']).dt.total_seconds() / 60

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['departure_delay'].fillna(df['departure_delay'].mean(), inplace=True)


In [15]:
df

Unnamed: 0,type,status,departure_iata,departure_icao,departure_terminal,departure_delay,departure_scheduledTime,departure_estimatedTime,arrival_iata,arrival_icao,...,airline_name,airline_iata,airline_icao,flight_number,flight_iataNumber,flight_icaoNumber,codeshared_airline,codeshared_iata,codeshared_flight_number,delay_time
0,departure,active,lhe,opla,m,10.000000,2023-07-13 06:00:00,2023-07-13 06:10:00,ist,ltfm,...,pakistan international airlines,pk,pia,5715,pk5715,pia5715,turkish airlines,tk,715,10.0
1,departure,active,lhe,opla,m,10.000000,2023-07-13 06:00:00,2023-07-13 06:10:00,ist,ltfm,...,turkish airlines,tk,thy,715,tk715,thy715,No Codeshare,,,10.0
2,departure,active,lhe,opla,m,20.000000,2023-07-13 06:50:00,2023-07-13 07:10:00,jed,oejn,...,pakistan international airlines,pk,pia,859,pk859,pia859,No Codeshare,,,20.0
3,departure,active,lhe,opla,m,34.134127,2023-07-13 07:05:00,2023-07-13 07:05:00,cmb,vcbi,...,malaysia airlines,mh,mas,9032,mh9032,mas9032,srilankan airlines,ul,154,0.0
4,departure,active,lhe,opla,m,34.134127,2023-07-13 07:05:00,2023-07-13 07:05:00,cmb,vcbi,...,srilankan airlines,ul,alk,154,ul154,alk154,No Codeshare,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81387,departure,active,isb,opis,,17.000000,2024-07-10 03:35:00,2024-07-10 03:35:00,doh,othh,...,qatar airways,qr,qtr,633,qr633,qtr633,No Codeshare,,,0.0
81388,departure,active,isb,opis,,23.000000,2024-07-10 04:15:00,2024-07-10 04:30:00,ruh,oerk,...,flynas,xy,kne,316,xy316,kne316,No Codeshare,,,15.0
81389,departure,active,isb,opis,,4.000000,2024-07-10 04:25:00,2024-07-10 04:35:00,kwi,okkk,...,kuwait airways,ku,kac,206,ku206,kac206,No Codeshare,,,10.0
81390,departure,active,isb,opis,,37.000000,2024-07-10 04:35:00,2024-07-10 05:12:00,auh,omaa,...,klm,kl,klm,3930,kl3930,klm3930,etihad airways,ey,232,37.0


In [16]:
df_active = df[df['status'] == 'active'].copy()

In [17]:
df_active

Unnamed: 0,type,status,departure_iata,departure_icao,departure_terminal,departure_delay,departure_scheduledTime,departure_estimatedTime,arrival_iata,arrival_icao,...,airline_name,airline_iata,airline_icao,flight_number,flight_iataNumber,flight_icaoNumber,codeshared_airline,codeshared_iata,codeshared_flight_number,delay_time
0,departure,active,lhe,opla,m,10.000000,2023-07-13 06:00:00,2023-07-13 06:10:00,ist,ltfm,...,pakistan international airlines,pk,pia,5715,pk5715,pia5715,turkish airlines,tk,715,10.0
1,departure,active,lhe,opla,m,10.000000,2023-07-13 06:00:00,2023-07-13 06:10:00,ist,ltfm,...,turkish airlines,tk,thy,715,tk715,thy715,No Codeshare,,,10.0
2,departure,active,lhe,opla,m,20.000000,2023-07-13 06:50:00,2023-07-13 07:10:00,jed,oejn,...,pakistan international airlines,pk,pia,859,pk859,pia859,No Codeshare,,,20.0
3,departure,active,lhe,opla,m,34.134127,2023-07-13 07:05:00,2023-07-13 07:05:00,cmb,vcbi,...,malaysia airlines,mh,mas,9032,mh9032,mas9032,srilankan airlines,ul,154,0.0
4,departure,active,lhe,opla,m,34.134127,2023-07-13 07:05:00,2023-07-13 07:05:00,cmb,vcbi,...,srilankan airlines,ul,alk,154,ul154,alk154,No Codeshare,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81387,departure,active,isb,opis,,17.000000,2024-07-10 03:35:00,2024-07-10 03:35:00,doh,othh,...,qatar airways,qr,qtr,633,qr633,qtr633,No Codeshare,,,0.0
81388,departure,active,isb,opis,,23.000000,2024-07-10 04:15:00,2024-07-10 04:30:00,ruh,oerk,...,flynas,xy,kne,316,xy316,kne316,No Codeshare,,,15.0
81389,departure,active,isb,opis,,4.000000,2024-07-10 04:25:00,2024-07-10 04:35:00,kwi,okkk,...,kuwait airways,ku,kac,206,ku206,kac206,No Codeshare,,,10.0
81390,departure,active,isb,opis,,37.000000,2024-07-10 04:35:00,2024-07-10 05:12:00,auh,omaa,...,klm,kl,klm,3930,kl3930,klm3930,etihad airways,ey,232,37.0


In [18]:
df_active

Unnamed: 0,type,status,departure_iata,departure_icao,departure_terminal,departure_delay,departure_scheduledTime,departure_estimatedTime,arrival_iata,arrival_icao,...,airline_name,airline_iata,airline_icao,flight_number,flight_iataNumber,flight_icaoNumber,codeshared_airline,codeshared_iata,codeshared_flight_number,delay_time
0,departure,active,lhe,opla,m,10.000000,2023-07-13 06:00:00,2023-07-13 06:10:00,ist,ltfm,...,pakistan international airlines,pk,pia,5715,pk5715,pia5715,turkish airlines,tk,715,10.0
1,departure,active,lhe,opla,m,10.000000,2023-07-13 06:00:00,2023-07-13 06:10:00,ist,ltfm,...,turkish airlines,tk,thy,715,tk715,thy715,No Codeshare,,,10.0
2,departure,active,lhe,opla,m,20.000000,2023-07-13 06:50:00,2023-07-13 07:10:00,jed,oejn,...,pakistan international airlines,pk,pia,859,pk859,pia859,No Codeshare,,,20.0
3,departure,active,lhe,opla,m,34.134127,2023-07-13 07:05:00,2023-07-13 07:05:00,cmb,vcbi,...,malaysia airlines,mh,mas,9032,mh9032,mas9032,srilankan airlines,ul,154,0.0
4,departure,active,lhe,opla,m,34.134127,2023-07-13 07:05:00,2023-07-13 07:05:00,cmb,vcbi,...,srilankan airlines,ul,alk,154,ul154,alk154,No Codeshare,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81387,departure,active,isb,opis,,17.000000,2024-07-10 03:35:00,2024-07-10 03:35:00,doh,othh,...,qatar airways,qr,qtr,633,qr633,qtr633,No Codeshare,,,0.0
81388,departure,active,isb,opis,,23.000000,2024-07-10 04:15:00,2024-07-10 04:30:00,ruh,oerk,...,flynas,xy,kne,316,xy316,kne316,No Codeshare,,,15.0
81389,departure,active,isb,opis,,4.000000,2024-07-10 04:25:00,2024-07-10 04:35:00,kwi,okkk,...,kuwait airways,ku,kac,206,ku206,kac206,No Codeshare,,,10.0
81390,departure,active,isb,opis,,37.000000,2024-07-10 04:35:00,2024-07-10 05:12:00,auh,omaa,...,klm,kl,klm,3930,kl3930,klm3930,etihad airways,ey,232,37.0


In [19]:
bin_edges = [-np.inf, 30, 60, 120, 240, 480, 720, 1440, np.inf]  # Adjusted to include 0 delay

# Bin the delay times into 8 categories
df_active['delay_time_bin'] = pd.cut(df['delay_time'], bins=bin_edges, labels=False)

# Step 4: Handle NaN values (for 0 delay_time)
# Assign 0 (first bin) to NaN values, since 0-minute delays should fall into the first bin
df_active['delay_time_bin'].fillna(0, inplace=True)

# Step 5: Verify the results
print(df_active[['departure_scheduledTime', 'departure_estimatedTime', 'delay_time', 'delay_time_bin']].head())

  departure_scheduledTime departure_estimatedTime  delay_time  delay_time_bin
0     2023-07-13 06:00:00     2023-07-13 06:10:00        10.0             0.0
1     2023-07-13 06:00:00     2023-07-13 06:10:00        10.0             0.0
2     2023-07-13 06:50:00     2023-07-13 07:10:00        20.0             0.0
3     2023-07-13 07:05:00     2023-07-13 07:05:00         0.0             0.0
4     2023-07-13 07:05:00     2023-07-13 07:05:00         0.0             0.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_active['delay_time_bin'].fillna(0, inplace=True)


In [20]:
# Print the bin counts to check how many data points are in each bin
bin_counts = df_active['delay_time_bin'].value_counts().sort_index()
print(bin_counts)

delay_time_bin
0.0    68699
1.0     2693
2.0     1892
3.0     1478
4.0      802
5.0      243
6.0       89
7.0        1
Name: count, dtype: int64


In [21]:
df_active['departure_hour'] = df_active['departure_scheduledTime'].dt.hour
df_active['departure_day'] = df_active['departure_scheduledTime'].dt.day
df_active['arrival_hour'] = df_active['arrival_scheduledTime'].dt.hour
df_active['arrival_day'] = df_active['arrival_scheduledTime'].dt.day

In [22]:
# resampling as the data is highly skewed
from sklearn.utils import resample
df_majority = df_active[df_active['delay_time'] == 0]
df_minority = df_active[df_active['delay_time'] != 0]


In [23]:
df_minority_upsampled = resample(df_minority, 
                                 replace=True,  # Sample with replacement
                                 n_samples=len(df_majority),  # Match number of majority samples
                                 random_state=42)  # Set seed for reproducibility

In [24]:
df_balanced = pd.concat([df_majority, df_minority_upsampled])

In [25]:
df_balanced

Unnamed: 0,type,status,departure_iata,departure_icao,departure_terminal,departure_delay,departure_scheduledTime,departure_estimatedTime,arrival_iata,arrival_icao,...,flight_icaoNumber,codeshared_airline,codeshared_iata,codeshared_flight_number,delay_time,delay_time_bin,departure_hour,departure_day,arrival_hour,arrival_day
3,departure,active,lhe,opla,m,34.134127,2023-07-13 07:05:00,2023-07-13 07:05:00,cmb,vcbi,...,mas9032,srilankan airlines,ul,154,0.0,0.0,7,13,11,13
4,departure,active,lhe,opla,m,34.134127,2023-07-13 07:05:00,2023-07-13 07:05:00,cmb,vcbi,...,alk154,No Codeshare,,,0.0,0.0,7,13,11,13
6,departure,active,lhe,opla,m,34.134127,2023-07-13 08:00:00,2023-07-13 08:00:00,kdu,opsd,...,pia6453,No Codeshare,,,0.0,0.0,8,13,9,13
7,departure,active,lhe,opla,m,10.000000,2023-07-13 09:00:00,2023-07-13 09:00:00,khi,opkc,...,abq401,No Codeshare,,,0.0,0.0,9,13,10,13
8,departure,active,lhe,opla,m,34.134127,2023-07-13 09:25:00,2023-07-13 09:25:00,doh,othh,...,ibe7915,qatar airways,qr,629,0.0,0.0,9,13,11,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7434,departure,active,lhe,opla,m,34.134127,2023-11-01 08:35:00,NaT,doh,othh,...,jbu5586,qatar airways,qr,629,,0.0,8,1,10,1
43256,departure,active,khi,opkc,m,18.000000,2024-02-25 13:15:00,2024-02-25 13:25:00,jed,oejn,...,kne638,No Codeshare,,,10.0,0.0,13,25,16,25
32969,departure,active,khi,opkc,m,2.000000,2023-10-17 17:50:00,2023-10-17 17:41:00,cmb,vcbi,...,alk184,No Codeshare,,,-9.0,0.0,17,17,22,17
70669,departure,active,isb,opis,,10.000000,2024-02-07 03:10:00,2024-02-07 03:20:00,doh,othh,...,rwd1572,qatar airways,qr,633,10.0,0.0,3,7,5,7


In [26]:
bin_counts = df_balanced['delay_time_bin'].value_counts().sort_index()
print(bin_counts)

delay_time_bin
0.0    81973
1.0     4401
2.0     3047
3.0     2404
4.0     1254
5.0      369
6.0      156
Name: count, dtype: int64


In [27]:
df_balanced.columns


Index(['type', 'status', 'departure_iata', 'departure_icao',
       'departure_terminal', 'departure_delay', 'departure_scheduledTime',
       'departure_estimatedTime', 'arrival_iata', 'arrival_icao',
       'arrival_scheduledTime', 'airline_name', 'airline_iata', 'airline_icao',
       'flight_number', 'flight_iataNumber', 'flight_icaoNumber',
       'codeshared_airline', 'codeshared_iata', 'codeshared_flight_number',
       'delay_time', 'delay_time_bin', 'departure_hour', 'departure_day',
       'arrival_hour', 'arrival_day'],
      dtype='object')

In [109]:
# categorical_cols = [
#     'arrival_iata', 'arrival_icao', 
#     'airline_name', 'departure_iata', 
#     'departure_icao', 'departure_terminal'
# ]

# # Apply one-hot encoding
# df_encoded = pd.get_dummies(df_balanced, columns=categorical_cols, drop_first=True)

# df_encoded

Unnamed: 0,type,status,departure_delay,departure_scheduledTime,departure_estimatedTime,arrival_scheduledTime,airline_iata,airline_icao,flight_number,flight_iataNumber,...,airline_name_yto cargo airlines,departure_iata_khi,departure_iata_lhe,departure_icao_opkc,departure_icao_opla,departure_terminal_h,departure_terminal_i,departure_terminal_m,departure_terminal_main,departure_terminal_t1
3,departure,active,34.134127,2023-07-13 07:05:00,2023-07-13 07:05:00,2023-07-13 11:30:00,mh,mas,9032,mh9032,...,False,False,True,False,True,False,False,True,False,False
4,departure,active,34.134127,2023-07-13 07:05:00,2023-07-13 07:05:00,2023-07-13 11:30:00,ul,alk,154,ul154,...,False,False,True,False,True,False,False,True,False,False
6,departure,active,34.134127,2023-07-13 08:00:00,2023-07-13 08:00:00,2023-07-13 09:15:00,pk,pia,6453,pk6453,...,False,False,True,False,True,False,False,True,False,False
7,departure,active,10.000000,2023-07-13 09:00:00,2023-07-13 09:00:00,2023-07-13 10:55:00,pa,abq,401,pa401,...,False,False,True,False,True,False,False,True,False,False
8,departure,active,34.134127,2023-07-13 09:25:00,2023-07-13 09:25:00,2023-07-13 11:25:00,ib,ibe,7915,ib7915,...,False,False,True,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7434,departure,active,34.134127,2023-11-01 08:35:00,NaT,2023-11-01 10:45:00,b6,jbu,5586,b65586,...,False,False,True,False,True,False,False,True,False,False
43256,departure,active,18.000000,2024-02-25 13:15:00,2024-02-25 13:25:00,2024-02-25 16:15:00,xy,kne,638,xy638,...,False,True,False,True,False,False,False,True,False,False
32969,departure,active,2.000000,2023-10-17 17:50:00,2023-10-17 17:41:00,2023-10-17 22:00:00,ul,alk,184,ul184,...,False,True,False,True,False,False,False,True,False,False
70669,departure,active,10.000000,2024-02-07 03:10:00,2024-02-07 03:20:00,2024-02-07 05:20:00,wb,rwd,1572,wb1572,...,False,False,False,False,False,False,False,False,False,False


In [125]:
df_balanced

Unnamed: 0,type,status,departure_delay,airline_iata,airline_icao,flight_number,flight_iataNumber,flight_icaoNumber,codeshared_airline,codeshared_iata,...,departure_scheduled_month,departure_scheduled_year,departure_estimated_hour,departure_estimated_day,departure_estimated_month,departure_estimated_year,arrival_scheduled_hour,arrival_scheduled_day,arrival_scheduled_month,arrival_scheduled_year
3,departure,active,34.134127,mh,mas,9032,mh9032,mas9032,srilankan airlines,ul,...,7,2023,7.0,13.0,7.0,2023.0,11,13,7,2023
4,departure,active,34.134127,ul,alk,154,ul154,alk154,No Codeshare,,...,7,2023,7.0,13.0,7.0,2023.0,11,13,7,2023
6,departure,active,34.134127,pk,pia,6453,pk6453,pia6453,No Codeshare,,...,7,2023,8.0,13.0,7.0,2023.0,9,13,7,2023
7,departure,active,10.000000,pa,abq,401,pa401,abq401,No Codeshare,,...,7,2023,9.0,13.0,7.0,2023.0,10,13,7,2023
8,departure,active,34.134127,ib,ibe,7915,ib7915,ibe7915,qatar airways,qr,...,7,2023,9.0,13.0,7.0,2023.0,11,13,7,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7434,departure,active,34.134127,b6,jbu,5586,b65586,jbu5586,qatar airways,qr,...,11,2023,,,,,10,1,11,2023
43256,departure,active,18.000000,xy,kne,638,xy638,kne638,No Codeshare,,...,2,2024,13.0,25.0,2.0,2024.0,16,25,2,2024
32969,departure,active,2.000000,ul,alk,184,ul184,alk184,No Codeshare,,...,10,2023,17.0,17.0,10.0,2023.0,22,17,10,2023
70669,departure,active,10.000000,wb,rwd,1572,wb1572,rwd1572,qatar airways,qr,...,2,2024,3.0,7.0,2.0,2024.0,5,7,2,2024


In [28]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from scipy.sparse import csr_matrix

def preprocess_and_encode_data(df):
    # Convert timestamp columns to datetime objects if not already
    df['departure_scheduledTime'] = pd.to_datetime(df['departure_scheduledTime'])
    df['departure_estimatedTime'] = pd.to_datetime(df['departure_estimatedTime'])
    df['arrival_scheduledTime'] = pd.to_datetime(df['arrival_scheduledTime'])
    
    # Extract features from datetime columns
    df['departure_scheduled_hour'] = df['departure_scheduledTime'].dt.hour
    df['departure_scheduled_day'] = df['departure_scheduledTime'].dt.day
    df['departure_scheduled_month'] = df['departure_scheduledTime'].dt.month
    df['departure_scheduled_year'] = df['departure_scheduledTime'].dt.year
    
    df['departure_estimated_hour'] = df['departure_estimatedTime'].dt.hour
    df['departure_estimated_day'] = df['departure_estimatedTime'].dt.day
    df['departure_estimated_month'] = df['departure_estimatedTime'].dt.month
    df['departure_estimated_year'] = df['departure_estimatedTime'].dt.year
    
    df['arrival_scheduled_hour'] = df['arrival_scheduledTime'].dt.hour
    df['arrival_scheduled_day'] = df['arrival_scheduledTime'].dt.day
    df['arrival_scheduled_month'] = df['arrival_scheduledTime'].dt.month
    df['arrival_scheduled_year'] = df['arrival_scheduledTime'].dt.year

    # Drop original timestamp columns
    df = df.drop(columns=['departure_scheduledTime', 'departure_estimatedTime', 'arrival_scheduledTime'])
    
    # Define categorical and numerical features
    categorical_features = ['departure_iata', 'departure_icao', 'departure_terminal', 'airline_name']
    numerical_features = ['departure_delay', 'departure_hour', 'departure_day', 'arrival_hour', 'arrival_day',
                          'departure_scheduled_hour', 'departure_scheduled_day', 'departure_scheduled_month',
                          'departure_scheduled_year', 'departure_estimated_hour', 'departure_estimated_day',
                          'departure_estimated_month', 'departure_estimated_year', 'arrival_scheduled_hour',
                          'arrival_scheduled_day', 'arrival_scheduled_month', 'arrival_scheduled_year']
    
    # Apply one-hot encoding to categorical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numerical_features),
            ('cat', OneHotEncoder(), categorical_features)
        ]
    )
    
    X = preprocessor.fit_transform(df)
    if isinstance(X, csr_matrix):
        X = X.toarray()
    y = df['delay_time_bin'].values  # Assuming delay_time_bin is the target
    
    return X, y

# Preprocess and encode the data
X, y = preprocess_and_encode_data(df_balanced)
print(X.shape)
print(y.shape)

(93604, 122)
(93604,)


In [46]:
import numpy as np

class SVM:
    def __init__(self, kernel='linear', C=1.0):
        self.kernel_name = kernel
        self.C = C
        self.alpha = None
        self.b = 0
        self.X_train = None
        self.y_train = None
        self.n_samples = 0

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        self.n_samples = X.shape[0]
        K = self._compute_kernel_matrix()
        
        # Initialize alpha values
        self.alpha = np.zeros(self.n_samples)
        
        # Optimization process to find alpha (simplified, replace with actual implementation)
        # For debugging, we'll just use a dummy update
        self.alpha = np.random.rand(self.n_samples)
        
        # Calculate bias
        self.b = np.mean(y - np.dot(self.alpha, K.T))
        print(f'Alpha: {self.alpha}')
        print(f'Bias (b): {self.b}')
    
    def _compute_kernel_matrix(self):
        if self.kernel_name == 'linear':
            K = np.dot(self.X_train, self.X_train.T)
        elif self.kernel_name == 'polynomial':
            # Example polynomial kernel, replace with actual polynomial kernel
            K = (np.dot(self.X_train, self.X_train.T) + 1) ** 2
        elif self.kernel_name == 'rbf':
            # Example RBF kernel, replace with actual RBF kernel
            gamma = 0.1
            K = np.exp(-gamma * np.sum((self.X_train[:, np.newaxis] - self.X_train) ** 2, axis=2))
        else:
            raise ValueError("Unsupported kernel")
        
        print(f'Kernel matrix shape: {K.shape}')
        return K

    def _decision_function(self, X):
        K = self._compute_kernel_matrix_for_predict(X)
        print(f'Kernel matrix for decision function shape: {K.shape}')
        
        decision_values = np.dot((self.alpha * self.y_train), K.T) + self.b
        print(f'Decision values shape: {decision_values.shape}')
        return decision_values

    def _compute_kernel_matrix_for_predict(self, X):
        if self.kernel_name == 'linear':
            K = np.dot(X, self.X_train.T)
        elif self.kernel_name == 'polynomial':
            # Example polynomial kernel, replace with actual polynomial kernel
            K = (np.dot(X, self.X_train.T) + 1) ** 2
        elif self.kernel_name == 'rbf':
            # Example RBF kernel, replace with actual RBF kernel
            gamma = 0.1
            K = np.exp(-gamma * np.sum((X[:, np.newaxis] - self.X_train) ** 2, axis=2))
        else:
            raise ValueError("Unsupported kernel")
        
        print(f'Kernel matrix for predict shape: {K.shape}')
        return K

    def predict(self, X):
        decision_values = self._decision_function(X)
        return np.sign(decision_values)

# Example usage with debug prints
if __name__ == "__main__":
    from sklearn.datasets import make_classification

    # Generating dataset with correct parameters
    X, y = make_classification(
        n_samples=100,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        random_state=42
    )
    
    model = SVM(kernel='linear')
    model.fit(X, y)
    predictions = model.predict(X)
    print(f'Predictions: {predictions}')


Kernel matrix shape: (100, 100)
Alpha: [0.35260035 0.01518578 0.73135007 0.52146898 0.87624438 0.76477462
 0.80837215 0.96326766 0.11334645 0.22743757 0.74682885 0.25700062
 0.81332201 0.39653001 0.49641285 0.95538455 0.07538781 0.73468358
 0.08348436 0.17225161 0.3123429  0.7185369  0.9408897  0.586096
 0.14574158 0.55476489 0.62739576 0.03590247 0.08683068 0.95743059
 0.05016806 0.61053334 0.43455708 0.12021449 0.86769997 0.96884495
 0.77329941 0.21726792 0.55697675 0.34485312 0.05709317 0.05111113
 0.07395709 0.37700632 0.02242246 0.15383402 0.70135647 0.55688257
 0.58639439 0.85160987 0.58558852 0.05005465 0.35114257 0.19778632
 0.32550115 0.33715116 0.15301783 0.50340211 0.40677183 0.08361133
 0.49836233 0.12273928 0.23728122 0.70480756 0.57579962 0.61828594
 0.32651593 0.15843299 0.75149542 0.30297025 0.15940523 0.02294709
 0.73539947 0.76816363 0.52630506 0.77017224 0.16083187 0.66478945
 0.66854252 0.78880487 0.85344826 0.79236036 0.62235573 0.03513934
 0.59424527 0.35935666 0.

In [49]:
class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        self.tree = self._build_tree(np.arange(X.shape[0]), 0)
    
    def _build_tree(self, indices, depth):
        """ Recursively build the decision tree """
        if len(set(self.y_train[indices])) == 1:
            return self.y_train[indices][0]
        
        if self.max_depth is not None and depth >= self.max_depth:
            return self._majority_vote(indices)
        
        best_feature, best_threshold = self._best_split(indices)
        if best_feature is None:
            return self._majority_vote(indices)
        
        left_indices = indices[self.X_train[indices, best_feature] <= best_threshold]
        right_indices = indices[self.X_train[indices, best_feature] > best_threshold]
        
        left_tree = self._build_tree(left_indices, depth + 1)
        right_tree = self._build_tree(right_indices, depth + 1)
        
        return (best_feature, best_threshold, left_tree, right_tree)
    
    def _best_split(self, indices):
        """ Find the best feature and threshold to split on """
        best_feature = None
        best_threshold = None
        best_score = -float('inf')
        
        for feature in range(self.X_train.shape[1]):
            thresholds = np.unique(self.X_train[indices, feature])
            for threshold in thresholds:
                left_indices = indices[self.X_train[indices, feature] <= threshold]
                right_indices = indices[self.X_train[indices, feature] > threshold]
                
                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue
                
                score = self._information_gain(indices, left_indices, right_indices)
                if score > best_score:
                    best_feature = feature
                    best_threshold = threshold
                    best_score = score
        
        return best_feature, best_threshold
    
    def _information_gain(self, parent_indices, left_indices, right_indices):
        """ Compute the information gain from a split """
        parent_entropy = self._entropy(parent_indices)
        left_entropy = self._entropy(left_indices)
        right_entropy = self._entropy(right_indices)
        
        p_left = len(left_indices) / len(parent_indices)
        p_right = len(right_indices) / len(parent_indices)
        
        return parent_entropy - (p_left * left_entropy + p_right * right_entropy)
    
    def _entropy(self, indices):
        """ Compute the entropy of a set of indices """
        labels, counts = np.unique(self.y_train[indices], return_counts=True)
        probabilities = counts / len(indices)
        return -np.sum(probabilities * np.log2(probabilities + 1e-10))
    
    def _majority_vote(self, indices):
        """ Return the most common label """
        labels, counts = np.unique(self.y_train[indices], return_counts=True)
        return labels[np.argmax(counts)]
    
    def predict(self, X):
        """ Predict class labels for samples """
        return np.array([self._predict_single(x) for x in X])
    
    def _predict_single(self, x):
        """ Predict the class for a single sample """
        node = self.tree
        while isinstance(node, tuple):
            feature, threshold, left_tree, right_tree = node
            if x[feature] <= threshold:
                node = left_tree
            else:
                node = right_tree
        return node


In [50]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        self.m, self.n = X.shape
        self.theta = np.zeros(self.n)
        self._gradient_descent()

    def _sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def _gradient_descent(self):
        for _ in range(self.num_iterations):
            predictions = self._sigmoid(np.dot(self.X_train, self.theta))
            errors = predictions - self.y_train
            gradient = np.dot(self.X_train.T, errors) / self.m
            self.theta -= self.learning_rate * gradient
    
    def predict(self, X):
        probabilities = self._sigmoid(np.dot(X, self.theta))
        return np.where(probabilities >= 0.5, 1, 0)


In [51]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Define cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize models
svm_model = SVM(kernel='linear')
dt_model = DecisionTree(max_depth=3)
logistic_model = LogisticRegression()

# Function to perform cross-validation
def cross_validate(model, X, y):
    scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        scores.append(accuracy)
    return scores

# Evaluate each model
svm_scores = cross_validate(svm_model, X, y)
dt_scores = cross_validate(dt_model, X, y)
logistic_scores = cross_validate(logistic_model, X, y)

print(f'SVM Cross-Validation Scores: {svm_scores}')
print(f'Decision Tree Cross-Validation Scores: {dt_scores}')
print(f'Logistic Regression Cross-Validation Scores: {logistic_scores}')
print(f'SVM Average Accuracy: {np.mean(svm_scores)}')
print(f'Decision Tree Average Accuracy: {np.mean(dt_scores)}')
print(f'Logistic Regression Average Accuracy: {np.mean(logistic_scores)}')

Kernel matrix shape: (80, 80)
Alpha: [0.01022548 0.75742951 0.52638824 0.18980576 0.6258888  0.92217508
 0.72339315 0.63921231 0.73573932 0.97000782 0.18986103 0.26119772
 0.76210815 0.14252882 0.0211281  0.03294387 0.12529467 0.40367426
 0.89428979 0.84801523 0.85743049 0.32845632 0.90057014 0.7247638
 0.39287695 0.09429969 0.37574302 0.07075759 0.10161598 0.86382711
 0.6733297  0.60731312 0.06083627 0.77862044 0.6288798  0.04798687
 0.53069457 0.76422253 0.89141683 0.52387641 0.52467062 0.42529503
 0.81355109 0.30236466 0.42831708 0.59252287 0.89612725 0.60497378
 0.77724779 0.8874452  0.1698236  0.91670129 0.38751537 0.21702225
 0.00874195 0.75596702 0.48544706 0.23697548 0.03525398 0.81580853
 0.94125266 0.77045604 0.53208215 0.00184703 0.65467581 0.53529102
 0.65562716 0.79372255 0.91588354 0.0394536  0.3870135  0.4951203
 0.71459178 0.36107194 0.16784248 0.27782153 0.27780446 0.87712709
 0.66488832 0.63701521]
Bias (b): 0.44254080256309225
Kernel matrix for predict shape: (20, 80

In [58]:
class Bagging:
    def __init__(self, base_model_class, n_estimators=10, **base_model_kwargs):
        self.base_model_class = base_model_class
        self.n_estimators = n_estimators
        self.base_model_kwargs = base_model_kwargs
        self.models = []

    def fit(self, X, y):
        self.models = []
        for _ in range(self.n_estimators):
            # Bootstrap sampling
            indices = np.random.choice(len(X), len(X), replace=True)
            X_bootstrap, y_bootstrap = X[indices], y[indices]
            
            model = self.base_model_class(**self.base_model_kwargs)
            model.fit(X_bootstrap, y_bootstrap)
            self.models.append(model)
    
    def predict(self, X):
        # Collect predictions from each model
        predictions = np.zeros((X.shape[0], self.n_estimators))
        for i, model in enumerate(self.models):
            predictions[:, i] = model.predict(X)
        
        # Perform majority voting
        # Convert predictions to integers (assuming binary classification: -1 or 1)
        majority_votes = np.array([np.bincount(row.astype(int) + 1).argmax() - 1 for row in predictions])
        return majority_votes


In [59]:
# Create Bagging models for each base model with appropriate parameters
bagging_svm = Bagging(base_model_class=SVM, n_estimators=10, kernel='linear')
bagging_dt = Bagging(base_model_class=DecisionTree, n_estimators=10, max_depth=3)
bagging_logistic = Bagging(base_model_class=LogisticRegression, n_estimators=10, learning_rate=0.01, num_iterations=1000)

# Fit each model
bagging_svm.fit(X, y)
bagging_dt.fit(X, y)
bagging_logistic.fit(X, y)

# Collect predictions from each model
svm_predictions = bagging_svm.predict(X)
dt_predictions = bagging_dt.predict(X)
logistic_predictions = bagging_logistic.predict(X)

# Stack predictions for majority voting
stacked_predictions = np.stack([svm_predictions, dt_predictions, logistic_predictions], axis=1)

# Perform majority voting
final_predictions = np.array([np.bincount(row.astype(int) + 1).argmax() - 1 for row in stacked_predictions])

# Calculate accuracy
final_accuracy = accuracy_score(y, final_predictions)
print(f'Final Accuracy with Bagging and Majority Voting: {final_accuracy}')


Kernel matrix shape: (100, 100)
Alpha: [0.23110147 0.92053329 0.91049259 0.58881542 0.16948962 0.59904341
 0.76125204 0.2108592  0.75242241 0.46175828 0.06044441 0.87935118
 0.36305797 0.25244012 0.71760134 0.69250077 0.50768912 0.07194543
 0.1732385  0.88080528 0.1686562  0.16200556 0.45206397 0.35155065
 0.45484116 0.36776702 0.5044781  0.67681274 0.91748797 0.02310565
 0.87714401 0.35530799 0.9546889  0.79779477 0.75710936 0.47681019
 0.87370677 0.22625249 0.76961708 0.6872328  0.6307632  0.14629853
 0.11156968 0.29663087 0.22912714 0.65443202 0.32373906 0.05699055
 0.92614455 0.88031972 0.51266952 0.37704174 0.52604625 0.72767372
 0.28616768 0.64599995 0.69038758 0.80427977 0.77621602 0.06897441
 0.17331294 0.30467238 0.06808912 0.15748431 0.09458673 0.77054457
 0.85721475 0.89817876 0.80430915 0.95985607 0.50956154 0.67522544
 0.53797285 0.25530781 0.18686925 0.10236191 0.41698218 0.77878324
 0.44232586 0.29310048 0.02839463 0.13897989 0.44738106 0.42721459
 0.92533242 0.75542127 