# Testing single .docx file conversion

In [3]:
import json
import pandas as pd
from docx import Document

# Function to extract JSON text from a .docx file
def extract_json_from_docx(file_path):
    """Return the text of every paragraph in a .docx file as one string.

    Parameters:
        file_path: Path passed to ``Document`` to open the file.

    Returns:
        str: The paragraph texts in document order, joined by newline
        characters.
    """
    paragraphs = Document(file_path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

# Function to convert JSON string to pandas DataFrame
def json_to_dataframe(json_str):
    try:
        data = json.loads(json_str)
    except json.JSONDecodeError as e:
        print("Error decoding JSON:", e)
        return None
    df = pd.json_normalize(data)
    return df

# Specify the path to your .docx file
docx_file_path = 'ML-Proj-Dataset/Train/15.docx'

# Extract JSON string from the .docx file
json_data = extract_json_from_docx(docx_file_path)

# Convert JSON string to DataFrame
df = json_to_dataframe(json_data)

# json_to_dataframe returns None when the text is not valid JSON, so only
# preview the frame when parsing succeeded instead of calling .head() on None.
if df is not None:
    print(df.head())
else:
    print(f"No DataFrame could be built from '{docx_file_path}'.")


        type  status departure.iataCode departure.icaoCode departure.terminal  \
0  departure  active                lhe               opla                  m   
1  departure  active                lhe               opla                  m   
2  departure  active                lhe               opla                  m   
3  departure  active                lhe               opla                  m   
4  departure  active                lhe               opla                  m   

   departure.scheduledTime     departure.actualTime departure.estimatedRunway  \
0  2024-02-29t13:00:00.000  2024-02-29t13:20:00.000   2024-02-29t13:20:00.000   
1  2024-02-26t23:20:00.000  2024-02-26t23:20:00.000   2024-02-26t23:20:00.000   
2  2024-02-19t11:40:00.000                      NaN                       NaN   
3  2024-02-21t02:55:00.000                      NaN                       NaN   
4  2024-02-17t08:35:00.000  2024-02-17t08:43:00.000   2024-02-17t08:43:00.000   

    departure.actualRunway

In [7]:
 # Folder path for the training .docx files; no other cell visible in this excerpt reads this variable.
 docx_folder = 'ML-Proj-Dataset/Train/'  # Update this with the correct folder path

In [9]:
import json
import pandas as pd
from docx import Document

def extract_json_from_docx(file_path):
    """
    Reads a .docx file and returns its paragraph text as a single string.

    Parameters:
        file_path (str): Path to the .docx file.

    Returns:
        str: All paragraph texts joined by newline characters, with leading
        and trailing whitespace stripped from the joined result.
    """
    document = Document(file_path)
    joined = '\n'.join(paragraph.text for paragraph in document.paragraphs)
    return joined.strip()

def json_to_dataframe(json_str):
    """
    Converts a JSON string to a pandas DataFrame.
    
    Parameters:
        json_str (str): JSON string to be converted.
        
    Returns:
        pd.DataFrame: DataFrame containing the JSON data.
    """
    try:
        data = json.loads(json_str)
    except json.JSONDecodeError as e:
        print("Error decoding JSON:", e)
        return None
    df = pd.json_normalize(data, sep='.')
    return df

# Specify the path to your single .docx file for initial testing
docx_file_path = 'ML-Proj-Dataset/Train/1.docx'  # Replace with your file path

# Extract JSON string from the .docx file
json_data = extract_json_from_docx(docx_file_path)

# Convert JSON string to DataFrame
df = json_to_dataframe(json_data)

# json_to_dataframe returns None on invalid JSON; skip the preview and export
# in that case rather than raising AttributeError on None.
if df is None:
    print(f"No DataFrame could be built from '{docx_file_path}'.")
else:
    # Display the DataFrame
    print("DataFrame Head:")
    print(df.head())

    # Display DataFrame Info. DataFrame.info() prints its report itself and
    # returns None, so it is called directly; wrapping it in print() would
    # add a stray "None" line to the output.
    print("\nDataFrame Info:")
    df.info()

    # Save the DataFrame to a CSV file for verification (optional)
    df.to_csv('single_file_converted.csv', index=False)
    print("\nConverted DataFrame saved as 'single_file_converted.csv'.")


DataFrame Head:
        type  status departure.iataCode departure.icaoCode departure.terminal  \
0  departure  active                lhe               opla                  m   
1  departure  active                lhe               opla                NaN   
2  departure  active                lhe               opla                NaN   
3  departure  active                lhe               opla                  m   
4  departure  active                lhe               opla                  m   

   departure.scheduledTime  departure.estimatedTime     departure.actualTime  \
0  2023-07-20t20:50:00.000  2023-07-20t20:00:00.000  2023-07-20t20:15:00.000   
1  2023-07-18t15:05:00.000  2023-07-18t15:05:00.000                      NaN   
2  2023-07-23t09:50:00.000                      NaN                      NaN   
3  2023-07-26t23:30:00.000  2023-07-26t23:30:00.000  2023-07-26t23:51:00.000   
4  2023-07-20t11:35:00.000  2023-07-20t17:15:00.000                      NaN   

  departure.esti

Unnamed: 0,type,status,departure.iataCode,departure.icaoCode,departure.terminal,departure.scheduledTime,departure.estimatedTime,departure.actualTime,departure.estimatedRunway,departure.actualRunway,...,flight.icaoNumber,arrival.baggage,codeshared.airline.name,codeshared.airline.iataCode,codeshared.airline.icaoCode,codeshared.flight.number,codeshared.flight.iataNumber,codeshared.flight.icaoNumber,arrival.gate,departure.gate
0,departure,active,lhe,opla,m,2023-07-20t20:50:00.000,2023-07-20t20:00:00.000,2023-07-20t20:15:00.000,2023-07-20t20:15:00.000,2023-07-20t20:15:00.000,...,sva737,,,,,,,,,
1,departure,active,lhe,opla,,2023-07-18t15:05:00.000,2023-07-18t15:05:00.000,,,,...,fjl843,,,,,,,,,
2,departure,active,lhe,opla,,2023-07-23t09:50:00.000,,,,,...,fjl841,,,,,,,,,
3,departure,active,lhe,opla,m,2023-07-26t23:30:00.000,2023-07-26t23:30:00.000,2023-07-26t23:51:00.000,2023-07-26t23:51:00.000,2023-07-26t23:51:00.000,...,pia205,,,,,,,,,
4,departure,active,lhe,opla,m,2023-07-20t11:35:00.000,2023-07-20t17:15:00.000,,,,...,sep723,03,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
812,departure,active,lhe,opla,m,2023-07-28t08:40:00.000,2023-07-28t08:40:00.000,,,,...,pia189,,,,,,,,,
813,departure,active,lhe,opla,m,2023-07-15t03:30:00.000,2023-07-15t03:30:00.000,,,,...,aal8284,2,qatar airways,qr,qtr,621,qr621,qtr621,,
814,departure,active,lhe,opla,,2023-07-19t21:35:00.000,2023-07-19t21:35:00.000,2023-07-19t22:11:00.000,2023-07-19t22:11:00.000,2023-07-19t22:11:00.000,...,vsv850,1,,,,,,,,
815,departure,active,lhe,opla,m,2023-07-15t08:00:00.000,2023-07-15t08:00:00.000,2023-07-15t08:25:00.000,2023-07-15t08:25:00.000,2023-07-15t08:25:00.000,...,pia453,,,,,,,,,


In [27]:
# Distinct values of the 'codeshared' column (NaN is among them in the recorded output).
# NOTE(review): the frame displayed above exposes flattened 'codeshared.*' columns
# rather than a plain 'codeshared' column, so this lookup presumably depended on
# kernel state from a cell that is not shown; confirm it still runs on a fresh kernel.
df['codeshared'].unique()

array([nan, 'qtr629', 'qtr609', 'qtr621', 'uae623', 'pia707', 'thy715',
       'etd242', 'uae625', 'gfa765', 'etd244', 'pia203', 'tha346',
       'alk154', 'gfa767'], dtype=object)

# Converting the training dataset

In [37]:
import json
import pandas as pd
from docx import Document
import os
import re
from sklearn.preprocessing import LabelEncoder

def extract_json_from_docx(file_path):
    """
    Extracts the JSON array text embedded in a .docx file.

    Paragraphs are scanned in order. Empty paragraphs are skipped. Collection
    starts at the first paragraph whose stripped text begins with '[' and
    stops after the first collected paragraph whose stripped text ends with
    ']'. Each collected paragraph is stripped of surrounding whitespace.

    Parameters:
        file_path (str): Path to the .docx file.

    Returns:
        str: The collected paragraphs joined by newline characters, or an
        empty string when no paragraph starts with '['.
    """
    doc = Document(file_path)
    full_text = []
    json_started = False

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue  # Skip empty paragraphs

        # Detect the start of JSON array
        if text.startswith('['):
            json_started = True

        if json_started:
            full_text.append(text)
            # Detect the end of JSON array.
            # NOTE(review): collection stops at the first paragraph ending in
            # ']', so a nested array that closes a paragraph would end it
            # early; confirm the source documents never contain one.
            if text.endswith(']'):
                break

    # No quote clean-up is applied: substituting '"' for every unescaped '"'
    # leaves the text unchanged, so the joined text is returned as collected.
    return '\n'.join(full_text)

def parse_json_to_dataframe(json_str):
    """
    Turns a JSON string into a flattened pandas DataFrame.

    A top-level JSON array is normalized directly; any other top-level value
    is wrapped in a one-element list first. Nested keys are joined with '.'.

    Returns:
        pd.DataFrame, or None (after printing the error) when the string is
        not valid JSON.
    """
    try:
        payload = json.loads(json_str)
    except json.JSONDecodeError as err:
        print(f"JSON decoding failed: {err}")
        return None

    records = payload if isinstance(payload, list) else [payload]
    return pd.json_normalize(records, sep='.')

def process_docx_files(directory_path):
    """
    Processes all .docx files in the specified directory.

    Each file whose name ends with '.docx' is passed through
    extract_json_from_docx and parse_json_to_dataframe; files whose JSON
    cannot be parsed are reported and skipped.

    Parameters:
        directory_path (str): Directory to scan (not recursive).

    Returns:
        pd.DataFrame: All parsed frames concatenated with a fresh index, or an
        empty DataFrame when nothing could be parsed.
    """
    all_dfs = []

    # os.listdir returns names in arbitrary order; sorting makes the row order
    # of the combined frame reproducible across runs and platforms.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.docx'):
            continue

        file_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")
        json_str = extract_json_from_docx(file_path)
        df = parse_json_to_dataframe(json_str)
        if df is not None:
            all_dfs.append(df)
        else:
            print(f"Failed to parse JSON in file: {filename}")

    if not all_dfs:
        print("No DataFrames to concatenate.")
        return pd.DataFrame()

    return pd.concat(all_dfs, ignore_index=True)

def handle_missing_values(df):
    """
    Handles missing values in the DataFrame.

    Columns with more than 50% missing values are dropped. Remaining
    int64/float64 columns are filled with their median, and remaining object
    columns are filled with the string 'Unknown'.

    Parameters:
        df (pd.DataFrame): Frame to clean; it is modified in place.

    Returns:
        pd.DataFrame: The same frame, after dropping and imputing.
    """
    # Calculate missing values percentage
    missing_percentage = df.isnull().mean() * 100
    missing_percentage = missing_percentage.sort_values(ascending=False)

    # Identify columns to drop
    cols_to_drop = missing_percentage[missing_percentage > 50].index.tolist()
    print("Columns to Drop:", cols_to_drop)

    # Drop these columns
    df.drop(columns=cols_to_drop, inplace=True)

    # Identify numerical and categorical columns
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

    # The filled column is assigned back rather than using
    # df[col].fillna(..., inplace=True): that chained in-place form operates
    # on a temporary under pandas Copy-on-Write and leaves the frame unchanged.

    # Impute numerical columns with median
    for col in numerical_cols:
        if df[col].isnull().any():
            median_value = df[col].median()
            df[col] = df[col].fillna(median_value)
            print(f"Imputed missing values in numerical column '{col}' with median value {median_value}.")

    # Impute categorical columns with 'Unknown'
    for col in categorical_cols:
        if df[col].isnull().any():
            df[col] = df[col].fillna('Unknown')
            print(f"Imputed missing values in categorical column '{col}' with 'Unknown'.")

    return df

def format_datetime_columns(df, datetime_cols):
    """
    Parses the listed columns into pandas datetimes.

    Names that are not columns of the frame are skipped. Parsing uses the
    '%Y-%m-%dT%H:%M:%S.%f' layout with errors='coerce', so values that cannot
    be parsed become NaT. The frame is updated in place and returned.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S.%f'
    for name in datetime_cols:
        if name not in df.columns:
            continue
        df[name] = pd.to_datetime(df[name], format=timestamp_format, errors='coerce')
        print(f"Converted '{name}' to datetime.")
    return df

def calculate_departure_delay(df):
    """
    Calculates departure delay in minutes.

    Adds a 'departure.delay_minutes' column holding
    'departure.actualTime' minus 'departure.scheduledTime', in minutes.
    Rows where that difference is missing are set to 0.

    Parameters:
        df (pd.DataFrame): Frame whose two departure time columns are already
            datetimes; it is modified in place.

    Returns:
        pd.DataFrame: The same frame with the delay column added.
    """
    # Calculate departure delay
    delay_minutes = (df['departure.actualTime'] - df['departure.scheduledTime']).dt.total_seconds() / 60

    # Fill before assigning. The chained df[col].fillna(0, inplace=True) form
    # acts on a temporary under pandas Copy-on-Write and would leave NaN behind.
    df['departure.delay_minutes'] = delay_minutes.fillna(0)  # Assuming no delay if actual time is missing
    print("Calculated 'departure.delay_minutes'.")
    return df

def extract_temporal_features(df):
    """
    Derives calendar features from 'departure.scheduledTime'.

    Adds 'departure.day_of_week' (day name), 'departure.hour_of_day' and
    'departure.month' to the frame in place and returns it.
    """
    scheduled = df['departure.scheduledTime'].dt

    df['departure.day_of_week'] = scheduled.day_name()
    df['departure.hour_of_day'] = scheduled.hour
    df['departure.month'] = scheduled.month

    print("Extracted temporal features.")
    return df

def encode_categorical_variables(df):
    """
    Encodes categorical variables into numerical formats.

    - 'status' (if present) is label-encoded into a new 'status_encoded' column.
    - 'departure.day_of_week' and 'airline.name' (each if present) are one-hot
      encoded with the first category dropped.

    Parameters:
        df (pd.DataFrame): Frame to encode.

    Returns:
        pd.DataFrame: The encoded frame. When a one-hot step runs this is a
        new frame produced by pd.get_dummies.
    """
    # Encode 'status' column (binary classification example)
    if 'status' in df.columns:
        # The encoder is only built when there is a column to encode.
        le = LabelEncoder()
        df['status_encoded'] = le.fit_transform(df['status'])
        print("Encoded 'status' column.")

    # One-Hot Encode 'departure.day_of_week'. Guarded like the other steps so
    # a frame without that column is passed through instead of raising KeyError.
    if 'departure.day_of_week' in df.columns:
        df = pd.get_dummies(df, columns=['departure.day_of_week'], drop_first=True)
        print("One-Hot Encoded 'departure.day_of_week'.")

    # One-Hot Encode 'airline.name'
    if 'airline.name' in df.columns:
        df = pd.get_dummies(df, columns=['airline.name'], drop_first=True)
        print("One-Hot Encoded 'airline.name'.")

    return df

def drop_irrelevant_columns(df):
    """
    Removes columns the pipeline does not keep.

    'type' is dropped whenever it is present. 'arrival.iataCode' and
    'arrival.icaoCode' are dropped if present and ignored otherwise. The
    frame is modified in place and returned.
    """
    type_present = 'type' in df.columns
    if type_present:
        df.drop(columns=['type'], inplace=True)
        print("Dropped 'type' column.")

    # Arrival airport codes: missing names are tolerated via errors='ignore'.
    columns_to_drop = ['arrival.iataCode', 'arrival.icaoCode']
    df.drop(columns=columns_to_drop, errors='ignore', inplace=True)
    print(f"Dropped columns: {columns_to_drop}")

    return df

def main(directory_path='ML-Proj-Dataset/Train/', output_path='final_cleaned_flight_data.csv'):
    """
    Runs the conversion and cleaning pipeline end to end.

    Loads every .docx file in ``directory_path``, imputes missing values,
    converts the timestamp columns, derives the delay and temporal features,
    encodes categoricals, drops unused columns and writes the result as CSV.

    Parameters:
        directory_path (str): Directory containing the .docx files.
        output_path (str): CSV file the cleaned frame is written to.

    Returns:
        None. Returns early, without writing a file, when no data was loaded.
    """
    # Process the .docx files and get the combined DataFrame
    combined_df = process_docx_files(directory_path)

    if combined_df.empty:
        print("No data to process.")
        return

    print(f"\nInitial DataFrame shape: {combined_df.shape}")

    # Handle missing values.
    # NOTE(review): this runs before the datetime conversion, so a timestamp
    # column that is more than 50% missing is dropped here, and
    # calculate_departure_delay indexes 'departure.actualTime' without
    # checking for it; confirm that column always survives this step.
    combined_df = handle_missing_values(combined_df)

    # Define datetime columns
    datetime_cols = [
        'departure.scheduledTime',
        'departure.estimatedTime',
        'departure.actualTime',
        'departure.estimatedRunway',
        'departure.actualRunway',
        'arrival.scheduledTime',
        'arrival.estimatedTime',
        'arrival.actualTime',
        'arrival.estimatedRunway',
        'arrival.actualRunway'
    ]

    # Convert columns to datetime
    combined_df = format_datetime_columns(combined_df, datetime_cols)

    # Calculate departure delay
    combined_df = calculate_departure_delay(combined_df)

    # Extract temporal features
    combined_df = extract_temporal_features(combined_df)

    # Encode categorical variables
    combined_df = encode_categorical_variables(combined_df)

    # Drop irrelevant columns
    combined_df = drop_irrelevant_columns(combined_df)

    # Final verification. DataFrame.info() prints its own report and returns
    # None, so it is called directly rather than wrapped in print().
    print("\nFinal DataFrame Info:")
    combined_df.info()

    print("\nMissing Values After All Processing:")
    print(combined_df.isnull().sum())

    # Save the final cleaned DataFrame
    combined_df.to_csv(output_path, index=False)
    print(f"\nFinal cleaned DataFrame saved as '{output_path}'.")

# Entry point: run the pipeline only when this code executes as the top-level
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


Processing file: 1.docx
Processing file: 10.docx
Processing file: 11.docx
Processing file: 12.docx
Processing file: 13.docx
Processing file: 14.docx
Processing file: 15.docx
Processing file: 16.docx
Processing file: 17.docx
Processing file: 18.docx
Processing file: 19.docx
Processing file: 2.docx
Processing file: 20.docx
Processing file: 21.docx
Processing file: 22.docx
Processing file: 23.docx
Processing file: 24.docx
Processing file: 25.docx
Processing file: 26.docx
Processing file: 27.docx
Processing file: 28.docx
Processing file: 29.docx
Processing file: 3.docx
Processing file: 30.docx
Processing file: 31.docx
Processing file: 32.docx
Processing file: 33.docx
Processing file: 34.docx
Processing file: 35.docx
Processing file: 36.docx
Processing file: 37.docx
Processing file: 39.docx
Processing file: 4.docx
Processing file: 40.docx
Processing file: 41.docx
Processing file: 42.docx
Processing file: 43.docx
Processing file: 44.docx
Processing file: 45.docx
Processing file: 46.docx
Proc

In [41]:
# Load 'final_cleaned_flight_data.csv' from disk and show the resulting frame
# as the cell output.
df = pd.read_csv('final_cleaned_flight_data.csv')
df

Unnamed: 0,status,departure.iataCode,departure.icaoCode,departure.terminal,departure.scheduledTime,departure.estimatedTime,departure.actualTime,departure.estimatedRunway,departure.actualRunway,arrival.scheduledTime,...,airline.name_srilankan airlines,airline.name_swiss air-ambulance,airline.name_thai airways international,airline.name_turkey - government,airline.name_turkish airlines,airline.name_uls airlines cargo,airline.name_virgin australia,airline.name_vistajet,airline.name_wamos air,airline.name_yto cargo airlines
0,active,lhe,opla,m,2023-07-20 20:50:00,2023-07-20 20:00:00,2023-07-20 20:15:00,2023-07-20 20:15:00,2023-07-20 20:15:00,2023-07-20 23:20:00,...,False,False,False,False,False,False,False,False,False,False
1,active,lhe,opla,Unknown,2023-07-18 15:05:00,2023-07-18 15:05:00,,,,2023-07-18 16:50:00,...,False,False,False,False,False,False,False,False,False,False
2,active,lhe,opla,Unknown,2023-07-23 09:50:00,,,,,2023-07-23 11:35:00,...,False,False,False,False,False,False,False,False,False,False
3,active,lhe,opla,m,2023-07-26 23:30:00,2023-07-26 23:30:00,2023-07-26 23:51:00,2023-07-26 23:51:00,2023-07-26 23:51:00,2023-07-27 01:30:00,...,False,False,False,False,False,False,False,False,False,False
4,active,lhe,opla,m,2023-07-20 11:35:00,2023-07-20 17:15:00,,,,2023-07-20 14:00:00,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51567,active,lhe,opla,Unknown,2023-11-16 09:50:00,2023-11-16 09:50:00,2023-11-16 10:03:00,2023-11-16 10:03:00,2023-11-16 10:03:00,2023-11-16 11:45:00,...,False,False,False,False,False,False,False,False,False,False
51568,active,lhe,opla,m,2023-11-16 11:40:00,2023-11-16 11:40:00,2023-11-16 11:48:00,2023-11-16 11:48:00,2023-11-16 11:48:00,2023-11-16 15:25:00,...,False,False,False,False,False,False,False,False,False,False
51569,active,lhe,opla,m,2023-11-27 10:50:00,2023-11-27 14:35:00,,,,2023-11-27 13:30:00,...,False,False,False,False,False,False,False,False,False,False
51570,active,lhe,opla,m,2023-11-19 02:00:00,2023-11-19 11:00:00,,,,2023-11-19 10:30:00,...,False,False,False,False,False,False,False,False,False,False


In [49]:
# Display whatever frame is currently bound to `df`; the recorded output
# matches the output of the preceding cell.
df

Unnamed: 0,status,departure.iataCode,departure.icaoCode,departure.terminal,departure.scheduledTime,departure.estimatedTime,departure.actualTime,departure.estimatedRunway,departure.actualRunway,arrival.scheduledTime,...,airline.name_srilankan airlines,airline.name_swiss air-ambulance,airline.name_thai airways international,airline.name_turkey - government,airline.name_turkish airlines,airline.name_uls airlines cargo,airline.name_virgin australia,airline.name_vistajet,airline.name_wamos air,airline.name_yto cargo airlines
0,active,lhe,opla,m,2023-07-20 20:50:00,2023-07-20 20:00:00,2023-07-20 20:15:00,2023-07-20 20:15:00,2023-07-20 20:15:00,2023-07-20 23:20:00,...,False,False,False,False,False,False,False,False,False,False
1,active,lhe,opla,Unknown,2023-07-18 15:05:00,2023-07-18 15:05:00,,,,2023-07-18 16:50:00,...,False,False,False,False,False,False,False,False,False,False
2,active,lhe,opla,Unknown,2023-07-23 09:50:00,,,,,2023-07-23 11:35:00,...,False,False,False,False,False,False,False,False,False,False
3,active,lhe,opla,m,2023-07-26 23:30:00,2023-07-26 23:30:00,2023-07-26 23:51:00,2023-07-26 23:51:00,2023-07-26 23:51:00,2023-07-27 01:30:00,...,False,False,False,False,False,False,False,False,False,False
4,active,lhe,opla,m,2023-07-20 11:35:00,2023-07-20 17:15:00,,,,2023-07-20 14:00:00,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51567,active,lhe,opla,Unknown,2023-11-16 09:50:00,2023-11-16 09:50:00,2023-11-16 10:03:00,2023-11-16 10:03:00,2023-11-16 10:03:00,2023-11-16 11:45:00,...,False,False,False,False,False,False,False,False,False,False
51568,active,lhe,opla,m,2023-11-16 11:40:00,2023-11-16 11:40:00,2023-11-16 11:48:00,2023-11-16 11:48:00,2023-11-16 11:48:00,2023-11-16 15:25:00,...,False,False,False,False,False,False,False,False,False,False
51569,active,lhe,opla,m,2023-11-27 10:50:00,2023-11-27 14:35:00,,,,2023-11-27 13:30:00,...,False,False,False,False,False,False,False,False,False,False
51570,active,lhe,opla,m,2023-11-19 02:00:00,2023-11-19 11:00:00,,,,2023-11-19 10:30:00,...,False,False,False,False,False,False,False,False,False,False


In [51]:
import json
import pandas as pd
from docx import Document
import os
import re
from sklearn.preprocessing import LabelEncoder

def extract_json_from_docx(file_path):
    """
    Extracts the JSON array text embedded in a .docx file.

    Paragraphs are scanned in order. Empty paragraphs are skipped. Collection
    starts at the first paragraph whose stripped text begins with '[' and
    stops after the first collected paragraph whose stripped text ends with
    ']'. Each collected paragraph is stripped of surrounding whitespace.

    Parameters:
        file_path (str): Path to the .docx file.

    Returns:
        str: The collected paragraphs joined by newline characters, or an
        empty string when no paragraph starts with '['.
    """
    doc = Document(file_path)
    full_text = []
    json_started = False

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue  # Skip empty paragraphs

        # Detect the start of JSON array
        if text.startswith('['):
            json_started = True

        if json_started:
            full_text.append(text)
            # Detect the end of JSON array.
            # NOTE(review): collection stops at the first paragraph ending in
            # ']', so a nested array that closes a paragraph would end it
            # early; confirm the source documents never contain one.
            if text.endswith(']'):
                break

    # No quote clean-up is applied: substituting '"' for every unescaped '"'
    # leaves the text unchanged, so the joined text is returned as collected.
    return '\n'.join(full_text)

def parse_json_to_dataframe(json_str):
    """
    Converts JSON text into a flattened pandas DataFrame.

    A decoded list is normalized as-is; any other decoded value is placed in
    a one-element list before normalizing. Nested keys are joined with '.'.

    Returns:
        pd.DataFrame, or None (after printing the error) when decoding fails.
    """
    try:
        decoded = json.loads(json_str)
    except json.JSONDecodeError as err:
        print(f"JSON decoding failed: {err}")
        return None

    if not isinstance(decoded, list):
        decoded = [decoded]
    return pd.json_normalize(decoded, sep='.')

def process_docx_files(directory_path):
    """
    Processes all .docx files in the specified directory.

    Each file whose name ends with '.docx' is passed through
    extract_json_from_docx and parse_json_to_dataframe; files whose JSON
    cannot be parsed are reported and skipped.

    Parameters:
        directory_path (str): Directory to scan (not recursive).

    Returns:
        pd.DataFrame: All parsed frames concatenated with a fresh index, or an
        empty DataFrame when nothing could be parsed.
    """
    all_dfs = []

    # os.listdir returns names in arbitrary order; sorting makes the row order
    # of the combined frame reproducible across runs and platforms.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.docx'):
            continue

        file_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")
        json_str = extract_json_from_docx(file_path)
        df = parse_json_to_dataframe(json_str)
        if df is not None:
            all_dfs.append(df)
        else:
            print(f"Failed to parse JSON in file: {filename}")

    if not all_dfs:
        print("No DataFrames to concatenate.")
        return pd.DataFrame()

    return pd.concat(all_dfs, ignore_index=True)

def handle_missing_values(df, datetime_cols):
    """
    Handles missing values in the DataFrame.

    Steps, in order:
    - Drops columns with >50% missing values, except those in datetime_cols.
    - In each datetime column present, replaces 'Unknown' with NaT and parses
      the column with the '%Y-%m-%dT%H:%M:%S.%f' layout (unparseable -> NaT).
    - Fills gaps in the time columns via impute_time_columns.
    - Fills int64/float64 columns with their median and object columns
      (other than datetime_cols) with 'Unknown'.

    Parameters:
        df (pd.DataFrame): Frame to clean; it is modified in place.
        datetime_cols (list[str]): Names of the timestamp columns.

    Returns:
        pd.DataFrame: The frame returned by impute_time_columns after the
        remaining imputation.
    """
    # Calculate missing values percentage
    missing_percentage = df.isnull().mean() * 100
    missing_percentage = missing_percentage.sort_values(ascending=False)

    # Identify columns to drop (excluding datetime columns)
    cols_to_drop = missing_percentage[(missing_percentage > 50) & (~missing_percentage.index.isin(datetime_cols))].index.tolist()
    print("Columns to Drop:", cols_to_drop)

    # Drop these columns
    df.drop(columns=cols_to_drop, inplace=True)

    # Results are assigned back to df[col] throughout. The chained
    # df[col].replace(..., inplace=True) / df[col].fillna(..., inplace=True)
    # forms act on a temporary under pandas Copy-on-Write and would leave
    # the frame unchanged.

    # Handle time-related columns separately
    for col in datetime_cols:
        if col in df.columns:
            # Replace placeholders like 'Unknown' with NaN
            df[col] = df[col].replace('Unknown', pd.NaT)

    # Convert columns to datetime
    for col in datetime_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], format='%Y-%m-%dT%H:%M:%S.%f', errors='coerce')
            print(f"Converted '{col}' to datetime.")

    # Impute missing datetime columns
    df = impute_time_columns(df, datetime_cols)

    # Identify numerical and categorical columns (excluding datetime)
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

    # Impute numerical columns with median
    for col in numerical_cols:
        if df[col].isnull().any():
            median_value = df[col].median()
            df[col] = df[col].fillna(median_value)
            print(f"Imputed missing values in numerical column '{col}' with median value {median_value}.")

    # Impute categorical columns with 'Unknown'
    for col in categorical_cols:
        if col in datetime_cols:
            continue  # Skip datetime columns
        if df[col].isnull().any():
            df[col] = df[col].fillna('Unknown')
            print(f"Imputed missing values in categorical column '{col}' with 'Unknown'.")

    return df

def impute_time_columns(df, datetime_cols):
    """
    Fills gaps in the 'actual' time columns from related time columns.

    Each rule fills the missing values of a target column from a source
    column, and is applied only when both columns exist. Rules run in the
    order listed, so the estimated value is tried before the scheduled one.
    The ``datetime_cols`` argument is accepted but not used here.

    Returns:
        pd.DataFrame: The same frame, updated in place.
    """
    # (target column, source column, message qualifier)
    fill_rules = [
        ('departure.actualTime', 'departure.estimatedTime', ''),
        ('departure.actualTime', 'departure.scheduledTime', 'remaining '),
        ('arrival.actualTime', 'arrival.estimatedTime', ''),
        ('arrival.actualTime', 'arrival.scheduledTime', 'remaining '),
        ('departure.actualRunway', 'departure.estimatedRunway', ''),
        ('departure.actualRunway', 'departure.scheduledTime', 'remaining '),
        ('arrival.actualRunway', 'arrival.estimatedRunway', ''),
        ('arrival.actualRunway', 'arrival.scheduledTime', 'remaining '),
    ]

    for target, source, qualifier in fill_rules:
        if target in df.columns and source in df.columns:
            df[target] = df[target].fillna(df[source])
            print(f"Imputed {qualifier}missing '{target}' with '{source}'.")

    return df

def calculate_departure_delay(df):
    """
    Adds 'departure.delay_minutes': actual minus scheduled departure, in minutes.

    If either 'departure.actualTime' or 'departure.scheduledTime' is absent,
    a message is printed and the frame is returned unchanged.
    """
    required = ('departure.actualTime', 'departure.scheduledTime')
    if not all(name in df.columns for name in required):
        print("Required columns for delay calculation are missing.")
        return df

    elapsed = df['departure.actualTime'] - df['departure.scheduledTime']
    df['departure.delay_minutes'] = elapsed.dt.total_seconds() / 60
    print("Calculated 'departure.delay_minutes'.")
    return df

def extract_temporal_features(df):
    """
    Derives calendar features from 'departure.scheduledTime'.

    Adds 'departure.day_of_week' (day name), 'departure.hour_of_day' and
    'departure.month' in place. If the scheduled-time column is absent, a
    message is printed and the frame is returned unchanged.
    """
    if 'departure.scheduledTime' not in df.columns:
        print("Scheduled departure time column is missing.")
        return df

    scheduled = df['departure.scheduledTime'].dt
    df['departure.day_of_week'] = scheduled.day_name()
    df['departure.hour_of_day'] = scheduled.hour
    df['departure.month'] = scheduled.month

    print("Extracted temporal features.")
    return df

def encode_categorical_variables(df):
    """
    Converts categorical columns to numeric representations.

    'status' (if present) is label-encoded into 'status_encoded'. Then
    'departure.day_of_week' and 'airline.name' (each if present) are one-hot
    encoded in that order, dropping the first category of each.
    """
    le = LabelEncoder()

    if 'status' in df.columns:
        df['status_encoded'] = le.fit_transform(df['status'])
        print("Encoded 'status' column.")

    # One-hot steps share the same shape, so they run from a single loop.
    for column in ('departure.day_of_week', 'airline.name'):
        if column in df.columns:
            df = pd.get_dummies(df, columns=[column], drop_first=True)
            print(f"One-Hot Encoded '{column}'.")

    return df

def drop_irrelevant_columns(df):
    """
    Removes columns the pipeline does not keep.

    'type' is dropped whenever it is present. 'arrival.iataCode' and
    'arrival.icaoCode' are dropped if present and ignored otherwise. The
    frame is modified in place and returned.
    """
    type_present = 'type' in df.columns
    if type_present:
        df.drop(columns=['type'], inplace=True)
        print("Dropped 'type' column.")

    # Arrival airport codes: missing names are tolerated via errors='ignore'.
    columns_to_drop = ['arrival.iataCode', 'arrival.icaoCode']
    df.drop(columns=columns_to_drop, errors='ignore', inplace=True)
    print(f"Dropped columns: {columns_to_drop}")

    return df

def main(directory_path='ML-Proj-Dataset/Train/',
         output_path='final_cleaned_flight_data1.csv'):
    """
    Run the flight-data cleaning pipeline end to end and save the result.

    Steps, in order: load the .docx files via ``process_docx_files``, pass the
    frame through ``handle_missing_values``, ``calculate_departure_delay``,
    ``extract_temporal_features``, ``encode_categorical_variables`` and
    ``drop_irrelevant_columns``, print a summary, and write a CSV.

    Parameters
    ----------
    directory_path : str, optional
        Directory containing the .docx files. Defaults to the training folder
        that was previously hard-coded here.
    output_path : str, optional
        Destination CSV path. Defaults to the file name that was previously
        hard-coded here.

    Returns
    -------
    None
        Returns early, after printing a message, when no data was loaded.
    """
    # Process the .docx files and get the combined DataFrame
    combined_df = process_docx_files(directory_path)

    if combined_df.empty:
        print("No data to process.")
        return

    print(f"\nInitial DataFrame shape: {combined_df.shape}")

    # Timestamp columns handed to handle_missing_values for conversion
    datetime_cols = [
        'departure.scheduledTime',
        'departure.estimatedTime',
        'departure.actualTime',
        'departure.estimatedRunway',
        'departure.actualRunway',
        'arrival.scheduledTime',
        'arrival.estimatedTime',
        'arrival.actualTime',
        'arrival.estimatedRunway',
        'arrival.actualRunway'
    ]

    # Each stage takes the frame and returns the frame to use next
    combined_df = handle_missing_values(combined_df, datetime_cols)
    combined_df = calculate_departure_delay(combined_df)
    combined_df = extract_temporal_features(combined_df)
    combined_df = encode_categorical_variables(combined_df)
    combined_df = drop_irrelevant_columns(combined_df)

    # Final verification. DataFrame.info() prints its own report and returns
    # None, so it is called directly; wrapping it in print() would emit a
    # stray "None" line.
    print("\nFinal DataFrame Info:")
    combined_df.info()

    print("\nMissing Values After All Processing:")
    print(combined_df.isnull().sum())

    # Save the final cleaned DataFrame and report the path actually written,
    # so the message cannot drift from the real file name.
    combined_df.to_csv(output_path, index=False)
    print(f"\nFinal cleaned DataFrame saved as '{output_path}'.")

# Run the pipeline only when this code executes as the top-level module
# (__name__ == "__main__"); importing it from elsewhere does not call main().
if __name__ == "__main__":
    main()


Processing file: 1.docx
Processing file: 10.docx
Processing file: 11.docx
Processing file: 12.docx
Processing file: 13.docx
Processing file: 14.docx
Processing file: 15.docx
Processing file: 16.docx
Processing file: 17.docx
Processing file: 18.docx
Processing file: 19.docx
Processing file: 2.docx
Processing file: 20.docx
Processing file: 21.docx
Processing file: 22.docx
Processing file: 23.docx
Processing file: 24.docx
Processing file: 25.docx
Processing file: 26.docx
Processing file: 27.docx
Processing file: 28.docx
Processing file: 29.docx
Processing file: 3.docx
Processing file: 30.docx
Processing file: 31.docx
Processing file: 32.docx
Processing file: 33.docx
Processing file: 34.docx
Processing file: 35.docx
Processing file: 36.docx
Processing file: 37.docx
Processing file: 39.docx
Processing file: 4.docx
Processing file: 40.docx
Processing file: 41.docx
Processing file: 42.docx
Processing file: 43.docx
Processing file: 44.docx
Processing file: 45.docx
Processing file: 46.docx
Proc

In [53]:
# Load the CSV that main() wrote so the cleaned result can be inspected.
df = pd.read_csv('final_cleaned_flight_data1.csv')


In [85]:
# Show the loaded frame as the cell output.
df


Unnamed: 0,status,departure.iataCode,departure.icaoCode,departure.terminal,departure.scheduledTime,departure.estimatedTime,departure.actualTime,departure.estimatedRunway,departure.actualRunway,arrival.scheduledTime,...,airline.name_srilankan airlines,airline.name_swiss air-ambulance,airline.name_thai airways international,airline.name_turkey - government,airline.name_turkish airlines,airline.name_uls airlines cargo,airline.name_virgin australia,airline.name_vistajet,airline.name_wamos air,airline.name_yto cargo airlines
0,active,lhe,opla,m,2023-07-20 20:50:00,2023-07-20 20:00:00,2023-07-20 20:15:00,2023-07-20 20:15:00,2023-07-20 20:15:00,2023-07-20 23:20:00,...,False,False,False,False,False,False,False,False,False,False
1,active,lhe,opla,Unknown,2023-07-18 15:05:00,2023-07-18 15:05:00,2023-07-18 15:05:00,,2023-07-18 15:05:00,2023-07-18 16:50:00,...,False,False,False,False,False,False,False,False,False,False
2,active,lhe,opla,Unknown,2023-07-23 09:50:00,,2023-07-23 09:50:00,,2023-07-23 09:50:00,2023-07-23 11:35:00,...,False,False,False,False,False,False,False,False,False,False
3,active,lhe,opla,m,2023-07-26 23:30:00,2023-07-26 23:30:00,2023-07-26 23:51:00,2023-07-26 23:51:00,2023-07-26 23:51:00,2023-07-27 01:30:00,...,False,False,False,False,False,False,False,False,False,False
4,active,lhe,opla,m,2023-07-20 11:35:00,2023-07-20 17:15:00,2023-07-20 17:15:00,,2023-07-20 11:35:00,2023-07-20 14:00:00,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51567,active,lhe,opla,Unknown,2023-11-16 09:50:00,2023-11-16 09:50:00,2023-11-16 10:03:00,2023-11-16 10:03:00,2023-11-16 10:03:00,2023-11-16 11:45:00,...,False,False,False,False,False,False,False,False,False,False
51568,active,lhe,opla,m,2023-11-16 11:40:00,2023-11-16 11:40:00,2023-11-16 11:48:00,2023-11-16 11:48:00,2023-11-16 11:48:00,2023-11-16 15:25:00,...,False,False,False,False,False,False,False,False,False,False
51569,active,lhe,opla,m,2023-11-27 10:50:00,2023-11-27 14:35:00,2023-11-27 14:35:00,,2023-11-27 10:50:00,2023-11-27 13:30:00,...,False,False,False,False,False,False,False,False,False,False
51570,active,lhe,opla,m,2023-11-19 02:00:00,2023-11-19 11:00:00,2023-11-19 11:00:00,,2023-11-19 02:00:00,2023-11-19 10:30:00,...,False,False,False,False,False,False,False,False,False,False


In [71]:
# Missing-value count per column, limited to the first 50 columns.
df.isna().sum().iloc[:50]


status                                       0
departure.iataCode                           0
departure.icaoCode                           0
departure.terminal                           0
departure.scheduledTime                      0
departure.estimatedTime                   7007
departure.actualTime                         0
departure.estimatedRunway                20582
departure.actualRunway                       0
arrival.scheduledTime                        0
arrival.estimatedTime                    21364
airline.iataCode                            29
airline.icaoCode                             0
flight.number                                0
flight.iataNumber                           29
flight.icaoNumber                            0
arrival.actualTime                           0
arrival.estimatedRunway                  51320
arrival.actualRunway                         0
departure.delay_minutes                      0
departure.hour_of_day                        0
departure.mon

In [79]:
# Missing-value count per column, limited to the last 50 columns.
df.isna().sum().iloc[-50:]


airline.name_iberia                             0
airline.name_indigo                             0
airline.name_iran air                           0
airline.name_iraqi airways                      0
airline.name_jazeera airways                    0
airline.name_jetblue airways                    0
airline.name_kam air                            0
airline.name_kenya airways                      0
airline.name_klm                                0
airline.name_kuwait airways                     0
airline.name_lion air                           0
airline.name_mahan air                          0
airline.name_malaysia airlines                  0
airline.name_maleth-aero                        0
airline.name_malindo air                        0
airline.name_maximus airlines                   0
airline.name_mng airlines                       0
airline.name_oman air                           0
airline.name_pakistan international airlines    0
airline.name_panorama airways                   0


In [93]:
# Spot check: missing-value count for a single airline dummy column.
df['airline.name_yto cargo airlines'].isna().sum()

0

In [115]:
# Spot check on the 'airline.name_srilankan airlines' dummy column. Per the
# original note, no null values were found in the columns up to this one.
df['airline.name_srilankan airlines'].isna().sum()


0

In [95]:
# Missing-value count per column for the first 23 columns. The closing
# parenthesis was missing, which made this cell a SyntaxError.
df.isnull().sum().head(23)

Unnamed: 0,status,departure.iataCode,departure.icaoCode,departure.terminal,departure.scheduledTime,departure.estimatedTime,departure.actualTime,departure.estimatedRunway,departure.actualRunway,arrival.scheduledTime,...,airline.name_srilankan airlines,airline.name_swiss air-ambulance,airline.name_thai airways international,airline.name_turkey - government,airline.name_turkish airlines,airline.name_uls airlines cargo,airline.name_virgin australia,airline.name_vistajet,airline.name_wamos air,airline.name_yto cargo airlines
0,active,lhe,opla,m,2023-07-20 20:50:00,2023-07-20 20:00:00,2023-07-20 20:15:00,2023-07-20 20:15:00,2023-07-20 20:15:00,2023-07-20 23:20:00,...,False,False,False,False,False,False,False,False,False,False
1,active,lhe,opla,Unknown,2023-07-18 15:05:00,2023-07-18 15:05:00,2023-07-18 15:05:00,,2023-07-18 15:05:00,2023-07-18 16:50:00,...,False,False,False,False,False,False,False,False,False,False
2,active,lhe,opla,Unknown,2023-07-23 09:50:00,,2023-07-23 09:50:00,,2023-07-23 09:50:00,2023-07-23 11:35:00,...,False,False,False,False,False,False,False,False,False,False
3,active,lhe,opla,m,2023-07-26 23:30:00,2023-07-26 23:30:00,2023-07-26 23:51:00,2023-07-26 23:51:00,2023-07-26 23:51:00,2023-07-27 01:30:00,...,False,False,False,False,False,False,False,False,False,False
4,active,lhe,opla,m,2023-07-20 11:35:00,2023-07-20 17:15:00,2023-07-20 17:15:00,,2023-07-20 11:35:00,2023-07-20 14:00:00,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51567,active,lhe,opla,Unknown,2023-11-16 09:50:00,2023-11-16 09:50:00,2023-11-16 10:03:00,2023-11-16 10:03:00,2023-11-16 10:03:00,2023-11-16 11:45:00,...,False,False,False,False,False,False,False,False,False,False
51568,active,lhe,opla,m,2023-11-16 11:40:00,2023-11-16 11:40:00,2023-11-16 11:48:00,2023-11-16 11:48:00,2023-11-16 11:48:00,2023-11-16 15:25:00,...,False,False,False,False,False,False,False,False,False,False
51569,active,lhe,opla,m,2023-11-27 10:50:00,2023-11-27 14:35:00,2023-11-27 14:35:00,,2023-11-27 10:50:00,2023-11-27 13:30:00,...,False,False,False,False,False,False,False,False,False,False
51570,active,lhe,opla,m,2023-11-19 02:00:00,2023-11-19 11:00:00,2023-11-19 11:00:00,,2023-11-19 02:00:00,2023-11-19 10:30:00,...,False,False,False,False,False,False,False,False,False,False


In [1]:
import pandas as pd


In [3]:
# Load the merged flight + weather table and show it as the cell output.
data = pd.read_csv("merged_flight_weather_data.csv")
data

Unnamed: 0,status,departure.iataCode,departure.icaoCode,departure.terminal,departure.scheduledTime,departure.estimatedTime,departure.actualTime,departure.estimatedRunway,departure.actualRunway,arrival.scheduledTime,...,Humidity (%) Max,Humidity (%) Avg,Humidity (%) Min,Wind Speed (mph) Max,Wind Speed (mph) Avg,Wind Speed (mph) Min,Pressure (in) Max,Pressure (in) Avg,Pressure (in) Min,Precipitation (in) Total
0,active,lhe,opla,m,2023-07-20 20:50:00,2023-07-20 20:00:00,2023-07-20 20:15:00,2023-07-20 20:15:00,2023-07-20 20:15:00,2023-07-20 23:20:00,...,89.0,76.0,63.0,12.0,5.3,0.0,28.9,28.8,28.7,0.0
1,active,lhe,opla,Unknown,2023-07-18 15:05:00,2023-07-18 15:05:00,2023-07-18 15:05:00,2023-07-20 20:15:00,2023-07-18 15:05:00,2023-07-18 16:50:00,...,84.0,73.8,63.0,17.0,10.2,0.0,28.8,28.8,28.7,0.0
2,active,lhe,opla,Unknown,2023-07-23 09:50:00,2023-07-18 15:05:00,2023-07-23 09:50:00,2023-07-20 20:15:00,2023-07-23 09:50:00,2023-07-23 11:35:00,...,83.0,71.2,56.0,14.0,8.1,0.0,29.0,28.9,28.8,0.0
3,active,lhe,opla,m,2023-07-26 23:30:00,2023-07-26 23:30:00,2023-07-26 23:51:00,2023-07-26 23:51:00,2023-07-26 23:51:00,2023-07-27 01:30:00,...,100.0,77.4,59.0,17.0,8.1,0.0,28.9,28.9,28.8,0.0
4,active,lhe,opla,m,2023-07-20 11:35:00,2023-07-20 17:15:00,2023-07-20 17:15:00,2023-07-26 23:51:00,2023-07-20 11:35:00,2023-07-20 14:00:00,...,89.0,76.0,63.0,12.0,5.3,0.0,28.9,28.8,28.7,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51866,active,lhe,opla,Unknown,2023-11-16 09:50:00,2023-11-16 09:50:00,2023-11-16 10:03:00,2023-11-16 10:03:00,2023-11-16 10:03:00,2023-11-16 11:45:00,...,82.0,65.5,42.0,7.0,0.8,0.0,29.4,29.3,29.3,0.0
51867,active,lhe,opla,m,2023-11-16 11:40:00,2023-11-16 11:40:00,2023-11-16 11:48:00,2023-11-16 11:48:00,2023-11-16 11:48:00,2023-11-16 15:25:00,...,82.0,65.5,42.0,7.0,0.8,0.0,29.4,29.3,29.3,0.0
51868,active,lhe,opla,m,2023-11-27 10:50:00,2023-11-27 14:35:00,2023-11-27 14:35:00,2023-11-16 11:48:00,2023-11-27 10:50:00,2023-11-27 13:30:00,...,88.0,74.4,60.0,12.0,2.3,0.0,29.3,29.3,29.2,0.0
51869,active,lhe,opla,m,2023-11-19 02:00:00,2023-11-19 11:00:00,2023-11-19 11:00:00,2023-11-16 11:48:00,2023-11-19 02:00:00,2023-11-19 10:30:00,...,88.0,66.3,42.0,8.0,1.5,0.0,29.4,28.7,0.0,0.0


# Encoding the status and day-of-week columns

In [5]:
# Replace 'status' with integer category codes.
data['status'] = data['status'].astype('category').cat.codes
# NOTE(review): the original note here read "no need to do it as train dataset
# in categorical"; its intent is unclear - confirm whether this step is needed.

# One-hot encode 'Departure_DayOfWeek'. get_dummies removes the source column,
# so the membership guard keeps this cell re-runnable instead of raising
# KeyError on a second execution.
if 'Departure_DayOfWeek' in data.columns:
    data = pd.get_dummies(data, columns=['Departure_DayOfWeek'], drop_first=True)

# 'airline.name' can be one-hot encoded the same way if necessary, e.g.:
# data = pd.get_dummies(data, columns=['airline.name'], drop_first=True)

# Verify the encoding. This frame is the training set (it is later saved as
# the *train* CSV), so the heading says "Train" rather than "Test".
print("\nTrain Data After Encoding:")
print(data.head())



Test Data After Encoding:
   status departure.iataCode departure.icaoCode departure.terminal  \
0       0                lhe               opla                  m   
1       0                lhe               opla            Unknown   
2       0                lhe               opla            Unknown   
3       0                lhe               opla                  m   
4       0                lhe               opla                  m   

  departure.scheduledTime departure.estimatedTime departure.actualTime  \
0     2023-07-20 20:50:00     2023-07-20 20:00:00  2023-07-20 20:15:00   
1     2023-07-18 15:05:00     2023-07-18 15:05:00  2023-07-18 15:05:00   
2     2023-07-23 09:50:00     2023-07-18 15:05:00  2023-07-23 09:50:00   
3     2023-07-26 23:30:00     2023-07-26 23:30:00  2023-07-26 23:51:00   
4     2023-07-20 11:35:00     2023-07-20 17:15:00  2023-07-20 17:15:00   

  departure.estimatedRunway departure.actualRunway arrival.scheduledTime  ...  \
0       2023-07-20 20:15:0

# Saving the cleaned dataset file

In [14]:
# Save the merged and cleaned training dataset (index is not written).
data.to_csv('merged_cleaned_train_flight_weather_data.csv', index=False)

# The file written is the train set, so the message says "Train", not "Test".
print("\nFinal Cleaned Train Data Saved as 'merged_cleaned_train_flight_weather_data.csv'.")



Final Cleaned Test Data Saved as 'merged_cleaned_train_flight_weather_data.csv'.
