In [24]:
import os
import re
import json
import pandas as pd
from docx import Document
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
import joblib

def extract_json_from_docx(file_path):
    """
    Extracts the text of the first JSON array found in a .docx file.

    Paragraphs are scanned in order. Collection starts at the first non-empty
    paragraph that begins with '[' and stops at the end of the paragraph in
    which that opening bracket is balanced again. Brackets that appear inside
    JSON string literals are ignored, so a nested array that happens to end a
    paragraph (for example `"tags": ["a", "b"]`) does not cut the result short.

    Parameters:
    - file_path (str): Path of the .docx file to read.

    Returns:
    - str or None: The collected paragraphs joined with newlines (an empty
      string when no paragraph starts with '['), or None when the document
      cannot be opened.
    """
    try:
        doc = Document(file_path)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

    collected = []
    json_started = False
    depth = 0          # Count of '[' still open outside string literals
    in_string = False  # True while scanning inside a "..." literal
    escaped = False    # True when the previous character was a backslash inside a string

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue  # Skip empty paragraphs

        if not json_started:
            if not text.startswith('['):
                continue  # Still in the preamble before the JSON array
            json_started = True

        collected.append(text)

        # Track bracket nesting so only the bracket matching the opening '[' ends the scan
        for ch in text:
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
            elif ch == '"':
                in_string = True
            elif ch == '[':
                depth += 1
            elif ch == ']':
                depth -= 1

        # Paragraphs are joined with newlines and a raw newline is not allowed inside a
        # JSON string, so an unterminated literal must not leak into the next paragraph.
        in_string = False
        escaped = False

        if depth <= 0:
            break

    return '\n'.join(collected)

def parse_json_to_dataframe(json_str):
    """
    Turns a JSON document into a flattened pandas DataFrame.

    A top-level JSON array is normalized as-is; any other top-level value is
    wrapped in a one-element list first. Nested keys are joined with '.'.

    Parameters:
    - json_str (str): The JSON text to parse.

    Returns:
    - pd.DataFrame or None: The normalized data, or None when `json_str` is
      empty/None or is not valid JSON.
    """
    if not json_str:
        return None

    try:
        payload = json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"JSON decoding failed: {e}")
        return None

    # json_normalize expects a list of records, so a lone value is wrapped
    records = payload if isinstance(payload, list) else [payload]
    return pd.json_normalize(records, sep='.')

def process_docx_files(directory_path):
    """
    Builds one DataFrame from every .docx file in a directory.

    Files are visited in sorted (lexicographic) name order. Each file is read
    with extract_json_from_docx and parsed with parse_json_to_dataframe; files
    that do not yield a DataFrame are reported and skipped.

    Parameters:
    - directory_path (str): Directory to scan for .docx files.

    Returns:
    - pd.DataFrame: All parsed frames concatenated with a fresh index, or an
      empty DataFrame when nothing could be parsed.
    """
    frames = []

    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.docx'):
            continue

        source_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")
        frame = parse_json_to_dataframe(extract_json_from_docx(source_path))

        if frame is None:
            print(f"Failed to parse JSON in file: {filename}")
        else:
            frames.append(frame)

    if not frames:
        print("No DataFrames to concatenate.")
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

def encode_airline_names(df, top_n=10):
    """
    One-hot encodes 'airline.name', keeping only the most frequent airlines.

    The `top_n` most frequent values keep their own category and every other
    value (including missing ones) becomes 'Other'. The relabelled column is
    written back into `df` before the dummies are created, and the first dummy
    category is dropped.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - top_n (int): Number of most frequent airlines that keep their own category.

    Returns:
    - pd.DataFrame: A new frame with 'airline_*' dummy columns, or `df` itself
      when it has no 'airline.name' column.
    """
    column = 'airline.name'
    if column not in df.columns:
        return df

    frequent = df[column].value_counts().nlargest(top_n).index
    is_frequent = df[column].isin(frequent)
    df[column] = df[column].where(is_frequent, 'Other')

    encoded = pd.get_dummies(df, columns=[column], prefix='airline', drop_first=True)
    print(f"One-Hot Encoded 'airline.name' with top {top_n} categories.")
    return encoded

def encode_codeshare_airline_names(df, top_n=10):
    """
    One-hot encodes the codeshare airline name column, keeping the most frequent airlines.

    The column is looked up under several spellings and the first one present
    is used. Its `top_n` most frequent values keep their own category, all
    other values become 'Other', and the relabelled column is written back
    into `df` before the dummies are created (first category dropped).

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - top_n (int): Number of most frequent airlines that keep their own category.

    Returns:
    - pd.DataFrame: A new frame with 'codeshare_airline_*' dummy columns, or
      `df` itself when none of the candidate columns exists.
    """
    candidates = ('codeshared.airline.name', 'codeshare.airline.name', 'codeshare_airline.name')
    col = next((name for name in candidates if name in df.columns), None)

    if col is None:
        print("No 'codeshared.airline.name' column found to encode.")
        return df

    frequent = df[col].value_counts().nlargest(top_n).index
    df[col] = df[col].where(df[col].isin(frequent), 'Other')
    df = pd.get_dummies(df, columns=[col], prefix='codeshare_airline', drop_first=True)
    print(f"One-Hot Encoded '{col}' with top {top_n} categories.")
    return df

def encode_categorical_variables(df):
    """
    Converts the known categorical columns into numerical form.

    - 'status' is label-encoded into a new 'status_encoded' column.
    - 'departure.day_of_week' is one-hot encoded (first category dropped).
    - 'airline.name' is one-hot encoded for its 10 most frequent values via
      encode_airline_names.

    Each step is skipped when its column is absent.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.

    Returns:
    - pd.DataFrame: The DataFrame with the encodings applied.
    """
    # Label-encode the target-like 'status' column alongside the original
    if 'status' in df.columns:
        df['status_encoded'] = LabelEncoder().fit_transform(df['status'])
        print("Encoded 'status' column.")

    # Expand the weekday into dummy columns when it is already present
    if 'departure.day_of_week' in df.columns:
        df = pd.get_dummies(df, columns=['departure.day_of_week'], drop_first=True)
        print("One-Hot Encoded 'departure.day_of_week'.")

    # Airline names are high-cardinality, so only the top categories are kept
    return encode_airline_names(df, top_n=10)

def encode_all_categorical_variables(df, top_n=10):
    """
    Applies every categorical encoding step of the pipeline in sequence.

    Runs encode_categorical_variables first, then encode_codeshare_airline_names
    with the given `top_n`.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - top_n (int): Number of codeshare airlines that keep their own category.

    Returns:
    - pd.DataFrame: The encoded DataFrame.
    """
    base_encoded = encode_categorical_variables(df)
    return encode_codeshare_airline_names(base_encoded, top_n=top_n)

def drop_high_missing_columns(df, threshold=50):
    """
    Removes columns whose share of missing values is strictly above `threshold`.

    The columns are dropped from `df` in place and the same object is returned.

    Parameters:
    - df (pd.DataFrame): The input DataFrame (modified in place).
    - threshold (float): Maximum tolerated percentage (0-100) of missing values.

    Returns:
    - pd.DataFrame: `df` without the dropped columns.
    """
    missing_share = df.isna().mean().mul(100)
    too_sparse = missing_share > threshold
    cols_to_drop = missing_share.index[too_sparse].tolist()

    df.drop(columns=cols_to_drop, inplace=True)
    print(f"Dropped columns with >{threshold}% missing values: {cols_to_drop}")
    return df

def impute_datetime_columns(df, datetime_cols):
    """
    Converts the given columns to datetime and fills missing actual times from estimates.

    Every column of `datetime_cols` that exists in `df` is parsed with the
    format '%Y-%m-%dT%H:%M:%S.%f'; values that do not match become NaT.
    Afterwards missing 'departure.actualTime' values are filled from
    'departure.estimatedTime', and missing 'arrival.actualTime' values from
    'arrival.estimatedTime', when both columns of a pair exist.

    Parameters:
    - df (pd.DataFrame): The input DataFrame (columns are replaced in place).
    - datetime_cols (list of str): Names of the columns to convert.

    Returns:
    - pd.DataFrame: `df` with converted and imputed columns.
    """
    for col in datetime_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], format='%Y-%m-%dT%H:%M:%S.%f', errors='coerce')
            print(f"Converted '{col}' to datetime.")

    fallback_pairs = (
        ('departure.actualTime', 'departure.estimatedTime'),
        ('arrival.actualTime', 'arrival.estimatedTime'),
    )
    for actual_col, estimated_col in fallback_pairs:
        if actual_col in df.columns and estimated_col in df.columns:
            # Assign the result back: fillna(inplace=True) on a column selection is
            # chained assignment and does not update the frame under Copy-on-Write.
            df[actual_col] = df[actual_col].fillna(df[estimated_col])
            print(f"Imputed missing '{actual_col}' with '{estimated_col}'.")

    return df

def impute_remaining_columns(df):
    """
    Fills the remaining gaps: medians for numerical columns, 'Unknown' for object columns.

    Numerical columns are those of dtype int64, float64 or uint8; categorical
    columns are those of dtype object. Columns without missing values are
    left untouched.

    Parameters:
    - df (pd.DataFrame): The input DataFrame (columns are replaced in place).

    Returns:
    - pd.DataFrame: `df` with the missing values imputed.
    """
    # Identify numerical and categorical columns
    numerical_cols = df.select_dtypes(include=['int64', 'float64', 'uint8']).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

    # Impute numerical columns with median.
    # Results are assigned back instead of using fillna(inplace=True) on a column
    # selection, which is chained assignment and a no-op under Copy-on-Write.
    for col in numerical_cols:
        if df[col].isnull().any():
            median_value = df[col].median()
            df[col] = df[col].fillna(median_value)
            print(f"Imputed missing values in numerical column '{col}' with median value {median_value}.")

    # Impute categorical columns with 'Unknown'
    for col in categorical_cols:
        if df[col].isnull().any():
            df[col] = df[col].fillna('Unknown')
            print(f"Imputed missing values in categorical column '{col}' with 'Unknown'.")

    return df

def remove_low_variance_features(df, threshold=0.0):
    """
    Drops numerical columns that VarianceThreshold rejects for the given threshold.

    Only columns of dtype int64, float64 or uint8 are examined; every other
    column is kept. In the result the non-numerical columns come first (in the
    order produced by Index.difference), followed by the surviving numerical
    columns.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - threshold (float): The variance threshold passed to VarianceThreshold.

    Returns:
    - pd.DataFrame: The filtered DataFrame, or `df` itself when there are no
      numerical columns or the selector raises ValueError.
    """
    numeric_cols = list(df.select_dtypes(include=['int64', 'float64', 'uint8']).columns)

    if len(numeric_cols) == 0:
        print("No numerical columns to apply VarianceThreshold.")
        return df

    selector = VarianceThreshold(threshold=threshold)

    # Fitting can fail (e.g. when no feature passes); fall back to the input in that case
    try:
        selector.fit(df[numeric_cols])
    except ValueError as e:
        print(f"VarianceThreshold failed: {e}")
        return df

    # The support mask is aligned with numeric_cols
    keep_mask = selector.get_support()
    features_to_keep = [numeric_cols[i] for i, keep in enumerate(keep_mask) if keep]

    # Everything that was never examined is carried over unchanged
    untouched_cols = df.columns.difference(numeric_cols)
    df_filtered = pd.concat([df[untouched_cols], df[features_to_keep]], axis=1)

    print(f"Removed low variance features. Remaining features: {len(df_filtered.columns)}")

    return df_filtered

def encode_and_save_label_encoder(df, column, encoder_filename):
    """
    Label-encodes one column and persists the fitted encoder with joblib.

    The encoded values are stored in a new '<column>_encoded' column. Nothing
    is encoded or written when `column` is not present in `df`.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the column.
    - column (str): The name of the column to encode.
    - encoder_filename (str): The filename to save the encoder.

    Returns:
    - pd.DataFrame: The DataFrame with the encoded column.
    """
    if column not in df.columns:
        return df

    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df[column])
    df[f'{column}_encoded'] = encoded_values

    joblib.dump(encoder, encoder_filename)
    print(f"Encoded '{column}' column and saved LabelEncoder as '{encoder_filename}'.")
    return df

def main():
    """
    Runs the flight-data cleaning pipeline end to end.

    Steps, in order: load every .docx file under 'ML-Proj-Dataset/Train/',
    drop columns with more than 50% missing values, convert and impute the
    datetime columns, impute the remaining columns, encode the categorical
    columns, remove low-variance numerical columns, derive the departure delay
    and temporal features, save a LabelEncoder for 'status', and write the
    result to a CSV file.

    Returns early (None) when the input directory does not exist or no data
    could be loaded.
    """
    # Specify the directory containing your .docx files
    directory_path = 'ML-Proj-Dataset/Train/'  # Replace with your actual path

    # Kept in one place so the confirmation message always names the file actually written
    output_path = 'final_cleaned_flight_data1111.csv'

    # Check if the directory exists
    if not os.path.isdir(directory_path):
        print(f"The directory '{directory_path}' does not exist. Please check the path.")
        return

    # Process the .docx files and get the combined DataFrame
    combined_df = process_docx_files(directory_path)

    if combined_df.empty:
        print("No data to process.")
        return

    print(f"\nInitial DataFrame shape: {combined_df.shape}")

    # Define datetime columns
    datetime_cols = [
        'departure.scheduledTime',
        'departure.estimatedTime',
        'departure.actualTime',
        'departure.estimatedRunway',
        'departure.actualRunway',
        'arrival.scheduledTime',
        'arrival.estimatedTime',
        'arrival.actualTime',
        'arrival.estimatedRunway',
        'arrival.actualRunway'
    ]

    # Drop columns with >50% missing values
    combined_df = drop_high_missing_columns(combined_df, threshold=50)

    # Convert and impute datetime columns
    combined_df = impute_datetime_columns(combined_df, datetime_cols)

    # Impute remaining numerical and categorical columns
    combined_df = impute_remaining_columns(combined_df)

    # Encode categorical variables with optimized methods
    combined_df = encode_all_categorical_variables(combined_df, top_n=10)

    # Remove low variance numerical features
    combined_df = remove_low_variance_features(combined_df, threshold=0.01)  # Adjust threshold as needed

    # Calculate departure delay if possible
    if 'departure.actualTime' in combined_df.columns and 'departure.scheduledTime' in combined_df.columns:
        combined_df['departure.delay_minutes'] = (combined_df['departure.actualTime'] - combined_df['departure.scheduledTime']).dt.total_seconds() / 60
        print("Calculated 'departure.delay_minutes'.")

    # Extract temporal features if needed
    if 'departure.scheduledTime' in combined_df.columns:
        combined_df['departure.day_of_week'] = combined_df['departure.scheduledTime'].dt.day_name()
        combined_df['departure.hour_of_day'] = combined_df['departure.scheduledTime'].dt.hour
        combined_df['departure.month'] = combined_df['departure.scheduledTime'].dt.month
        print("Extracted temporal features.")

        # One-Hot Encode temporal features
        combined_df = pd.get_dummies(combined_df, columns=['departure.day_of_week'], drop_first=True)
        print("One-Hot Encoded 'departure.day_of_week'.")

    # Remove low variance numerical features again after adding temporal features
    combined_df = remove_low_variance_features(combined_df, threshold=0.0)  # Remove features with zero variance

    # Save LabelEncoder for 'status'
    if 'status_encoded' in combined_df.columns:
        le_status = LabelEncoder()
        le_status.fit(combined_df['status'])
        joblib.dump(le_status, 'label_encoder_status.pkl')
        print("Saved LabelEncoder for 'status' column as 'label_encoder_status.pkl'.")

    # Save the final cleaned DataFrame
    combined_df.to_csv(output_path, index=False)
    print(f"\nFinal cleaned DataFrame saved as '{output_path}'.")

    # Display final DataFrame information
    print("\nFinal DataFrame shape:", combined_df.shape)
    print("\nFinal DataFrame Info:")
    # info() prints its report itself and returns None, so it must not be wrapped in print()
    combined_df.info()

    print("\nMissing Values After All Processing:")
    print(combined_df.isnull().sum())

if __name__ == "__main__":
    main()


Processing file: 1.docx
Processing file: 10.docx
Processing file: 11.docx
Processing file: 12.docx
Processing file: 13.docx
Processing file: 14.docx
Processing file: 15.docx
Processing file: 16.docx
Processing file: 17.docx
Processing file: 18.docx
Processing file: 19.docx
Processing file: 2.docx
Processing file: 20.docx
Processing file: 21.docx
Processing file: 22.docx
Processing file: 23.docx
Processing file: 24.docx
Processing file: 25.docx
Processing file: 26.docx
Processing file: 27.docx
Processing file: 28.docx
Processing file: 29.docx
Processing file: 3.docx
Processing file: 30.docx
Processing file: 31.docx
Processing file: 32.docx
Processing file: 33.docx
Processing file: 34.docx
Processing file: 35.docx
Processing file: 36.docx
Processing file: 37.docx
Processing file: 39.docx
Processing file: 4.docx
Processing file: 40.docx
Processing file: 41.docx
Processing file: 42.docx
Processing file: 43.docx
Processing file: 44.docx
Processing file: 45.docx
Processing file: 46.docx
Proc

In [26]:
import os
import re
import json
import pandas as pd
from docx import Document
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
import joblib

def extract_json_from_docx(file_path):
    """
    Extracts the text of the first JSON array found in a .docx file.

    Paragraphs are scanned in order. Collection starts at the first non-empty
    paragraph that begins with '[' and stops at the end of the paragraph in
    which that opening bracket is balanced again. Brackets that appear inside
    JSON string literals are ignored, so a nested array that happens to end a
    paragraph (for example `"tags": ["a", "b"]`) does not cut the result short.

    Parameters:
    - file_path (str): Path of the .docx file to read.

    Returns:
    - str or None: The collected paragraphs joined with newlines (an empty
      string when no paragraph starts with '['), or None when the document
      cannot be opened.
    """
    try:
        doc = Document(file_path)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

    collected = []
    json_started = False
    depth = 0          # Count of '[' still open outside string literals
    in_string = False  # True while scanning inside a "..." literal
    escaped = False    # True when the previous character was a backslash inside a string

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue  # Skip empty paragraphs

        if not json_started:
            if not text.startswith('['):
                continue  # Still in the preamble before the JSON array
            json_started = True

        collected.append(text)

        # Track bracket nesting so only the bracket matching the opening '[' ends the scan
        for ch in text:
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
            elif ch == '"':
                in_string = True
            elif ch == '[':
                depth += 1
            elif ch == ']':
                depth -= 1

        # Paragraphs are joined with newlines and a raw newline is not allowed inside a
        # JSON string, so an unterminated literal must not leak into the next paragraph.
        in_string = False
        escaped = False

        if depth <= 0:
            break

    return '\n'.join(collected)

def parse_json_to_dataframe(json_str):
    """
    Turns a JSON document into a flattened pandas DataFrame.

    A top-level JSON array is normalized as-is; any other top-level value is
    wrapped in a one-element list first. Nested keys are joined with '.'.

    Parameters:
    - json_str (str): The JSON text to parse.

    Returns:
    - pd.DataFrame or None: The normalized data, or None when `json_str` is
      empty/None or is not valid JSON.
    """
    if not json_str:
        return None

    try:
        payload = json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"JSON decoding failed: {e}")
        return None

    # json_normalize expects a list of records, so a lone value is wrapped
    records = payload if isinstance(payload, list) else [payload]
    return pd.json_normalize(records, sep='.')

def process_docx_files(directory_path):
    """
    Builds one DataFrame from every .docx file in a directory.

    Files are visited in sorted (lexicographic) name order. Each file is read
    with extract_json_from_docx and parsed with parse_json_to_dataframe; files
    that do not yield a DataFrame are reported and skipped.

    Parameters:
    - directory_path (str): Directory to scan for .docx files.

    Returns:
    - pd.DataFrame: All parsed frames concatenated with a fresh index, or an
      empty DataFrame when nothing could be parsed.
    """
    frames = []

    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.docx'):
            continue

        source_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")
        frame = parse_json_to_dataframe(extract_json_from_docx(source_path))

        if frame is None:
            print(f"Failed to parse JSON in file: {filename}")
        else:
            frames.append(frame)

    if not frames:
        print("No DataFrames to concatenate.")
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

def encode_airline_names(df, top_n=10):
    """
    One-hot encodes 'airline.name', keeping only the most frequent airlines.

    The `top_n` most frequent values keep their own category and every other
    value (including missing ones) becomes 'Other'. The relabelled column is
    written back into `df` before the dummies are created, and the first dummy
    category is dropped.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - top_n (int): Number of most frequent airlines that keep their own category.

    Returns:
    - pd.DataFrame: A new frame with 'airline_*' dummy columns, or `df` itself
      when it has no 'airline.name' column.
    """
    column = 'airline.name'
    if column not in df.columns:
        return df

    frequent = df[column].value_counts().nlargest(top_n).index
    is_frequent = df[column].isin(frequent)
    df[column] = df[column].where(is_frequent, 'Other')

    encoded = pd.get_dummies(df, columns=[column], prefix='airline', drop_first=True)
    print(f"One-Hot Encoded 'airline.name' with top {top_n} categories.")
    return encoded

def encode_codeshare_airline_names(df, top_n=10):
    """
    One-hot encodes the codeshare airline name column, keeping the most frequent airlines.

    The column is looked up under several spellings and the first one present
    is used. Its `top_n` most frequent values keep their own category, all
    other values become 'Other', and the relabelled column is written back
    into `df` before the dummies are created (first category dropped).

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - top_n (int): Number of most frequent airlines that keep their own category.

    Returns:
    - pd.DataFrame: A new frame with 'codeshare_airline_*' dummy columns, or
      `df` itself when none of the candidate columns exists.
    """
    candidates = ('codeshared.airline.name', 'codeshare.airline.name', 'codeshare_airline.name')
    col = next((name for name in candidates if name in df.columns), None)

    if col is None:
        print("No 'codeshared.airline.name' column found to encode.")
        return df

    frequent = df[col].value_counts().nlargest(top_n).index
    df[col] = df[col].where(df[col].isin(frequent), 'Other')
    df = pd.get_dummies(df, columns=[col], prefix='codeshare_airline', drop_first=True)
    print(f"One-Hot Encoded '{col}' with top {top_n} categories.")
    return df

def encode_categorical_variables(df):
    """
    Converts the known categorical columns into numerical form.

    - 'status' is label-encoded into a new 'status_encoded' column.
    - 'departure.day_of_week' is one-hot encoded (first category dropped).
    - 'airline.name' is one-hot encoded for its 10 most frequent values via
      encode_airline_names.

    Each step is skipped when its column is absent.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.

    Returns:
    - pd.DataFrame: The DataFrame with the encodings applied.
    """
    # Label-encode the target-like 'status' column alongside the original
    if 'status' in df.columns:
        df['status_encoded'] = LabelEncoder().fit_transform(df['status'])
        print("Encoded 'status' column.")

    # Expand the weekday into dummy columns when it is already present
    if 'departure.day_of_week' in df.columns:
        df = pd.get_dummies(df, columns=['departure.day_of_week'], drop_first=True)
        print("One-Hot Encoded 'departure.day_of_week'.")

    # Airline names are high-cardinality, so only the top categories are kept
    return encode_airline_names(df, top_n=10)

def encode_all_categorical_variables(df, top_n=10):
    """
    Applies every categorical encoding step of the pipeline in sequence.

    Runs encode_categorical_variables first, then encode_codeshare_airline_names
    with the given `top_n`.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - top_n (int): Number of codeshare airlines that keep their own category.

    Returns:
    - pd.DataFrame: The encoded DataFrame.
    """
    base_encoded = encode_categorical_variables(df)
    return encode_codeshare_airline_names(base_encoded, top_n=top_n)

def drop_high_missing_columns(df, threshold=50):
    """
    Removes columns whose share of missing values is strictly above `threshold`.

    The columns are dropped from `df` in place and the same object is returned.

    Parameters:
    - df (pd.DataFrame): The input DataFrame (modified in place).
    - threshold (float): Maximum tolerated percentage (0-100) of missing values.

    Returns:
    - pd.DataFrame: `df` without the dropped columns.
    """
    missing_share = df.isna().mean().mul(100)
    too_sparse = missing_share > threshold
    cols_to_drop = missing_share.index[too_sparse].tolist()

    df.drop(columns=cols_to_drop, inplace=True)
    print(f"Dropped columns with >{threshold}% missing values: {cols_to_drop}")
    return df

def impute_datetime_columns(df, datetime_cols):
    """
    Converts the given columns to datetime and imputes missing times in a fixed hierarchy.

    1. Every column of `datetime_cols` that exists in `df` is parsed with the
       format '%Y-%m-%dT%H:%M:%S.%f'; values that do not match become NaT.
    2. Missing 'departure.estimatedTime' and 'arrival.estimatedTime' values are
       filled with the median of their column.
    3. Missing 'departure.actualTime' / 'arrival.actualTime' values are filled
       from the corresponding estimated-time column.

    Parameters:
    - df (pd.DataFrame): The input DataFrame (columns are replaced in place).
    - datetime_cols (list of str): Names of the columns to convert.

    Returns:
    - pd.DataFrame: `df` with converted and imputed columns.
    """
    for col in datetime_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], format='%Y-%m-%dT%H:%M:%S.%f', errors='coerce')
            print(f"Converted '{col}' to datetime.")

    # Results are assigned back throughout: fillna(inplace=True) on a column selection
    # is chained assignment and does not update the frame under Copy-on-Write.

    # Impute 'departure.estimatedTime' and 'arrival.estimatedTime' with median
    for col in ['departure.estimatedTime', 'arrival.estimatedTime']:
        if col in df.columns and df[col].isnull().any():
            median_time = df[col].median()
            df[col] = df[col].fillna(median_time)
            print(f"Imputed missing '{col}' with median time {median_time}.")

    # The estimates are complete now, so they can back-fill the actual times
    fallback_pairs = (
        ('departure.actualTime', 'departure.estimatedTime'),
        ('arrival.actualTime', 'arrival.estimatedTime'),
    )
    for actual_col, estimated_col in fallback_pairs:
        if actual_col in df.columns and estimated_col in df.columns:
            df[actual_col] = df[actual_col].fillna(df[estimated_col])
            print(f"Imputed missing '{actual_col}' with '{estimated_col}'.")

    return df

def impute_remaining_columns(df):
    """
    Fills the remaining gaps: medians for numerical columns, 'Unknown' for object columns.

    Numerical columns are those of dtype int64, float64 or uint8; categorical
    columns are those of dtype object. Columns without missing values are
    left untouched.

    Parameters:
    - df (pd.DataFrame): The input DataFrame (columns are replaced in place).

    Returns:
    - pd.DataFrame: `df` with the missing values imputed.
    """
    # Identify numerical and categorical columns
    numerical_cols = df.select_dtypes(include=['int64', 'float64', 'uint8']).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

    # Impute numerical columns with median.
    # Results are assigned back instead of using fillna(inplace=True) on a column
    # selection, which is chained assignment and a no-op under Copy-on-Write.
    for col in numerical_cols:
        if df[col].isnull().any():
            median_value = df[col].median()
            df[col] = df[col].fillna(median_value)
            print(f"Imputed missing values in numerical column '{col}' with median value {median_value}.")

    # Impute categorical columns with 'Unknown'
    for col in categorical_cols:
        if df[col].isnull().any():
            df[col] = df[col].fillna('Unknown')
            print(f"Imputed missing values in categorical column '{col}' with 'Unknown'.")

    return df

def remove_low_variance_features(df, threshold=0.0):
    """
    Drops numerical columns that VarianceThreshold rejects for the given threshold.

    Only columns of dtype int64, float64 or uint8 are examined; every other
    column is kept. In the result the non-numerical columns come first (in the
    order produced by Index.difference), followed by the surviving numerical
    columns.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - threshold (float): The variance threshold passed to VarianceThreshold.

    Returns:
    - pd.DataFrame: The filtered DataFrame, or `df` itself when there are no
      numerical columns or the selector raises ValueError.
    """
    numeric_cols = list(df.select_dtypes(include=['int64', 'float64', 'uint8']).columns)

    if len(numeric_cols) == 0:
        print("No numerical columns to apply VarianceThreshold.")
        return df

    selector = VarianceThreshold(threshold=threshold)

    # Fitting can fail (e.g. when no feature passes); fall back to the input in that case
    try:
        selector.fit(df[numeric_cols])
    except ValueError as e:
        print(f"VarianceThreshold failed: {e}")
        return df

    # The support mask is aligned with numeric_cols
    keep_mask = selector.get_support()
    features_to_keep = [numeric_cols[i] for i, keep in enumerate(keep_mask) if keep]

    # Everything that was never examined is carried over unchanged
    untouched_cols = df.columns.difference(numeric_cols)
    df_filtered = pd.concat([df[untouched_cols], df[features_to_keep]], axis=1)

    print(f"Removed low variance features. Remaining features: {len(df_filtered.columns)}")

    return df_filtered

def encode_and_save_label_encoder(df, column, encoder_filename):
    """
    Label-encodes one column and persists the fitted encoder with joblib.

    The encoded values are stored in a new '<column>_encoded' column. Nothing
    is encoded or written when `column` is not present in `df`.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the column.
    - column (str): The name of the column to encode.
    - encoder_filename (str): The filename to save the encoder.

    Returns:
    - pd.DataFrame: The DataFrame with the encoded column.
    """
    if column not in df.columns:
        return df

    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df[column])
    df[f'{column}_encoded'] = encoded_values

    joblib.dump(encoder, encoder_filename)
    print(f"Encoded '{column}' column and saved LabelEncoder as '{encoder_filename}'.")
    return df

def main():
    """
    Runs the flight-data cleaning pipeline end to end.

    Steps, in order: load every .docx file under 'ML-Proj-Dataset/Train/',
    drop columns with more than 50% missing values, convert and impute the
    datetime columns, impute the remaining columns, encode the categorical
    columns, remove low-variance numerical columns, derive the departure delay
    and temporal features, impute any missing delay with the median, save a
    LabelEncoder for 'status', and write 'final_cleaned_flight_data.csv'.

    Returns early (None) when the input directory does not exist or no data
    could be loaded.
    """
    # Specify the directory containing your .docx files
    directory_path = 'ML-Proj-Dataset/Train/'  # Replace with your actual path

    # Check if the directory exists
    if not os.path.isdir(directory_path):
        print(f"The directory '{directory_path}' does not exist. Please check the path.")
        return

    # Process the .docx files and get the combined DataFrame
    combined_df = process_docx_files(directory_path)

    if combined_df.empty:
        print("No data to process.")
        return

    print(f"\nInitial DataFrame shape: {combined_df.shape}")

    # Define datetime columns
    datetime_cols = [
        'departure.scheduledTime',
        'departure.estimatedTime',
        'departure.actualTime',
        'departure.estimatedRunway',
        'departure.actualRunway',
        'arrival.scheduledTime',
        'arrival.estimatedTime',
        'arrival.actualTime',
        'arrival.estimatedRunway',
        'arrival.actualRunway'
    ]

    # Drop columns with >50% missing values
    combined_df = drop_high_missing_columns(combined_df, threshold=50)

    # Convert and impute datetime columns
    combined_df = impute_datetime_columns(combined_df, datetime_cols)

    # Impute remaining numerical and categorical columns
    combined_df = impute_remaining_columns(combined_df)

    # Encode categorical variables with optimized methods
    combined_df = encode_all_categorical_variables(combined_df, top_n=10)

    # Remove low variance numerical features
    combined_df = remove_low_variance_features(combined_df, threshold=0.01)  # Adjust threshold as needed

    # Calculate departure delay if possible
    if 'departure.actualTime' in combined_df.columns and 'departure.scheduledTime' in combined_df.columns:
        combined_df['departure.delay_minutes'] = (combined_df['departure.actualTime'] - combined_df['departure.scheduledTime']).dt.total_seconds() / 60
        print("Calculated 'departure.delay_minutes'.")

    # Extract temporal features if needed
    if 'departure.scheduledTime' in combined_df.columns:
        combined_df['departure.day_of_week'] = combined_df['departure.scheduledTime'].dt.day_name()
        combined_df['departure.hour_of_day'] = combined_df['departure.scheduledTime'].dt.hour
        combined_df['departure.month'] = combined_df['departure.scheduledTime'].dt.month
        print("Extracted temporal features.")

        # One-Hot Encode temporal features
        combined_df = pd.get_dummies(combined_df, columns=['departure.day_of_week'], drop_first=True)
        print("One-Hot Encoded 'departure.day_of_week'.")

    # Remove low variance numerical features again after adding temporal features
    combined_df = remove_low_variance_features(combined_df, threshold=0.0)  # Remove features with zero variance

    # Impute 'departure.delay_minutes' with median if there are still missing values
    if 'departure.delay_minutes' in combined_df.columns and combined_df['departure.delay_minutes'].isnull().any():
        median_delay = combined_df['departure.delay_minutes'].median()
        # Assign back: fillna(inplace=True) on a column selection is chained assignment
        # and does not update the frame under Copy-on-Write.
        combined_df['departure.delay_minutes'] = combined_df['departure.delay_minutes'].fillna(median_delay)
        print(f"Imputed missing 'departure.delay_minutes' with median value {median_delay}.")

    # Save LabelEncoder for 'status'
    if 'status_encoded' in combined_df.columns:
        le_status = LabelEncoder()
        le_status.fit(combined_df['status'])
        joblib.dump(le_status, 'label_encoder_status.pkl')
        print("Saved LabelEncoder for 'status' column as 'label_encoder_status.pkl'.")

    # Save the final cleaned DataFrame
    combined_df.to_csv('final_cleaned_flight_data.csv', index=False)
    print("\nFinal cleaned DataFrame saved as 'final_cleaned_flight_data.csv'.")

    # Display final DataFrame information
    print("\nFinal DataFrame shape:", combined_df.shape)
    print("\nFinal DataFrame Info:")
    # info() prints its report itself and returns None, so it must not be wrapped in print()
    combined_df.info()

    print("\nMissing Values After All Processing:")
    print(combined_df.isnull().sum())

if __name__ == "__main__":
    main()


Processing file: 1.docx
Processing file: 10.docx
Processing file: 11.docx
Processing file: 12.docx
Processing file: 13.docx
Processing file: 14.docx
Processing file: 15.docx
Processing file: 16.docx
Processing file: 17.docx
Processing file: 18.docx
Processing file: 19.docx
Processing file: 2.docx
Processing file: 20.docx
Processing file: 21.docx
Processing file: 22.docx
Processing file: 23.docx
Processing file: 24.docx
Processing file: 25.docx
Processing file: 26.docx
Processing file: 27.docx
Processing file: 28.docx
Processing file: 29.docx
Processing file: 3.docx
Processing file: 30.docx
Processing file: 31.docx
Processing file: 32.docx
Processing file: 33.docx
Processing file: 34.docx
Processing file: 35.docx
Processing file: 36.docx
Processing file: 37.docx
Processing file: 39.docx
Processing file: 4.docx
Processing file: 40.docx
Processing file: 41.docx
Processing file: 42.docx
Processing file: 43.docx
Processing file: 44.docx
Processing file: 45.docx
Processing file: 46.docx
Proc

In [28]:
# Reload the cleaned flight data written by the pipeline and display it.
data = pd.read_csv('final_cleaned_flight_data.csv')
data

Unnamed: 0,airline.iataCode,airline.icaoCode,airline_airblue,airline_airsial,airline_british airways,airline_emirates,airline_flyjinnah,airline_klm,airline_oman air,airline_pakistan international airlines,...,departure.month,departure.scheduledTime,departure.terminal,flight.iataNumber,flight.icaoNumber,flight.number,status,status_encoded,type,departure.delay_minutes
0,sv,sva,False,False,False,False,False,False,False,False,...,7,2023-07-20 20:50:00,m,sv737,sva737,737,active,0,departure,-35.0
1,9p,fjl,False,False,False,False,True,False,False,False,...,7,2023-07-18 15:05:00,Unknown,9p843,fjl843,843,active,0,departure,0.0
2,9p,fjl,False,False,False,False,True,False,False,False,...,7,2023-07-23 09:50:00,Unknown,9p841,fjl841,841,active,0,departure,237370.0
3,pk,pia,False,False,False,False,False,False,False,True,...,7,2023-07-26 23:30:00,m,pk205,pia205,205,active,0,departure,21.0
4,er,sep,False,False,False,False,False,False,False,False,...,7,2023-07-20 11:35:00,m,er723,sep723,723,active,0,departure,340.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51567,9p,fjl,False,False,False,False,True,False,False,False,...,11,2023-11-16 09:50:00,Unknown,9p841,fjl841,841,active,0,departure,13.0
51568,sv,sva,False,False,False,False,False,False,False,False,...,11,2023-11-16 11:40:00,m,sv735,sva735,735,active,0,departure,8.0
51569,pk,pia,False,False,False,False,False,False,False,True,...,11,2023-11-27 10:50:00,m,pk203,pia203,203,active,0,departure,225.0
51570,pk,pia,False,False,False,False,False,False,False,True,...,11,2023-11-19 02:00:00,m,pk898,pia898,898,active,0,departure,540.0


In [30]:
# Count the missing values in each column of the reloaded data.
data.isna().sum()

airline.iataCode                              29
airline.icaoCode                               0
airline_airblue                                0
airline_airsial                                0
airline_british airways                        0
airline_emirates                               0
airline_flyjinnah                              0
airline_klm                                    0
airline_oman air                               0
airline_pakistan international airlines        0
airline_qatar airways                          0
airline_serene air                             0
arrival.estimatedTime                          0
arrival.iataCode                               0
arrival.icaoCode                               0
arrival.scheduledTime                          0
departure.actualRunway                     20582
departure.actualTime                           0
departure.day_of_week_Monday                   0
departure.day_of_week_Saturday                 0
departure.day_of_wee

In [34]:
# Load the flight data that was joined with the weather columns and display it.
data_conc = pd.read_csv('final_cleaned_flight_data_with_weather.csv')
data_conc

Unnamed: 0,airline.iataCode,airline.icaoCode,airline_airblue,airline_airsial,airline_british airways,airline_emirates,airline_flyjinnah,airline_klm,airline_oman air,airline_pakistan international airlines,...,status_encoded,type,departure.delay_minutes,Time,Temperature (°F),Dew Point (°F),Humidity (%),Wind Speed (mph),Pressure (in),Precipitation (in)
0,pk,pia,False,False,False,False,False,False,False,True,...,0,departure,35.0,,,,,,,
1,tk,thy,False,False,False,False,False,False,False,False,...,0,departure,35.0,,,,,,,
2,pk,pia,False,False,False,False,False,False,False,True,...,0,departure,0.0,,,,,,,
3,xy,kne,False,False,False,False,False,False,False,False,...,0,departure,-3.0,,,,,,,
4,ey,etd,False,False,False,False,False,False,False,False,...,0,departure,-3.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51567,xy,kne,False,False,False,False,False,False,False,False,...,0,departure,2.0,,,,,,,
51568,ey,etd,False,False,False,False,False,False,False,False,...,0,departure,0.0,,,,,,,
51569,kl,klm,False,False,False,False,False,True,False,False,...,0,departure,37.0,,,,,,,
51570,kl,klm,False,False,False,False,False,True,False,False,...,0,departure,4.0,,,,,,,
