# Importing Libraries & Loading Data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the training CSV into a DataFrame, then preview its first three rows
dataset_path = '../Dataset/GUIDE_Train.csv'
df = pd.read_csv(dataset_path)
df.head(3)

# Handling Missing Values

In [None]:
# Separate categorical (object dtype) and numerical feature columns.
# The target is filtered out of both lists instead of being removed from one
# with list.remove(): that call raises ValueError when IncidentGrade is not an
# object column, and filtering also keeps the target out of the numerical
# features if it has a numeric dtype.
target_column = 'IncidentGrade'
categorical_columns = [
    col for col in df.select_dtypes(include=['object']).columns
    if col != target_column
]
numerical_columns = [
    col for col in df.select_dtypes(include=['number']).columns
    if col != target_column
]

In [None]:
from scipy.stats import f_oneway, chi2_contingency

# Features whose p-value falls below this level are treated as significantly
# related to IncidentGrade.
significance_level = 0.05

# Rows with a missing target cannot be assigned to any group, so only the
# non-null IncidentGrade classes are compared.
target_classes = df['IncidentGrade'].dropna().unique()

# ANOVA Test for numerical columns
significant_numerical = []
for num_col in numerical_columns:
    # f_oneway propagates NaN by default, which would make the p-value NaN and
    # silently mark the column as not significant; drop missing values per group.
    groups = [
        df.loc[df['IncidentGrade'] == cat, num_col].dropna()
        for cat in target_classes
    ]
    groups = [group for group in groups if len(group) > 0]
    if len(groups) < 2:
        # A one-way ANOVA needs at least two non-empty groups.
        continue
    f_stat, p_val = f_oneway(*groups)
    if p_val < significance_level:  # Significant relationship
        significant_numerical.append(num_col)

# Chi-Square Test for categorical columns
significant_categorical = []
for cat_col in categorical_columns:
    contingency_table = pd.crosstab(df[cat_col], df['IncidentGrade'])
    if min(contingency_table.shape) < 2:
        # Empty tables make chi2_contingency raise, and a single row/column
        # carries no association information; treat both as not significant.
        continue
    chi2, p_val, _, _ = chi2_contingency(contingency_table)
    if p_val < significance_level:  # Significant relationship
        significant_categorical.append(cat_col)

In [None]:
# Print the DataFrame summary (column dtypes, non-null counts, memory usage)
df.info()

In [None]:
# Count the null entries in every column and print the per-column totals
null_counts = df.isnull().sum()
print("Missing Values:\n", null_counts)

In [None]:
# Columns whose fraction of missing values exceeds this limit are discarded
missing_threshold = 0.5

# Fraction of null entries per column (mean of the boolean null mask)
missing_percentage = df.isnull().mean()

# Select the labels of the columns above the limit and drop them in place
above_threshold = missing_percentage.gt(missing_threshold)
columns_to_drop = missing_percentage.loc[above_threshold].index
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Recount the null entries per column now that sparse columns were dropped
remaining_null_counts = df.isnull().sum()
print("Missing Values:\n", remaining_null_counts)

In [None]:
# Show the (rows, columns) dimensions of the DataFrame at this point
df.shape