# Importing libraries and dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# Load the dataset from the Excel workbook; the path is relative to the
# notebook's working directory.
data = pd.read_excel('Acoustic_Extinguisher_Fire_Dataset.xlsx')

# Exploratory Data Analysis

In [None]:
def plot_histograms(data):
    """Show the distribution of every column of ``data``.

    Two figures are drawn one after the other:

    * a row of histograms, one per numeric column (as selected by
      ``select_dtypes(include=['number'])``);
    * a row of bar charts of value counts, one per remaining column, with
      missing values counted under the label ``'Missing'``.

    A figure is skipped when the frame has no column of the corresponding
    kind, because ``plt.subplots`` rejects a grid with zero columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are plotted. It is not modified.

    Returns
    -------
    None
    """
    numeric_variables = data.select_dtypes(include=['number']).columns.tolist()
    categorical_variables = [i for i in data.columns if i not in numeric_variables]

    if numeric_variables:
        # squeeze=False always yields a 2-D array of axes, so a single column
        # needs no special-casing before iterating.
        fig, axs = plt.subplots(
            1, len(numeric_variables), figsize=(15, 5), squeeze=False
        )

        for ax, var in zip(axs.ravel(), numeric_variables):
            data[var].plot(kind='hist', ax=ax, bins=20, color='tab:blue', alpha=0.7)
            ax.set_title(f'Histogram of {var}')
            ax.set_xlabel(var)
            ax.set_ylabel('Frequency')

        fig.tight_layout()
        plt.show()

    if categorical_variables:
        fig, axs = plt.subplots(
            1, len(categorical_variables), figsize=(15, 5), squeeze=False
        )

        for ax, var in zip(axs.ravel(), categorical_variables):
            # Count missing entries as their own category instead of dropping them.
            value_counts = data[var].fillna('Missing').value_counts()
            value_counts.plot(kind='bar', ax=ax, color='tab:orange', alpha=0.7)
            ax.set_title(f'Bar Plot of {var}')
            ax.set_xlabel(var)
            ax.set_ylabel('Frequency')

            # Tilt the category labels so longer names do not overlap.
            ax.tick_params(axis='x', rotation=45)

        fig.tight_layout()
        plt.show()

plot_histograms(data)


# Feature Engineering

In [3]:
# Integer code assigned to each fuel label found in the 'FUEL' column.
fuel_mapping = {
    'gasoline': 1,
    'kerosene': 2,
    'lpg': 3,
    'thinner': 4
}

# Skip the work when the cell is re-run: after the first run 'FUEL' has
# already been replaced by 'FUEL_encoded'.
if 'FUEL_encoded' not in data.columns:
    fuel_encoded = data['FUEL'].map(fuel_mapping)

    # Series.map yields NaN for any label missing from the mapping; stop here
    # with a clear message rather than letting NaNs reach the model later.
    unmapped = data.loc[fuel_encoded.isna(), 'FUEL'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped FUEL values: {list(unmapped)}")

    data['FUEL_encoded'] = fuel_encoded
    data = data.drop('FUEL', axis=1)

## Train-Test Split

In [None]:
# Separate the feature columns from the 'STATUS' target.
X = data.drop(columns='STATUS')
y = data['STATUS']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Only the last expression of a cell is rendered, so the shapes are reported
# together here.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Preview the first rows of the training features.
X_train.head()

## Target Balance

In [None]:
# Number of training samples in class 0 and in class 1, in that order.
tuple((y_train == label).sum() for label in (0, 1))

In [None]:
# Bar chart of how many training samples fall in each target class.
fig, ax = plt.subplots(figsize=(6, 4))

# value_counts() orders by frequency, so sort by class label to get a stable
# bar order; the tick labels then come from the index and always match the
# bar they sit under.
value_counts = y_train.value_counts().sort_index()
value_counts.plot(kind='bar', ax=ax, rot=0)
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of the target variable')

# Add count on top of each bar
for index, value in enumerate(value_counts):
    ax.text(index, value + 0.1, str(value), ha='center', va='bottom')

plt.show()

## Scaling/Transformations

In [8]:
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted statistics are then applied to the test split.
scaler = StandardScaler()

# The scaler returns plain arrays by default, so the column labels and the
# original row index are restored explicitly; keeping the index leaves the
# feature frames aligned with y_train / y_test.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
# Preview the training features again, now that the scaling cell has replaced X_train.
X_train.head()

In [10]:
# Tabulate the fitted scaler's per-feature mean and variance: build one row
# per quantity, then transpose so that each feature becomes a row.
stats = pd.DataFrame(
    [X_train.columns, scaler.mean_, scaler.var_],
    index=['Feature', 'Mean', 'Variance'],
).T

# Persist the statistics (row index included) next to the notebook.
stats.to_csv('stats.csv', index=True)

# Feature Selection

### Correlation Values

In [None]:
# Pairwise Pearson correlation between the numeric training features.
numeric_variables = X_train.select_dtypes(include=['number']).columns
correlation_matrix = X_train.loc[:, numeric_variables].corr(method='pearson')

# Annotated heatmap; the diverging palette is centred on zero correlation.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Pearson Correlation Matrix for All Numeric Variables')
plt.show()


No pair of features has a Pearson correlation high enough to justify removing either of them.

### RFE

In [None]:
# Recursive feature elimination with cross-validation: drop one feature per
# step and score each candidate subset by accuracy over 5 stratified folds.
model = LogisticRegression(max_iter=1000)

cv = StratifiedKFold(5)

rfecv = RFECV(estimator=model, step=1, cv=cv, scoring='accuracy')

rfecv.fit(X_train, y_train)

# support_ is a boolean mask over the columns of the frame passed to fit(),
# so it is applied to that frame's own columns rather than to the unsplit X.
selected_features = X_train.columns[rfecv.support_]

print("Optimal number of features:", rfecv.n_features_)

# Keep only the selected features in both splits.
X_train = X_train[selected_features]
X_test = X_test[selected_features]

# Exporting the final dataframes

In [13]:
# Replace the row labels with a fresh 0..n-1 index so that features and target
# line up row by row when concatenated side by side.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Append the target as the last column of each split.
dataTrain = pd.concat([X_train, y_train], axis=1)
dataTest = pd.concat([X_test, y_test], axis=1)

# Write both splits to CSV without the row index.
for frame, path in ((dataTrain, 'dataTrain.csv'), (dataTest, 'dataTest.csv')):
    frame.to_csv(path, index=False)