### Importing necessary Machine Learning modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, r2_score, mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt


### Loading the Datasets from path

In [2]:

# Read the two raw inputs. Both paths are relative to the notebook's working
# directory, so the files must sit next to the notebook (the saved output of this
# cell shows a FileNotFoundError for the CSV when it is missing).
accidents_df = pd.read_csv(filepath_or_buffer="AccidentsData.csv")
health_facilities_df = pd.read_excel(io="kenya-health-facilities-2017_08_02.xlsx")


FileNotFoundError: [Errno 2] No such file or directory: 'AccidentsData.csv'

### Display the first few rows of the datasets
#### Overview of the data and its structure

In [None]:

# Print the first rows of each frame, accidents first, to check how the columns were parsed
for frame in (accidents_df, health_facilities_df):
    print(frame.head())


: 

### Label encoding the target variable "GENDER"
### One-hot encoding the categorical columns with 'object' datatype

In [None]:
# Label-encode the target: GENDER is replaced in place by integer codes, and the
# fitted encoder keeps the original class names in `label_encoder.classes_`
label_encoder = LabelEncoder()
accidents_df['GENDER'] = label_encoder.fit_transform(accidents_df['GENDER'])

# Identify the categorical columns, i.e. everything still stored with 'object' dtype
# (GENDER has just been replaced by integer codes, so it is not among them)
categorical_columns = accidents_df.select_dtypes(include='object').columns

# Build (but do not yet fit) a transformer that one-hot encodes the categorical
# columns and passes every other column through unchanged
column_transformer = ColumnTransformer(
    [('onehot', OneHotEncoder(), categorical_columns)],
    remainder='passthrough',
)


: 

### Defining Feature and Target

In [None]:

# Target vector: the GENDER column of the accidents dataset
y_accidents = accidents_df['GENDER']
# Feature frame: every remaining column
X_accidents = accidents_df.drop(columns='GENDER')

# Fit the column transformer on the features and encode them in one pass
X_accidents_encoded = column_transformer.fit_transform(X_accidents)

: 

#### Splitting the dataset into training and test data

In [None]:

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_accidents_encoded,
    y_accidents,
    test_size=0.2,
    random_state=42,
)


: 

### Initialize and train the model 

In [None]:

# Fit a random-forest classifier on the training split (seeded for repeatable results);
# `fit` returns the estimator itself, so construction and training can be chained
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred = model.predict(X_test)


: 

### Model Evaluation

In [None]:

# Score the held-out predictions against the true labels
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report each metric under its own heading
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', class_report, sep='\n')


: 

### Confusion Matrix plot

In [None]:

# Draw the confusion matrix as an annotated heatmap on an explicit figure/axes pair;
# tick labels are the original class names kept by the fitted label encoder
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

: 

### Merging AccidentsDataset and Health_facilities datasets 

In [None]:
# Number of accident records per county (value_counts orders counties by frequency)
accidents_by_county = (
    accidents_df['COUNTY']
    .value_counts()
    .rename_axis('COUNTY')
    .reset_index(name='Number_of_Accidents')
)

# Number of health facilities per county, keyed by the same 'COUNTY' column name
health_facilities_by_county = (
    health_facilities_df['County']
    .value_counts()
    .rename_axis('COUNTY')
    .reset_index(name='Number_of_Health_Facilities')
)

# Left join: keep every county that has accidents; counties with no exact name match
# in the facilities table get NaN for Number_of_Health_Facilities
combined_df = accidents_by_county.merge(health_facilities_by_county, on='COUNTY', how='left')
# Display the combined data
print(combined_df.head())

: 

### Second data cleaning / processing

In [None]:

# Rebuild the per-county tables: one row per county with its record count.
# The two result columns are named explicitly so both tables share the 'COUNTY' key.
accidents_by_county = (
    accidents_df['COUNTY']
    .value_counts()
    .reset_index()
    .set_axis(['COUNTY', 'Number_of_Accidents'], axis=1)
)
health_facilities_by_county = (
    health_facilities_df['County']
    .value_counts()
    .reset_index()
    .set_axis(['COUNTY', 'Number_of_Health_Facilities'], axis=1)
)

# Join facility counts onto the accident counts, keeping every county that has accidents
combined_df = pd.merge(
    accidents_by_county,
    health_facilities_by_county,
    how='left',
    on='COUNTY',
)
# Show the first rows of the merged table
print(combined_df.head())



: 

In [None]:
# Columns excluded from the rest of the analysis
columns_to_remove = ['BASE/SUB BASE', 'PLACE', 'MV INVOLVED', 'BRIEF ACCIDENT DETAILS', 'NAME OF VICTIM', 'NO.', 'DATE']

# Drop them by rebinding instead of mutating in place. errors='ignore' skips columns
# that are already gone, so re-running this cell no longer raises KeyError.
accidents_df = accidents_df.drop(columns=columns_to_remove, errors='ignore')

# Replace every missing value with the placeholder category 'UNKNOWN'
accidents_df = accidents_df.fillna('UNKNOWN')

# One-hot encode the remaining 'object' columns for the pair plot.
# get_dummies returns a new frame, so no defensive copy of accidents_df is needed.
categorical_features = accidents_df.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(accidents_df, columns=categorical_features)

# Generate the pair plot.
# NOTE(review): pairplot draws one panel per pair of columns, so the figure grows
# quadratically with the number of dummy columns - confirm this is feasible for
# the cardinality of the categorical columns in this dataset.
sns.pairplot(df_encoded)
plt.show()

: 

In [None]:
# Encode target variable and categorical columns.
#
# The county-level merge produces `combined_df` with only COUNTY, Number_of_Accidents
# and Number_of_Health_Facilities, so selecting 'GENDER' from it raised KeyError.
# When the target is missing, attach the per-county counts to every accident record
# instead, which yields a frame that carries GENDER alongside the features.
if 'GENDER' not in combined_df.columns:
    combined_df = accidents_df.merge(combined_df, on='COUNTY', how='left')
    # Records whose county has no match get NaN counts from the left join;
    # use 0 so the estimator is not handed missing values.
    count_columns = ['Number_of_Accidents', 'Number_of_Health_Facilities']
    combined_df[count_columns] = combined_df[count_columns].fillna(0)

# NOTE: this refits the shared `label_encoder`, replacing the classes it learned earlier.
combined_df['GENDER'] = label_encoder.fit_transform(combined_df['GENDER'])
categorical_columns_combined = combined_df.select_dtypes(include=['object']).columns

# One-hot encode categorical columns; all other columns pass through unchanged
column_transformer_combined = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), categorical_columns_combined)
    ],
    remainder='passthrough'
)

: 

In [None]:

# Target vector and feature frame taken from the combined data
y_combined = combined_df['GENDER']
X_combined = combined_df.drop(columns='GENDER')

# Fit the combined column transformer and encode the features in one pass
X_combined_encoded = column_transformer_combined.fit_transform(X_combined)

# Hold out 20% of the rows for testing, with a fixed seed for a reproducible split
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(
    X_combined_encoded,
    y_combined,
    test_size=0.2,
    random_state=42,
)


: 

### Model initialization, training and making predictions

In [None]:
# Fit a seeded random-forest regressor on the combined training split.
# NOTE(review): the target here is the label-encoded GENDER column, so the regressor
# is fitting integer class codes - confirm that regression (not classification) is intended.
model_combined = RandomForestRegressor(random_state=42).fit(X_train_combined, y_train_combined)

# Predictions for the held-out combined rows
y_pred_combined = model_combined.predict(X_test_combined)



: 

### Evaluating the model

In [None]:
# Compare held-out predictions with the true targets: R2 and mean absolute error
r2_combined = r2_score(y_test_combined, y_pred_combined)
mae_combined = mean_absolute_error(y_test_combined, y_pred_combined)
print(
    f'R2 Score (combined): {r2_combined}',
    f'Mean Absolute Error (combined): {mae_combined}',
    sep='\n',
)

: 