In [5]:
import ast

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the training CSV into a DataFrame and preview its first rows.
# The path is relative, so the file is looked up in the kernel's working directory.
file_path = 'generated_training_dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,gender,description,provided_symptoms,priority_status
0,Female,"I was at work and suddenly felt bleeding, indi...","['bleeding', 'indigestion', 'vomiting', 'broke...",non-priority
1,Female,I was at the gym when I started to feel broken...,"['broken arm', 'vomiting', 'bleeding']",non-priority
2,Female,"I was driving when I started to feel jaw pain,...","['jaw pain', 'nausea', 'broken arm']",non-priority
3,Female,"While walking, I experienced nausea, bleeding,...","['nausea', 'bleeding', 'broken leg', 'broken a...",non-priority
4,Male,"I was sitting at home when I felt indigestion,...","['indigestion', 'broken arm', 'sweating']",non-priority


In [4]:
# Class balance of the target column: number of rows per priority_status value.
priority_counts = df['priority_status'].value_counts()
priority_counts

priority_status
non-priority    300
priority        150
Name: count, dtype: int64

# Preprocess, Split, and Train

In [6]:
# Combine description and provided_symptoms into a single text feature.
# provided_symptoms holds the string repr of a Python list, so it has to be
# parsed back into a list. ast.literal_eval only accepts Python literals,
# whereas eval would execute any code embedded in the CSV.
symptom_text = df['provided_symptoms'].apply(
    lambda raw: ' '.join(ast.literal_eval(raw))
)
df['text'] = df['description'] + ' ' + symptom_text

# Encode the categorical columns. Each column gets its own encoder so that
# fitting the target does not overwrite the gender mapping; both encoders stay
# usable for inverse_transform / classes_ lookups afterwards.
gender_encoder = LabelEncoder()
df['gender_encoded'] = gender_encoder.fit_transform(df['gender'])
label_encoder = LabelEncoder()
df['priority_status_encoded'] = label_encoder.fit_transform(df['priority_status'])

# Split the data into training and testing sets (70/30, fixed seed).
# gender_encoded travels with X only so the evaluation cells can slice the
# test set by gender; it is not passed to the model below.
X = df[['text', 'gender_encoded']]
y = df['priority_status_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# TF-IDF vectorize the text and classify with logistic regression.
# The pipeline is fit on the text column alone.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', LogisticRegression())
])

# Train the model
pipeline.fit(X_train['text'], y_train)

# Predict the priority_status for the test set
y_pred = pipeline.predict(X_test['text'])

# Evaluate Results

In [9]:
# Per-class precision / recall / F1 on the held-out test split.
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)

Classification Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94        92
           1       1.00      0.74      0.85        43

    accuracy                           0.92       135
   macro avg       0.95      0.87      0.90       135
weighted avg       0.93      0.92      0.91       135



In [13]:
# Overall confusion matrix on the full test split (not split by gender).
# labels=[0, 1] fixes the row/column order to the encoded target classes.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Wrap the raw counts in a DataFrame with readable row/column labels.
class_names = ['Non-Priority', 'Priority']
conf_matrix_df = pd.DataFrame(
    conf_matrix,
    index=['Actual ' + name for name in class_names],
    columns=['Predicted ' + name for name in class_names],
)

print("Confusion Matrix:\n", conf_matrix_df)

Confusion Matrix:
                      Predicted Non-Priority  Predicted Priority
Actual Non-Priority                      92                   0
Actual Priority                          11                  32


In [15]:
# Confusion matrix by gender
# gender_encoded was produced by a LabelEncoder, which numbers classes in
# sorted order -- presumably 0 = Female and 1 = Male; verify against the data.
# Masks are converted to plain boolean arrays so the same positional mask can
# index both the y_test Series and the y_pred ndarray.
female_mask = (X_test['gender_encoded'] == 0).to_numpy()
male_mask = (X_test['gender_encoded'] == 1).to_numpy()

y_test_female = y_test[female_mask]
y_pred_female = y_pred[female_mask]
y_test_male = y_test[male_mask]
y_pred_male = y_pred[male_mask]

# Pin labels=[0, 1] (as the overall matrix does) so each result is always 2x2,
# even if a subgroup happens to contain only one class; otherwise the matrix
# shrinks and the 2x2 labelling in the following cells fails.
conf_matrix_female = confusion_matrix(y_test_female, y_pred_female, labels=[0, 1])
conf_matrix_male = confusion_matrix(y_test_male, y_pred_male, labels=[0, 1])

In [16]:
# Label the female-only confusion matrix (rows = actual, columns = predicted).
outcome_names = ['Non-Priority', 'Priority']
conf_matrix_female_df = pd.DataFrame(
    conf_matrix_female,
    index=['Actual ' + name for name in outcome_names],
    columns=['Predicted ' + name for name in outcome_names],
)
print("Confusion Matrix for Females:\n", conf_matrix_female_df)

Confusion Matrix for Females:
                      Predicted Non-Priority  Predicted Priority
Actual Non-Priority                      43                   0
Actual Priority                          11                   7


In [17]:
# Label the male-only confusion matrix (rows = actual, columns = predicted).
male_row_labels = ['Actual Non-Priority', 'Actual Priority']
male_col_labels = ['Predicted Non-Priority', 'Predicted Priority']
conf_matrix_male_df = pd.DataFrame(
    data=conf_matrix_male,
    index=male_row_labels,
    columns=male_col_labels,
)
print("\nConfusion Matrix for Males:\n", conf_matrix_male_df)


Confusion Matrix for Males:
                      Predicted Non-Priority  Predicted Priority
Actual Non-Priority                      49                   0
Actual Priority                           0                  25
