In [29]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.pipeline import Pipeline

In [31]:
# Load the dataset
data = pd.read_csv("C:\\Users\\surut\\OneDrive\\Documents\\DataScience\\Testing.csv")
data.head()

Unnamed: 0,itching,skin_rash,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,prognosis
0,1,1,0,0,0,0,0,0,Fungal infection
1,0,0,1,1,1,0,0,0,Allergy
2,0,0,0,0,0,0,1,1,GERD
3,1,0,0,0,0,0,0,0,Chronic cholestasis
4,1,1,0,0,0,0,1,0,Drug Reaction


In [41]:
# Preprocessing
# Assume 'prognosis' is the target variable and the rest are features
X = data.drop('prognosis', axis=1)
y = data['itching']

In [43]:
# Handle missing values by replacing them with 0 (or use an appropriate strategy)
X.fillna(0, inplace=True)

# Since 'prognosis' is categorical, we'll one-hot encode it later
# If all the symptom columns are numeric (e.g., 0/1 for presence/absence), we only need to scale them
numeric_cols = X.columns

In [45]:
# Preprocess the data: Scale numeric columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols)
    ])

In [47]:
# Create a pipeline to streamline preprocessing and model training
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', LogisticRegression())])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model
model_pipeline.fit(X_train, y_train)

# Predictions
y_pred = model_pipeline.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 1.00


In [49]:
# Detailed report
print(classification_report(y_test, y_pred))

# AUC-ROC Score (Only for binary classification; otherwise, this will need adjustment)
try:
    roc_auc = roc_auc_score(y_test, model_pipeline.predict_proba(X_test)[:, 1])
    print(f'AUC-ROC Score: {roc_auc:.2f}')
except:
    print("AUC-ROC Score is only applicable for binary classification.")

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         1

    accuracy                           1.00        13
   macro avg       1.00      1.00      1.00        13
weighted avg       1.00      1.00      1.00        13

AUC-ROC Score: 1.00
