In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [36]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data"


In [37]:
columns = ['erythema', 'scaling', 'definite_borders', 'itching', 'koebner_phenomenon',
           'polygonal_papules', 'follicular_papules', 'oral_mucosal_involvement',
           'knee_elbow_involvement', 'scalp_involvement', 'family_history', 'age',
           'melanin_incontinence', 'eosinophils_infiltrate', 'PNL_infiltrate',
           'fibrosis_papillary_dermis', 'exocytosis', 'acanthosis', 'hyperkeratosis',
           'parakeratosis', 'clubbing_rete_ridges', 'elongation_rete_ridges',
           'thinning_suprapapillary_epidermis', 'spongiform_pustule',
           'munro_microabcess', 'focal_hypergranulosis', 'disappearance_granular_layer',
           'vacuolisation_damage_basal_layer', 'spongiosis', 'saw_tooth_appearance_retes',
           'follicular_horn_plug', 'perifollicular_parakeratosis',
           'inflammatory_mononuclear_infiltrate', 'band_like_infiltrate', 'class']


In [38]:
df = pd.read_csv(url, names = columns)

In [39]:
print(f"Shape: {df.shape} (rows × columns)")
print(df.head())
missing_count = (df == '?').sum().sum()
print(f"Total missing values found: {missing_count}")

Shape: (366, 35) (rows × columns)
   erythema  scaling  definite_borders  itching  koebner_phenomenon  \
0         2        2                 0        3                   0   
1         3        3                 3        2                   1   
2         2        1                 2        3                   1   
3         2        2                 2        0                   0   
4         2        3                 2        2                   2   

   polygonal_papules  follicular_papules  oral_mucosal_involvement  \
0                  0                   0                         0   
1                  0                   0                         0   
2                  3                   0                         3   
3                  0                   0                         0   
4                  2                   0                         2   

   knee_elbow_involvement  scalp_involvement  ...  focal_hypergranulosis  \
0                       1                 

In [40]:
df = df.replace('?',np.nan)
df = df.apply(pd.to_numeric)
df = df.dropna()

In [41]:
x = df.drop('class',axis=1)
y = df['class']

In [42]:
class_dist = df['class'].value_counts().sort_index()
max_class = class_dist.max()
min_class = class_dist.min()
balance_ratio = max_class / min_class
print(f"\nBalance Ratio (max/min): {balance_ratio:.2f}")
if balance_ratio < 2:
    print("✓ Dataset is reasonably balanced")
else:
    print("⚠ Dataset is imbalanced - consider using class weights")


Balance Ratio (max/min): 5.55
⚠ Dataset is imbalanced - consider using class weights


In [43]:
x_train, x_test, y_train, y_split = train_test_split(x,y,test_size =0.2,random_state=42,stratify=y)

In [44]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train.values)
x_test_scaled = scaler.transform(x_test.values)

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

In [46]:
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100,random_state = 42,class_weight='balanced'),
    'SVM': SVC(kernel ='rbf',random_state =42,class_weight='balanced')
}

In [47]:
results = {}
for name, model in models.items():
  model.fit(x_train_scaled,y_train)
  y_pred = model.predict(x_test_scaled)
  accuracy = accuracy_score(y_split,y_pred)
  results[name] = {
      'model':model,
      'predictions':y_pred,
      'accuracy': accuracy
  }
  print(f"{name}: {accuracy: .4f}")


Random Forest:  0.9861
SVM:  0.9861


In [48]:
for name, res in results.items():
    print(f"{name:<20} {res['accuracy']:<10.4f}")

# Select best model
best_model_name = max(results, key=lambda x: results[x]['accuracy'])
best_model = results[best_model_name]['model']
best_pred = results[best_model_name]['predictions']

Random Forest        0.9861    
SVM                  0.9861    


In [49]:
disease_names = [
    'Psoriasis',
    'Seborrheic dermatitis',
    'Lichen planus',
    'Pityriasis rosea',
    'Chronic dermatitis',
    'Pityriasis rubra pilaris'
]


In [50]:
print(f'--Classification Report ({best_model_name}---)')
print(classification_report(y_split,best_pred,target_names = disease_names))

--Classification Report (Random Forest---)
                          precision    recall  f1-score   support

               Psoriasis       1.00      1.00      1.00        22
   Seborrheic dermatitis       1.00      0.92      0.96        12
           Lichen planus       1.00      1.00      1.00        14
        Pityriasis rosea       1.00      1.00      1.00        10
      Chronic dermatitis       1.00      1.00      1.00        10
Pityriasis rubra pilaris       0.80      1.00      0.89         4

                accuracy                           0.99        72
               macro avg       0.97      0.99      0.97        72
            weighted avg       0.99      0.99      0.99        72



In [51]:
cm = confusion_matrix(y_split, best_pred)
print("\nRows = Actual, Columns = Predicted")
print(pd.DataFrame(cm,
                   index=[f'Act-{i+1}' for i in range(6)],
                   columns=[f'Pred-{i+1}' for i in range(6)]))

print("\n" + "="*80)


Rows = Actual, Columns = Predicted
       Pred-1  Pred-2  Pred-3  Pred-4  Pred-5  Pred-6
Act-1      22       0       0       0       0       0
Act-2       0      11       0       0       0       1
Act-3       0       0      14       0       0       0
Act-4       0       0       0      10       0       0
Act-5       0       0       0       0      10       0
Act-6       0       0       0       0       0       4



In [52]:
from sklearn.model_selection import cross_val_score

for name, res in results.items():
    cv_scores = cross_val_score(
        res['model'],
        x_train_scaled,
        y_train,
        cv=10,
        scoring='accuracy'
    )
    print(f"\n{name}:")
    print(f"  CV Scores: {[f'{s:.3f}' for s in cv_scores]}")
    print(f"  Mean: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


Random Forest:
  CV Scores: ['1.000', '0.966', '0.931', '1.000', '0.966', '0.966', '0.964', '0.964', '0.929', '0.964']
  Mean: 0.9649 ± 0.0222

SVM:
  CV Scores: ['1.000', '0.966', '1.000', '1.000', '0.966', '0.931', '0.929', '0.964', '0.964', '0.964']
  Mean: 0.9683 ± 0.0246


In [53]:
if 'Random Forest' in results:
  rf_model = results['Random Forest']['model']

In [54]:
feature_importance = pd.DataFrame({
    'Feature': x.columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending = False)

In [55]:
print("\nTop 10 Most Important Features:")
print("-" * 50)
for idx, row in feature_importance.head(10).iterrows():
    bar_length = int(row['Importance'] * 50)
    bar = '█' * bar_length
    print(f"{row['Feature']:<35} {bar} {row['Importance']:.4f}")

print("\n These features are most useful for diagnosis!")


Top 10 Most Important Features:
--------------------------------------------------
PNL_infiltrate                      ████ 0.0947
vacuolisation_damage_basal_layer    ██ 0.0593
follicular_horn_plug                ██ 0.0572
koebner_phenomenon                  ██ 0.0562
elongation_rete_ridges              ██ 0.0553
disappearance_granular_layer        ██ 0.0528
parakeratosis                       ██ 0.0489
saw_tooth_appearance_retes          ██ 0.0460
clubbing_rete_ridges                ██ 0.0429
follicular_papules                  ██ 0.0412

 These features are most useful for diagnosis!


In [56]:
rf_param_grid = {
    'n_estimators': [50,100,150],
    'max_depth': [10,20,None],
    'min_samples_split': [2,5],
    'class_weight': ['balanced']
}

In [57]:
from sklearn.model_selection import GridSearchCV
rf_grid = GridSearchCV (
    RandomForestClassifier(random_state=42),
    rf_param_grid,cv=5,scoring="accuracy",n_jobs=-1,verbose=0
)

In [58]:
rf_grid.fit(x_train_scaled, y_train)

0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'class_weight': ['balanced'], 'max_depth': [10, 20, ...], 'min_samples_split': [2, 5], 'n_estimators': [50, 100, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,150
,criterion,'gini'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [59]:
print(f"\nBest parameters:")
for param, value in rf_grid.best_params_.items():
    print(f"  {param}: {value}")


Best parameters:
  class_weight: balanced
  max_depth: 20
  min_samples_split: 2
  n_estimators: 150


In [60]:
rf_tuned_score = rf_grid.score(x_test_scaled, y_split)
print(f"\nBefore tuning: {results['Random Forest']['accuracy']:.4f}")
print(f"After tuning:  {rf_tuned_score:.4f}")
print(f"Improvement:   {rf_tuned_score - results['Random Forest']['accuracy']:+.4f}")



Before tuning: 0.9861
After tuning:  0.9861
Improvement:   +0.0000


In [61]:
svm_param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 0.001, 0.01],
    'class_weight': ['balanced']
}

svm_grid = GridSearchCV(
    SVC(kernel='rbf', random_state=42),
    svm_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=0
)

In [62]:
svm_grid.fit(x_train_scaled, y_train)

0,1,2
,estimator,SVC(random_state=42)
,param_grid,"{'C': [0.1, 1, ...], 'class_weight': ['balanced'], 'gamma': ['scale', 0.001, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,1
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [63]:
for param, value in svm_grid.best_params_.items():
    print(f"  {param}: {value}")

  C: 1
  class_weight: balanced
  gamma: scale


In [64]:
svm_tuned_score = svm_grid.score(x_test_scaled, y_split)
print(f"\nBefore tuning: {results['SVM']['accuracy']:.4f}")
print(f"After tuning:  {svm_tuned_score:.4f}")
print(f"Improvement:   {svm_tuned_score - results['SVM']['accuracy']:+.4f}")

print("\n" + "="*80)


Before tuning: 0.9861
After tuning:  0.9861
Improvement:   +0.0000



In [65]:
if rf_tuned_score > svm_tuned_score:
    final_model = rf_grid.best_estimator_
    final_name = "Random Forest"
    final_score = rf_tuned_score
else:
    final_model = svm_grid.best_estimator_
    final_name = "SVM"
    final_score = svm_tuned_score

print(f"\n🏆 FINAL BEST MODEL: {final_name}")
print(f"Test Accuracy: {final_score:.4f}")


🏆 FINAL BEST MODEL: SVM
Test Accuracy: 0.9861


In [66]:
final_pred = final_model.predict(x_test_scaled)

In [67]:
print("\n--- Final Classification Report ---")
print(classification_report(y_split, final_pred, target_names=disease_names))



--- Final Classification Report ---
                          precision    recall  f1-score   support

               Psoriasis       1.00      1.00      1.00        22
   Seborrheic dermatitis       1.00      0.92      0.96        12
           Lichen planus       1.00      1.00      1.00        14
        Pityriasis rosea       0.91      1.00      0.95        10
      Chronic dermatitis       1.00      1.00      1.00        10
Pityriasis rubra pilaris       1.00      1.00      1.00         4

                accuracy                           0.99        72
               macro avg       0.98      0.99      0.98        72
            weighted avg       0.99      0.99      0.99        72



In [68]:
import joblib

# Save model and scaler
joblib.dump(final_model, 'skin_disease_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

# Later, to load and use:
model = joblib.load('skin_disease_model.pkl')
scaler = joblib.load('scaler.pkl')

# Make prediction for new patient
# Replace the list of zeros below with the actual 34 feature values for a new patient
new_patient = [[0] * 34]
scaled_data = scaler.transform(new_patient)
prediction = model.predict(scaled_data)
print(f"Predicted disease: {disease_names[prediction[0]-1]}")

Predicted disease: Pityriasis rosea
