In [9]:
import pandas as pd
file_path = "/content/dermatology_database_1.csv"
df = pd.read_csv(file_path)
df.info(), df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 35 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   erythema                             366 non-null    int64 
 1   scaling                              366 non-null    int64 
 2   definite_borders                     366 non-null    int64 
 3   itching                              366 non-null    int64 
 4   koebner_phenomenon                   366 non-null    int64 
 5   polygonal_papules                    366 non-null    int64 
 6   follicular_papules                   366 non-null    int64 
 7   oral_mucosal_involvement             366 non-null    int64 
 8   knee_and_elbow_involvement           366 non-null    int64 
 9   scalp_involvement                    366 non-null    int64 
 10  family_history                       366 non-null    int64 
 11  melanin_incontinence                 366 non-

(None,
    erythema  scaling  definite_borders  itching  koebner_phenomenon  \
 0         2        2                 0        3                   0   
 1         3        3                 3        2                   1   
 2         2        1                 2        3                   1   
 3         2        2                 2        0                   0   
 4         2        3                 2        2                   2   
 
    polygonal_papules  follicular_papules  oral_mucosal_involvement  \
 0                  0                   0                         0   
 1                  0                   0                         0   
 2                  3                   0                         3   
 3                  0                   0                         0   
 4                  2                   0                         2   
 
    knee_and_elbow_involvement  scalp_involvement  ...  \
 0                           1                  0  ...   
 1             

In [10]:
df["age"] = pd.to_numeric(df["age"], errors="coerce")
df["age"].fillna(df["age"].median(), inplace=True)
X = df.drop(columns=["class"])
y = df["class"]
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
svm_model = SVC(kernel="linear", random_state=42)
svm_model.fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)
svm_acc = accuracy_score(y_test, svm_preds)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)
rf_acc = accuracy_score(y_test, rf_preds)

svm_acc, rf_acc


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(df["age"].median(), inplace=True)


(0.972972972972973, 0.9594594594594594)

In [11]:
Q1 = X.quantile(0.25)
Q3 = X.quantile(0.75)
IQR = Q3 - Q1
outlier_mask = ((X < (Q1 - 1.5 * IQR)) | (X > (Q3 + 1.5 * IQR))).any(axis=1)
X_clean = X[~outlier_mask]
y_clean = y[~outlier_mask]
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=42
)
svm_model.fit(X_train_clean, y_train_clean)
svm_preds_clean = svm_model.predict(X_test_clean)
svm_acc_clean = accuracy_score(y_test_clean, svm_preds_clean)

rf_model.fit(X_train_clean, y_train_clean)
rf_preds_clean = rf_model.predict(X_test_clean)
rf_acc_clean = accuracy_score(y_test_clean, rf_preds_clean)

svm_acc_clean, rf_acc_clean


(1.0, 1.0)

In [12]:
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=42, stratify=None
)
svm_model.fit(X_train_clean, y_train_clean)
svm_preds_clean = svm_model.predict(X_test_clean)
svm_acc_clean = accuracy_score(y_test_clean, svm_preds_clean)

rf_model.fit(X_train_clean, y_train_clean)
rf_preds_clean = rf_model.predict(X_test_clean)
rf_acc_clean = accuracy_score(y_test_clean, rf_preds_clean)

svm_acc_clean, rf_acc_clean


(1.0, 1.0)

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=42, stratify=None
)
svm_model.fit(X_train_clean, y_train_clean)
svm_preds_clean = svm_model.predict(X_test_clean)
svm_acc_clean = accuracy_score(y_test_clean, svm_preds_clean)

rf_model.fit(X_train_clean, y_train_clean)
rf_preds_clean = rf_model.predict(X_test_clean)
rf_acc_clean = accuracy_score(y_test_clean, rf_preds_clean)

print("SVM Accuracy after removing outliers:", svm_acc_clean)
print("RF Accuracy after removing outliers:", rf_acc_clean)


SVM Accuracy after removing outliers: 1.0
RF Accuracy after removing outliers: 1.0
