In [26]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [27]:
# Load the dataset from a CSV located in the notebook's working directory.
# (The file name suggests the categorical columns were already encoded
# upstream -- TODO confirm against the preprocessing notebook.)
df_encoded = pd.read_csv(filepath_or_buffer='Thyroid_Diff_Encoded.csv')

In [28]:
# Per-column count of missing values; every count should be 0 before modelling.
df_encoded.isna().sum()

Age                                                 0
Gender                                              0
Smoking                                             0
Hx Smoking                                          0
Hx Radiothreapy                                     0
Focality                                            0
Risk                                                0
Response                                            0
Recurred                                            0
Thyroid Function_Clinical Hypothyroidism            0
Thyroid Function_Euthyroid                          0
Thyroid Function_Subclinical Hyperthyroidism        0
Thyroid Function_Subclinical Hypothyroidism         0
Physical Examination_Multinodular goiter            0
Physical Examination_Normal                         0
Physical Examination_Single nodular goiter-left     0
Physical Examination_Single nodular goiter-right    0
Adenopathy_Extensive                                0
Adenopathy_Left             

In [29]:
# Target vector: the 'Recurred' column.
y = df_encoded['Recurred']
# Feature matrix: every column except the target.
X = df_encoded.drop(columns='Recurred')

In [42]:
# Hold out 10% of the rows for testing. `stratify=y` keeps the class
# proportions of 'Recurred' the same in both partitions, and the fixed
# seed makes the split reproducible.
# NOTE(review): a 10% hold-out is a small test set, so any metric computed
# on it will be noisy -- consider cross-validation for a steadier estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.10,
)

In [43]:
# Report the shape of each partition followed by the normalised class
# distribution of the target in the training and test sets.
print(
    f"Eğitim seti boyutu: {X_train.shape}",
    f"Test seti boyutu: {X_test.shape}",
    f"\nEğitim setindeki 'Recurred' dağılımı:\n{y_train.value_counts(normalize=True)}",
    f"\nTest setindeki 'Recurred' dağılımı:\n{y_test.value_counts(normalize=True)}",
    sep="\n",
)

Eğitim seti boyutu: (344, 37)
Test seti boyutu: (39, 37)

Eğitim setindeki 'Recurred' dağılımı:
Recurred
0    0.718023
1    0.281977
Name: proportion, dtype: float64

Test setindeki 'Recurred' dağılımı:
Recurred
0    0.717949
1    0.282051
Name: proportion, dtype: float64


In [44]:
from imblearn.over_sampling import SMOTE

In [45]:
# Oversampler used to balance the training classes. The sampling strategy
# and neighbour count are the library defaults, written out for visibility;
# the seed makes the synthetic samples reproducible.
# NOTE(review): plain SMOTE interpolates between neighbours, so columns that
# look like 0/1 indicators (e.g. the "Thyroid Function_*" dummies) can receive
# fractional values -- confirm whether SMOTENC would be more appropriate.
smote = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=42)

In [46]:
# Show the raw class counts of the training target before oversampling.
print(
    "\n--- SMOTE Öncesi Eğitim Verisi Dağılımı ---",
    y_train.value_counts(),
    sep="\n",
)


--- SMOTE Öncesi Eğitim Verisi Dağılımı ---
Recurred
0    247
1     97
Name: count, dtype: int64


In [47]:
# Missing-value check on the training target (expected result: 0).
y_train.isna().sum()

np.int64(0)

In [48]:
# Missing-value check on the training features, one count per column.
X_train.isna().sum()

Age                                                 0
Gender                                              0
Smoking                                             0
Hx Smoking                                          0
Hx Radiothreapy                                     0
Focality                                            0
Risk                                                0
Response                                            0
Thyroid Function_Clinical Hypothyroidism            0
Thyroid Function_Euthyroid                          0
Thyroid Function_Subclinical Hyperthyroidism        0
Thyroid Function_Subclinical Hypothyroidism         0
Physical Examination_Multinodular goiter            0
Physical Examination_Normal                         0
Physical Examination_Single nodular goiter-left     0
Physical Examination_Single nodular goiter-right    0
Adenopathy_Extensive                                0
Adenopathy_Left                                     0
Adenopathy_No               

In [49]:
# Oversample the minority class on the TRAINING partition only; the test
# partition is left untouched so evaluation uses real rows.
# NOTE(review): the features are still unscaled here, so the neighbour
# search inside SMOTE is driven by whichever columns have the largest
# numeric range -- confirm whether scaling should precede resampling.
X_train_resampled, y_train_resampled = smote.fit_resample(X=X_train, y=y_train)

In [50]:
# Show the class counts of the training target after oversampling.
print(
    "\n--- SMOTE Sonrası Eğitim Verisi Dağılımı ---",
    y_train_resampled.value_counts(),
    sep="\n",
)


--- SMOTE Sonrası Eğitim Verisi Dağılımı ---
Recurred
0    247
1    247
Name: count, dtype: int64


In [51]:
from sklearn.preprocessing import StandardScaler

In [52]:
# Standardise the features (zero mean, unit variance per column).
scaler = StandardScaler()

# 1. Learn the scaling statistics ONLY from the balanced training data...
# NOTE(review): these statistics therefore include the synthetic SMOTE rows;
# confirm this is intended rather than fitting on the original X_train.
scaler.fit(X_train_resampled)
# ...and apply them to that same training data.
X_train_scaled = scaler.transform(X_train_resampled)

# 2. Apply the already-fitted scaler to the test data (never refit on it!).
X_test_scaled = scaler.transform(X_test)

In [53]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# 1. Choose and configure the model: a random forest with 100 trees and a
#    fixed seed for reproducibility.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# 2. Train on the balanced, scaled training data prepared above.
model.fit(X_train_scaled, y_train_resampled)

# 3. Predict on the held-out test data, which took no part in fitting the
#    sampler, the scaler or the model.
y_pred = model.predict(X_test_scaled)

# 4. Report accuracy and the per-class precision / recall / F1 table.
# NOTE(review): the recorded run shows 1.00 accuracy on only 39 test rows.
# A perfect score on such a small hold-out deserves a sanity check, e.g.
# whether a feature such as 'Response' effectively encodes the target, and
# a cross-validated estimate -- confirm before reporting this number.
print("\n--- Model Performansı ---")
print("Accuracy Score:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(
    "\nClassification Report:",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)


--- Model Performansı ---
Accuracy Score: 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        11

    accuracy                           1.00        39
   macro avg       1.00      1.00      1.00        39
weighted avg       1.00      1.00      1.00        39

