In [None]:
import pandas as pd

In [None]:
# Load the dataset from the CSV file and preview its first rows.
df = pd.read_csv('16_data.csv')
df.head()

Unnamed: 0,age,surgery_year,positive_nodes,target
0,30,1964,1,0
1,30,1962,3,0
2,30,1965,0,0
3,31,1959,2,0
4,31,1965,4,0


In [None]:
# Count the rows in each target class to see how balanced the classes are.
df['target'].value_counts()

0    225
1     81
Name: target, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# Seed shared by the random_state arguments used in this notebook.
RANDOM_SEED = 1

# Hold out 20% of the rows for testing; stratifying on the target keeps the
# class proportions the same in the train and test parts.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['target'],
    random_state=RANDOM_SEED,
)

In [None]:
from sklearn.utils import resample

# Split the training frame by class: target == 1 is the smaller (minority)
# class, target == 0 the larger (majority) one.
df_min = df_train.loc[df_train['target'] == 1]
df_maj = df_train.loc[df_train['target'] == 0]

# Oversample: draw minority rows with replacement until there are as many of
# them as there are majority rows.
df_min_upsample = resample(
    df_min,
    n_samples=len(df_maj),
    replace=True,
    random_state=RANDOM_SEED,
)

# Both frames should now have the same number of rows.
print(df_min_upsample.shape)
print(df_maj.shape)


(179, 4)
(179, 4)


In [None]:
# Stack the oversampled minority rows on top of the majority rows, then shuffle.
# The shuffle is seeded: row order determines which rows each tree's bootstrap
# sample picks, so an unseeded shuffle makes the model's score change between
# runs even though every other step uses RANDOM_SEED.
df_1 = (
    pd.concat([df_min_upsample, df_maj], ignore_index=True)
    .sample(frac=1., random_state=RANDOM_SEED)
)
df_1

Unnamed: 0,age,surgery_year,positive_nodes,target
156,53,1959,3,1
136,53,1965,1,1
221,51,1959,1,0
196,49,1966,0,0
330,42,1960,1,0
...,...,...,...,...
213,47,1960,4,0
45,55,1968,15,1
18,43,1958,52,1
345,61,1959,0,0


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Feature columns fed to the model and the name of the label column.
features = ['age', 'surgery_year', 'positive_nodes']
target = 'target'

# Fit a small, shallow random forest on the oversampled training set.
X_train = df_1[features]
y_train = df_1[target]
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=3,
    random_state=RANDOM_SEED,
).fit(X_train, y_train)

# Evaluate on the hold-out split: ROC AUC of the predicted probability of
# class 1.
X_test = df_test[features]
y_test = df_test[target]
print(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

0.5876358695652174


In [None]:
# Undersample: draw majority rows without replacement until only as many
# remain as there are minority rows.
df_maj_downsample = resample(
    df_maj,
    n_samples=len(df_min),
    replace=False,
    random_state=RANDOM_SEED,
)

# Both frames should now have the same number of rows.
print(df_maj_downsample.shape)
print(df_min.shape)

(65, 4)
(65, 4)


In [None]:
# Combine the undersampled majority rows with all minority rows and shuffle.
# The shuffle is seeded: row order determines which rows each tree's bootstrap
# sample picks, so an unseeded shuffle makes the printed score change between
# runs even though the model itself uses RANDOM_SEED.
df_2 = (
    pd.concat([df_maj_downsample, df_min], ignore_index=True)
    .sample(frac=1., random_state=RANDOM_SEED)
)

# Fit the same small random forest on the undersampled training set.
X_train, y_train = df_2[features], df_2[target]
model = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=RANDOM_SEED)
model.fit(X_train, y_train)

# Evaluate on the hold-out split: ROC AUC of the predicted probability of
# class 1.
X_test, y_test = df_test[features], df_test[target]
print(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

0.5665760869565217


## Видео 2

In [None]:
!pip install imbalanced-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Size of the training split (rows, columns).
df_train.shape

(244, 4)

In [None]:
# Size of the hold-out test split (rows, columns).
df_test.shape

(62, 4)

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE oversampler configured with 2 nearest neighbours. It is named `smote`
# rather than `os` so it cannot shadow the standard-library `os` module if
# that module is ever imported in this notebook.
smote = SMOTE(random_state=RANDOM_SEED, k_neighbors=2)

features = ['age', 'surgery_year', 'positive_nodes']
target = 'target'

# Resample only the training split; the test split is left untouched.
X_train, y_train = smote.fit_resample(df_train[features], df_train[target])

In [None]:
# Check the class counts of the resampled training labels.
y_train.value_counts()

0    179
1    179
Name: target, dtype: int64

In [None]:
# Fit the same small random forest on the current X_train / y_train.
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=3,
    random_state=RANDOM_SEED,
).fit(X_train, y_train)

# Evaluate on the hold-out split: ROC AUC of the predicted probability of
# class 1.
X_test = df_test[features]
y_test = df_test[target]
print(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

0.6331521739130435


## Видео 3

In [None]:
# Number of training rows (and columns) with target == 0.
df_train[df_train['target'] == 0].shape

(179, 4)

In [None]:
# Number of training rows (and columns) with target == 1.
df_train[df_train['target'] == 1].shape

(65, 4)

In [None]:
# Class sizes in the training split.
n_majority = len(df_train.loc[df_train['target'] == 0])
n_minority = len(df_train.loc[df_train['target'] == 1])

# Class 0 keeps weight 1; class 1 is weighted by the cube of the
# class-0 / class-1 size ratio.
class_weights = {
    0: 1,
    1: (n_majority / n_minority) ** 3,
}

In [None]:
features = ['age', 'surgery_year', 'positive_nodes']
target = 'target'

# Train on the training split as-is (no resampling); the imbalance is handled
# through the per-class weights passed to the forest.
X_train = df_train[features]
y_train = df_train[target]

model = RandomForestClassifier(
    class_weight=class_weights,
    n_estimators=10,
    max_depth=3,
    random_state=RANDOM_SEED,
).fit(X_train, y_train)

# Evaluate on the hold-out split: ROC AUC of the predicted probability of
# class 1.
X_test = df_test[features]
y_test = df_test[target]
print(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

0.6535326086956522


## Видео 4

In [None]:
from sklearn.model_selection import StratifiedKFold

# 3-fold stratified cross-validation, shuffled with the fixed seed.
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)

# One fitted model and one ROC AUC score are collected per fold.
models, metrics = [], []

features = ['age', 'surgery_year', 'positive_nodes']
target = 'target'
X = df[features]
y = df[target]

for train_index, test_index in kf.split(X, y):
    # Index the underlying arrays with the positional fold indices.
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]

    model = RandomForestClassifier(
        class_weight='balanced',
        n_estimators=10,
        max_depth=3,
        random_state=RANDOM_SEED,
    ).fit(X_train, y_train)

    # ROC AUC of the predicted probability of class 1 on this fold's test part.
    score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    print(score)

    metrics.append(score)
    models.append(model)

0.5390123456790124
0.6227160493827161
0.7066666666666667


In [None]:
# Average ROC AUC over the cross-validation folds.
sum(metrics) / len(metrics)

0.6227983539094649