In [1]:
import pandas as pd

In [2]:
# Load the dataset from a CSV file located relative to the working directory.
df = pd.read_csv('16_data.csv')
# Preview the first rows to check the available columns.
df.head()

Unnamed: 0,age,surgery_year,positive_nodes,target
0,30,1964,1,0
1,30,1962,3,0
2,30,1965,0,0
3,31,1959,2,0
4,31,1965,4,0


In [3]:
# Count rows per value of 'target' to see how the classes are distributed.
df['target'].value_counts()

target
0    225
1     81
Name: count, dtype: int64

In [4]:
from sklearn.model_selection import train_test_split

# Seed passed as `random_state` to the split, resampling and model calls.
RANDOM_SEED = 1

# Hold out 20% of the rows; stratifying on the label keeps the class
# proportions the same in the train and test parts.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['target'],
    random_state=RANDOM_SEED,
)

In [5]:
from sklearn.utils import resample

# Split the training rows by label: 1 is the rarer class, 0 the more common one.
df_min = df_train.loc[df_train['target'] == 1]
df_maj = df_train.loc[df_train['target'] == 0]

# Draw minority rows with replacement until there are as many as majority rows.
df_min_upsample = resample(
    df_min,
    replace=True,
    n_samples=df_maj.shape[0],
    random_state=RANDOM_SEED,
)

# Both shapes should report the same number of rows.
print(df_min_upsample.shape, df_maj.shape, sep='\n')


(179, 4)
(179, 4)


In [6]:
# Stack the upsampled minority rows on the majority rows, then shuffle.
# The shuffle is seeded so the row order -- and anything fitted on df_1 --
# is the same on every rerun (previously `.sample` had no random_state).
df_1 = (
    pd.concat([df_min_upsample, df_maj], ignore_index=True)
    .sample(frac=1., random_state=RANDOM_SEED)
)
df_1

Unnamed: 0,age,surgery_year,positive_nodes,target
113,66,1961,13,1
144,74,1965,3,1
105,57,1964,1,1
301,53,1960,1,0
333,60,1961,25,0
...,...,...,...,...
1,60,1965,0,1
309,47,1966,0,0
317,49,1961,0,0
80,54,1968,7,1


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

features = ['age', 'surgery_year', 'positive_nodes']
target = 'target'

# Fit on the shuffled, upsampled frame; evaluate on the hold-out split.
X_train = df_1[features]
y_train = df_1[target]
X_test = df_test[features]
y_test = df_test[target]

# Small forest: 10 trees, depth capped at 3.
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=3,
    random_state=RANDOM_SEED,
)
model.fit(X_train, y_train)

# ROC AUC computed from the second predict_proba column
# (presumably the probability of label 1, given labels 0/1).
print(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

0.5706521739130435


In [8]:
# Draw, without replacement, as many majority rows as there are minority rows.
df_maj_downsample = resample(
    df_maj,
    n_samples=df_min.shape[0],
    replace=False,
    random_state=RANDOM_SEED,
)

# Both shapes should report the same number of rows.
print(df_maj_downsample.shape, df_min.shape, sep='\n')

(65, 4)
(65, 4)


In [9]:
# Combine the downsampled majority rows with all minority rows and shuffle.
# The shuffle is seeded so the row order, and therefore the model fitted on
# df_2 and its score, is the same on every rerun (previously `.sample` had
# no random_state).
df_2 = (
    pd.concat([df_maj_downsample, df_min], ignore_index=True)
    .sample(frac=1., random_state=RANDOM_SEED)
)

X_train, y_train = df_2[features], df_2[target]
model = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=RANDOM_SEED)
model.fit(X_train, y_train)

# Evaluate on the hold-out split, which was not resampled.
X_test, y_test = df_test[features], df_test[target]
print(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

0.6114130434782609


## Видео 2. Оверсэмплинг с помощью SMOTE

In [10]:
!pip install imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable


In [11]:
# Size of the training split as (rows, columns).
df_train.shape

(244, 4)

In [12]:
# Size of the test split as (rows, columns).
df_test.shape

(62, 4)

In [13]:
from imblearn.over_sampling import SMOTE

# Named `smote` rather than `os` so the standard-library module name `os`
# is not shadowed in the notebook namespace.
smote = SMOTE(random_state=RANDOM_SEED, k_neighbors=2)

features = ['age', 'surgery_year', 'positive_nodes']
target = 'target'

# Resample the training split only; the test split is left untouched.
X_train, y_train = smote.fit_resample(df_train[features], df_train[target])

In [14]:
# Check the class counts of the resampled training labels.
y_train.value_counts()

target
0    179
1    179
Name: count, dtype: int64

In [15]:
# Hold-out split used for scoring.
X_test = df_test[features]
y_test = df_test[target]

# Same forest configuration as before, fitted on the current X_train / y_train.
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=3,
    random_state=RANDOM_SEED,
)
model.fit(X_train, y_train)

# ROC AUC from the second predict_proba column.
print(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

0.6222826086956521


## Видео 3. Веса классов (class_weight)

In [16]:
# Number of training rows with target == 0, shown as (rows, columns).
df_train[df_train['target'] == 0].shape

(179, 4)

In [17]:
# Number of training rows with target == 1, shown as (rows, columns).
df_train[df_train['target'] == 1].shape

(65, 4)

In [18]:
# Row counts of each class in the training split.
n_class_0 = len(df_train[df_train['target'] == 0])
n_class_1 = len(df_train[df_train['target'] == 1])

# Class 0 keeps weight 1; class 1 gets the cube of the class-0 / class-1
# row-count ratio.
class_weights = {0: 1, 1: (n_class_0 / n_class_1) ** 3}

In [19]:
features = ['age', 'surgery_year', 'positive_nodes']
target = 'target'

# Train on the original (not resampled) training split.
X_train = df_train[features]
y_train = df_train[target]
X_test = df_test[features]
y_test = df_test[target]

# The imbalance is handled through per-class weights instead of resampling.
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=3,
    random_state=RANDOM_SEED,
    class_weight=class_weights,
)
model.fit(X_train, y_train)

# ROC AUC from the second predict_proba column.
print(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

0.6535326086956522


## Видео 4. Стратифицированная кросс-валидация (StratifiedKFold)

In [20]:
from sklearn.model_selection import StratifiedKFold

# Three shuffled, stratified folds over the full dataset.
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)

# One fitted model and one ROC AUC score are collected per fold.
models = []
metrics = []

features = ['age', 'surgery_year', 'positive_nodes']
target = 'target'
X, y = df[features], df[target]

for train_idx, test_idx in kf.split(X, y):
    # kf.split yields positional indices, so slice the underlying arrays.
    X_train, X_test = X.values[train_idx], X.values[test_idx]
    y_train, y_test = y.values[train_idx], y.values[test_idx]

    model = RandomForestClassifier(
        n_estimators=10,
        max_depth=3,
        random_state=RANDOM_SEED,
        class_weight='balanced',
    )
    model.fit(X_train, y_train)

    # Score this fold on its held-out part.
    score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    print(score)

    models.append(model)
    metrics.append(score)

0.5390123456790124
0.6227160493827161
0.7066666666666667


In [21]:
# Arithmetic mean of the per-fold scores collected in `metrics`.
sum(metrics) / len(metrics)

0.6227983539094649