In [66]:
import random

import pandas as pd
from sklearn.datasets import load_digits, load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV


In [67]:
# Build a synthetic dataset of 1000 rows with randomly drawn values.
# NOTE: 'target' is drawn independently of every feature column, so a
# classifier trained on this data cannot be expected to beat chance
# (about 25% for four equally likely classes).
random.seed(42)  # fix the stdlib RNG so the generated rows are reproducible

columns = ['salary', 'city', 'age', 'vacation_prefer', 'transport_prefer', 'target']

# Collect the rows in a plain list and construct the DataFrame once.
# Calling pd.concat inside the loop re-copies the whole frame on every
# iteration and leaves the numeric columns with object dtype.
rows = [
    {
        'salary': random.randint(30000, 180000),
        'city': random.choice(['New York', 'Tokyo', 'London', "Bishkek"]),
        'age': random.randint(30, 65),
        'vacation_prefer': random.choice(['Shopping', 'Beach holiday', 'Skiing', 'Cultural', 'Adventure']),
        'transport_prefer': random.choice(['auto', 'plane', "train"]),
        'target': random.choice(['New York', 'Tokyo', 'London', "Bishkek"])
    }
    for _ in range(1000)
]
data = pd.DataFrame(rows, columns=columns)

# Show the first rows of the dataset
print(data.head())

   salary     city age vacation_prefer transport_prefer    target
0   39500  Bishkek  63        Cultural            plane   Bishkek
1  136340  Bishkek  42   Beach holiday             auto   Bishkek
2  119654    Tokyo  60       Adventure             auto     Tokyo
3  108649  Bishkek  38        Shopping             auto     Tokyo
4  158956   London  53          Skiing            train  New York


In [68]:
# One-hot encode the categorical feature columns; every column not listed
# here (including 'target') is passed through unchanged.
categorical_columns = ['city', 'vacation_prefer', 'transport_prefer']
data_encoded = pd.get_dummies(data=data, columns=categorical_columns)

# Show the first rows of the encoded dataset
print(data_encoded.head())


   salary age    target  city_Bishkek  city_London  city_New York  city_Tokyo  \
0   39500  63   Bishkek          True        False          False       False   
1  136340  42   Bishkek          True        False          False       False   
2  119654  60     Tokyo         False        False          False        True   
3  108649  38     Tokyo          True        False          False       False   
4  158956  53  New York         False         True          False       False   

   vacation_prefer_Adventure  vacation_prefer_Beach holiday  \
0                      False                          False   
1                      False                           True   
2                       True                          False   
3                      False                          False   
4                      False                          False   

   vacation_prefer_Cultural  vacation_prefer_Shopping  vacation_prefer_Skiing  \
0                      True                     False

In [69]:
# Separate the feature matrix from the label column.
# train_test_split is provided by the import cell at the top of the notebook.
X = data_encoded.drop('target', axis=1)
y = data_encoded['target']

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# same rows land in each subset on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the size of each subset
print("Train set size:", len(X_train))
print("Test set size:", len(X_test))


Train set size: 800
Test set size: 200


In [70]:
# Initialise the baseline model.
# RandomForestClassifier is provided by the import cell at the top of the
# notebook. random_state pins the forest's bootstrap sampling and feature
# selection so scores computed from this estimator are repeatable across runs.
model = RandomForestClassifier(random_state=42)


In [71]:
# Fit the baseline estimator on the full training split.
model.fit(X_train, y_train)

# 5-fold cross-validation on the training split, scored with macro-averaged F1.
cross_val_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='f1_macro',
    cv=5,
)
mean_f1 = cross_val_scores.mean()
print("Cross-Validation F1-Scores:", cross_val_scores)
print("Mean F1-Score:", mean_f1)


Cross-Validation F1-Scores: [0.23004651 0.27036602 0.28593407 0.2706581  0.33738514]
Mean F1-Score: 0.27887796827885697


In [72]:

# Hyper-parameter values to try: 4 * 4 * 3 * 3 = 144 combinations.
param_grid = dict(
    n_estimators=[10, 50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid with 5-fold CV, ranked by macro-averaged F1;
# n_jobs=-1 asks for all available CPU cores.
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)


In [73]:
# Report the winning hyper-parameter combination.
print("Best Parameters:", grid_search.best_params_)

# Predict the held-out test rows through the fitted grid search object.
y_pred = grid_search.predict(X_test)

# Macro-averaged F1 on the test split.
f1_test = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
print("Test F1-Score:", f1_test)

# Per-class precision / recall / F1 breakdown.
test_report = classification_report(y_test, y_pred)
print("Classification Report:\n", test_report)

Best Parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
Test F1-Score: 0.24722852871367723
Classification Report:
               precision    recall  f1-score   support

     Bishkek       0.19      0.12      0.15        66
      London       0.25      0.33      0.28        43
    New York       0.28      0.32      0.30        47
       Tokyo       0.26      0.27      0.26        44

    accuracy                           0.24       200
   macro avg       0.24      0.26      0.25       200
weighted avg       0.24      0.24      0.24       200



In [74]:
# Baseline check: score the directly fitted `model` (not `grid_search`) on the
# held-out test split.
# NOTE: this rebinds `y_pred`, replacing whatever predictions were stored
# under that name before this cell ran.
y_pred = model.predict(X_test)

# Overall accuracy of the baseline model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1 breakdown
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.26
Classification Report:
               precision    recall  f1-score   support

     Bishkek       0.30      0.20      0.24        66
      London       0.27      0.30      0.28        43
    New York       0.29      0.32      0.31        47
       Tokyo       0.19      0.25      0.22        44

    accuracy                           0.26       200
   macro avg       0.26      0.27      0.26       200
weighted avg       0.27      0.26      0.26       200

