In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [19]:
import random

# Step a: build the synthetic dataset.
# Two independent PRNGs are used below: NumPy's global generator for the numeric
# columns and the stdlib `random` module for the categorical ones. Both must be
# seeded, otherwise the categorical columns (and the target) differ on every run.
np.random.seed(42)
random.seed(42)

data = {'salary': np.random.randint(40000, 80000, 1000),
        'city': [random.choice(['Bishkek', 'Moscow', 'London']) for _ in range(1000)],
        'age': np.random.randint(30, 66, 1000),
        'vacation_prefer': [random.choice(['Shopping', 'Beach holiday']) for _ in range(1000)],
        'transport_prefer': [random.choice(['auto', 'plane']) for _ in range(1000)],
        # The target is drawn at random, independently of the feature columns.
        'target': [random.choice(['Bishkek', 'Moscow', 'London']) for _ in range(1000)]}

df = pd.DataFrame(data)

In [12]:
# Step b: one-hot encode the categorical features.
# The forest needs numeric input; fitting on the raw string columns fails with a
# "could not convert string to float" error. The prediction cell further down also
# aligns its input against X_train.columns after pd.get_dummies, so the training
# features have to be dummy-encoded in the same way.
X = pd.get_dummies(df.drop('target', axis=1),
                   columns=['city', 'vacation_prefer', 'transport_prefer'])
y = df['target']

# Step c: split the data into training and test sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
# Step d: choose the classifier model.
# random_state is pinned to 42 so that repeated runs are repeatable.
clf = RandomForestClassifier(random_state=42)

In [14]:
# Step e: train the model on the training split
clf.fit(X_train, y_train)

In [15]:
# Step f: evaluate the model on the held-out test split.
# Note: 'target' is generated at random, independently of the features, so an
# accuracy close to chance level (about 1/3 for three classes) is expected here.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f'Accuracy: {accuracy}')
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.295
              precision    recall  f1-score   support

     Bishkek       0.21      0.24      0.22        62
      London       0.32      0.33      0.32        64
      Moscow       0.37      0.31      0.34        74

    accuracy                           0.29       200
   macro avg       0.30      0.29      0.30       200
weighted avg       0.30      0.29      0.30       200


In [23]:
# Step g: predict for a single randomly generated record
random_data = {'salary': [np.random.randint(40000, 80000)],
               'age': [np.random.randint(30, 66)],
               'city': [random.choice(['Bishkek', 'Moscow', 'London'])],
               'vacation_prefer': [random.choice(['Shopping', 'Beach holiday'])],
               'transport_prefer': [random.choice(['auto', 'plane'])]}

# Convert the categorical values to dummy columns. A single row only produces
# dummy columns for the categories it actually contains, so reindex against the
# training columns: columns absent from this record are created and filled with 0,
# and the resulting column order matches X_train.
random_df = (
    pd.get_dummies(pd.DataFrame(random_data),
                   columns=['city', 'vacation_prefer', 'transport_prefer'])
    .reindex(columns=X_train.columns, fill_value=0)
)

prediction = clf.predict(random_df)

print(f'Predicted city for random data: {prediction[0]}')

Predicted city for random data: Bishkek
