In [86]:
import pandas as pd
import random

In [87]:
# Size of the synthetic dataset (number of generated rows).
num_rows = 1_000

# Value pools that the categorical columns are sampled from.
cities = [
    "Almaty",
    "Tashkent",
    "Baku",
    "Astana",
    "Yerevan",
    "Minsk",
]
vacation_types = [
    "Adventure",
    "Cruise",
    "Cultural",
    "Nature",
    "Luxury",
    "Camping",
]
transport_types = [
    "Bicycle",
    "Helicopter",
    "Submarine",
    "Motorcycle",
    "Tram",
]
vacation_targets = [
    "Rome",
    "Venice",
    "Kyoto",
    "Istanbul",
    "Buenos Aires",
    "Cape Town",
    "Seoul",
    "Dubai",
    "Sydney",
]

In [88]:
# Use a dedicated, seeded generator so that re-running this cell always
# produces the same synthetic dataset (and therefore the same CSV and the
# same downstream metrics) without touching the global `random` state.
DATA_SEED = 42
rng = random.Random(DATA_SEED)

# Every column is sampled independently of the others, so no feature carries
# information about `salary`; a classifier trained on this data is expected
# to score close to chance.
data = {
    "salary": [rng.randint(25000, 500000) for _ in range(num_rows)],
    "city": [rng.choice(cities) for _ in range(num_rows)],
    "age": [rng.randint(25, 70) for _ in range(num_rows)],
    "vacation_prefer": [rng.choice(vacation_types) for _ in range(num_rows)],
    "transport_prefer": [rng.choice(transport_types) for _ in range(num_rows)],
    "target": [rng.choice(vacation_targets) for _ in range(num_rows)],
}

In [89]:
# Materialise the generated columns as a DataFrame, show the first rows,
# and persist the table (without the index column) for the modelling cells.
output_path = "updated_vacation_data.csv"

df = pd.DataFrame(data=data)
preview = df.head()
print(preview)

df.to_csv(output_path, index=False)

   salary    city  age vacation_prefer transport_prefer    target
0  310053   Minsk   29          Cruise        Submarine  Istanbul
1  246246   Minsk   37          Luxury             Tram  Istanbul
2  437473  Astana   28          Nature       Helicopter      Rome
3  400632  Almaty   42          Cruise       Motorcycle      Rome
4  270511    Baku   65          Cruise        Submarine     Dubai


In [90]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [91]:
# Reload the dataset written by the generation step.
df = pd.read_csv("updated_vacation_data.csv")

# One-hot encode the categorical columns; drop_first=True omits one dummy
# column per categorical variable.
categorical_columns = ["city", "vacation_prefer", "transport_prefer", "target"]
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Features are everything except salary; the label is 1 when salary > 250000.
X = df_encoded.drop(columns="salary")
y = df_encoded["salary"].gt(250000).astype(int)

# Hold out 25% of the rows for testing (the test-set size was changed here).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,
)

print("Размеры выборок:")
for split_label, split_part in (
    ("X_train:", X_train),
    ("X_test:", X_test),
    ("y_train:", y_train),
    ("y_test:", y_test),
):
    print(split_label, split_part.shape)

Размеры выборок:
X_train: (750, 23)
X_test: (250, 23)
y_train: (750,)
y_test: (250,)


In [92]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [93]:
# Baseline random forest with default hyper-parameters, trained on the
# training split (fit() returns the fitted estimator itself).
model = RandomForestClassifier(random_state=777).fit(X_train, y_train)

# Report how many trees the fitted ensemble contains.
print(f"Модель обучена с использованием {len(model.estimators_)} деревьев в лесу.")

Модель обучена с использованием 100 деревьев в лесу.


In [94]:
# Score the baseline model on the held-out split.
y_pred = model.predict(X_test)

# One print call, one line per item: header, accuracy, report header, report.
print(
    "Оценка модели:",
    f"Accuracy: {accuracy_score(y_test, y_pred):.4f}",
    "\nClassification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)

Оценка модели:
Accuracy: 0.4880

Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.58      0.52       119
           1       0.51      0.40      0.45       131

    accuracy                           0.49       250
   macro avg       0.49      0.49      0.49       250
weighted avg       0.49      0.49      0.48       250


In [95]:
# Single hand-made sample: a random age plus one-hot indicator columns.
# Each entry is (column prefix, all levels in column order, the level set to 1).
_sample_choices = (
    ("city",
     ["Almaty", "Tashkent", "Baku", "Astana", "Yerevan", "Minsk"],
     "Almaty"),
    ("vacation_prefer",
     ["Adventure", "Cruise", "Cultural", "Nature", "Luxury", "Camping"],
     "Adventure"),
    ("transport_prefer",
     ["Bicycle", "Helicopter", "Submarine", "Motorcycle", "Tram"],
     "Helicopter"),
    ("target",
     ["Rome", "Venice", "Kyoto", "Istanbul", "Buenos Aires", "Cape Town",
      "Seoul", "Dubai", "Sydney"],
     "Kyoto"),
)

random_data = {"age": [np.random.randint(30, 65)]}
for _prefix, _levels, _picked in _sample_choices:
    for _level in _levels:
        # Every level gets its own "<prefix>_<level>" column holding [1] or [0].
        random_data[f"{_prefix}_{_level}"] = [1 if _level == _picked else 0]


In [96]:
# Build a one-row frame and align it to the training feature layout:
# reindex keeps only the columns present in X_train (in that order) and
# fills any column missing from the sample with 0.
random_df = pd.DataFrame(random_data).reindex(columns=X_train.columns, fill_value=0)

prediction = model.predict(random_df)

# Header and predicted class label, one per line.
print(
    "\nПредсказание для случайных данных:",
    f"Класс: {'Высокая зарплата' if prediction[0] == 1 else 'Низкая зарплата'}",
    sep="\n",
)



Предсказание для случайных данных:
Класс: Низкая зарплата


In [97]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [98]:
# Fresh, unfitted forest evaluated with 5-fold cross-validation on the
# full feature matrix, scored by accuracy.
model = RandomForestClassifier(random_state=555)

cv_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='accuracy',
    cv=5,
)

In [99]:
# Summarise the cross-validation run: mean accuracy, spread, per-fold scores.
# The spread is computed with .std(), i.e. a standard deviation, so the label
# says "standard deviation" rather than "variance".
print("перекрестная проверка:")
print(f"Средняя точность: {cv_scores.mean():.4f}")
print(f"Стандартное отклонение точности: {cv_scores.std():.4f}")
print(f"на каждом из разбиений: {cv_scores}")

перекрестная проверка:
Средняя точность: 0.4970
Дисперсия точности: 0.0379
на каждом из разбиений: [0.49  0.46  0.485 0.57  0.48 ]


In [100]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [101]:

# Explicit 5-way splitter that shuffles rows with a fixed seed.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=555,
)

# Fresh, unfitted forest evaluated on the splits produced by `kf`.
model = RandomForestClassifier(random_state=555)

cv_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='accuracy',
    cv=kf,
)

In [102]:
# Summarise the KFold run: mean accuracy, spread, per-fold scores.
# The spread is computed with .std(), i.e. a standard deviation, so the label
# says "standard deviation" rather than "variance".
print("перекрестная проверка с kfold")
print(f"Средняя точность: {cv_scores.mean():.4f}")
print(f"Стандартное отклонение точности: {cv_scores.std():.4f}")
print(f"на каждом из разбиений: {cv_scores}")

перекрестная проверка с kfold
Средняя точность: 0.4710
Дисперсия точности: 0.0208
на каждом из разбиений: [0.435 0.49  0.485 0.485 0.46 ]


In [103]:
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [104]:
# Leave-one-out splitter: each sample is used once as a single-row test set,
# so the forest is refitted once per sample.
twmp = LeaveOneOut()

model = RandomForestClassifier(random_state=555)

# The per-sample fits are independent of each other, so run them in parallel
# on all available cores (n_jobs=-1, as the grid search in this notebook
# does). The estimator's random_state is fixed, so the scores should not
# depend on how the fits are scheduled.
cv_scores = cross_val_score(
    model,
    X,
    y,
    cv=twmp,
    scoring='accuracy',
    n_jobs=-1,
)

In [105]:
# Summarise the leave-one-out run: header, mean accuracy, number of splits.
print(
    "перекрестная проверка с LeaveOneOut:",
    f"Средняя точность: {cv_scores.mean():.4f}",
    f"Количество проверок: {len(cv_scores)}",
    sep="\n",
)

перекрестная проверка с LeaveOneOut:
Средняя точность: 0.4820
Количество проверок: 1000


In [106]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [107]:
# Hyper-parameter grid for the random forest search: every combination of
# the values below is a candidate (3 * 3 * 3 * 3 = 81 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [108]:
# Base estimator whose hyper-parameters are tuned by the grid search.
model = RandomForestClassifier(random_state=555)

# Search configuration: accuracy-scored 5-fold CV over `param_grid`,
# using all cores (n_jobs=-1) and printing a progress summary (verbose=1).
search_settings = dict(
    estimator=model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search = GridSearchCV(**search_settings)

# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [109]:
# Report the winning parameter combination and its cross-validated score.
best_params = grid_search.best_params_
print("Лучшие параметры:", best_params)
print(f"Лучшая точность: {grid_search.best_score_:.4f}")

Лучшие параметры: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Лучшая точность: 0.5120


In [110]:
# Estimator selected by the grid search.
best_model = grid_search.best_estimator_

# Predictions of the selected estimator on the held-out test split.
y_pred = best_model.predict(X=X_test)

In [111]:
# Test-set metrics for the tuned model: header, accuracy, per-class report.
print(
    "\nОценка лучшей модели на тестовых данных:",
    f"Accuracy: {accuracy_score(y_test, y_pred):.4f}",
    classification_report(y_test, y_pred),
    sep="\n",
)



Оценка лучшей модели на тестовых данных:
Accuracy: 0.4840
              precision    recall  f1-score   support

           0       0.46      0.53      0.49       119
           1       0.51      0.44      0.47       131

    accuracy                           0.48       250
   macro avg       0.49      0.49      0.48       250
weighted avg       0.49      0.48      0.48       250
