## Лабораторна №6

In [None]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Category pools the synthetic records are sampled from.
cities = ['Bishkek', 'Almaty', 'Moscow', 'London', 'New York']
vacation_types = ['Shopping', 'Beach holiday', 'Cultural', 'Adventure']
transport_types = ['auto', 'plane', 'train', 'ship']
target_cities = ['London', 'Moscow', 'New York', 'Paris']


def generate_rows(n_rows, seed=42):
    """Return ``n_rows`` synthetic records as a list of 6-item lists.

    Each record is ``[salary, city, age, vacation_prefer, transport_prefer,
    target]``; every field is drawn independently, so ``target`` carries no
    signal from the other fields.

    Args:
        n_rows: Number of records to generate (0 yields an empty list).
        seed: Seed for a private ``random.Random`` instance. The same seed
            always yields the same rows, and the global ``random`` state is
            left untouched.
    """
    rng = random.Random(seed)
    return [
        [
            rng.randint(30000, 120000),   # salary
            rng.choice(cities),           # home city
            rng.randint(30, 65),          # age
            rng.choice(vacation_types),   # preferred vacation type
            rng.choice(transport_types),  # preferred transport
            rng.choice(target_cities),    # label: vacation destination
        ]
        for _ in range(n_rows)
    ]


# Seeded so the dataset is reproducible, matching random_state=42 used for
# the train/test split and the model further down.
data = generate_rows(1000)

df = pd.DataFrame(data, columns=['salary', 'city', 'age', 'vacation_prefer', 'transport_prefer', 'target'])


# One-hot encode the categorical feature columns (the label column is
# excluded); drop_first=True omits the first level of each categorical.
feature_frame = df.drop(columns='target')
df_encoded = pd.get_dummies(feature_frame, drop_first=True)

# Feature matrix / label vector, then a 70/30 train/test split with a
# fixed random_state.
X, y = df_encoded, df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


# Baseline classifier: a 100-tree random forest fitted on the training split.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Evaluate the baseline on the held-out split.
y_pred = model.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
print("Accuracy:", baseline_accuracy)
print("Classification Report:")
print(baseline_report)

# 5-fold cross-validation over the full feature matrix and label vector.
cv_scores = cross_val_score(model, X, y, cv=5)
mean_cv_score = cv_scores.mean()
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", mean_cv_score)


# Candidate hyper-parameters: 2 * 3 * 2 = 12 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

# Grid search with 5-fold cross-validation, fitted on the training split only.
grid_search = GridSearchCV(model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

# Evaluate the selected estimator on the held-out split.
y_pred_best = best_model.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred_best)
tuned_report = classification_report(y_test, y_pred_best)
print("Accuracy with best parameters:", tuned_accuracy)
print("Classification Report with best parameters:")
print(tuned_report)


# Describe one synthetic traveller in the same raw (un-encoded) layout as the
# training frame, rather than hand-writing dummy columns: X was encoded with
# drop_first=True, so hand-written columns for the dropped baseline levels
# would be silently discarded.
random_sample = pd.DataFrame({
    'salary': [random.randint(30000, 120000)],
    'city': ['New York'],  # example home city
    'age': [random.randint(30, 65)],
    'vacation_prefer': ['Beach holiday'],
    'transport_prefer': ['auto'],
})

# Encode the sample, then align it with the training feature columns:
# reindex adds every column the single row did not produce (filled with 0),
# drops columns the model never saw, and enforces the training column order.
random_data = pd.get_dummies(random_sample).reindex(columns=X.columns, fill_value=0)

# Predict the vacation destination with the tuned model.
random_prediction = best_model.predict(random_data)
print("Predicted vacation city:", random_prediction[0])


Accuracy: 0.27666666666666667
Classification Report:
              precision    recall  f1-score   support

      London       0.24      0.25      0.25        76
      Moscow       0.26      0.21      0.23        78
    New York       0.36      0.30      0.33        77
       Paris       0.26      0.36      0.30        69

    accuracy                           0.28       300
   macro avg       0.28      0.28      0.28       300
weighted avg       0.28      0.28      0.28       300

Cross-validation scores: [0.26  0.26  0.23  0.265 0.24 ]
Mean CV score: 0.251
Best parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy with best parameters: 0.23666666666666666
Classification Report with best parameters:
              precision    recall  f1-score   support

      London       0.24      0.25      0.24        76
      Moscow       0.19      0.13      0.15        78
    New York       0.28      0.22      0.25        77
       Paris       0.23      0.36      0.2