In [13]:
import random

import pandas as pd
from sklearn.datasets import load_digits, load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [14]:
# Iris: fit a default random forest on an 80/20 split and report hold-out metrics.
# `load_iris` is imported in the notebook's import cell together with the other
# scikit-learn names, so this cell contains analysis code only.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['target'] = iris.target

# Fixed random_state on both the split and the forest keeps the printed metrics reproducible.
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    iris_df.drop('target', axis=1),
    iris_df['target'],
    test_size=0.2,
    random_state=42,
)

rf_iris = RandomForestClassifier(random_state=42)
rf_iris.fit(X_train_iris, y_train_iris)
y_pred_iris = rf_iris.predict(X_test_iris)

print('Accuracy score for iris dataset:', accuracy_score(y_test_iris, y_pred_iris))
print('Confusion matrix for iris dataset:\n', confusion_matrix(y_test_iris, y_pred_iris))
print('Classification report for iris dataset:\n', classification_report(y_test_iris, y_pred_iris))

Accuracy score for iris dataset: 1.0
Confusion matrix for iris dataset:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Classification report for iris dataset:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [15]:
# Digits: baseline random forest, 5-fold cross-validation, grid search over three
# hyper-parameters, then a forest refit with the best combination found.
#
# The loaded dataset is bound to `digits`, NOT to `load_digits`: rebinding the
# imported loader to its own return value makes a second run of this cell fail,
# because the name no longer refers to a callable.
# The `_bc` suffix on the variables below is kept unchanged so that any other
# cell referring to these names keeps working.
digits = load_digits()
load_digits_df = pd.DataFrame(digits.data, columns=digits.feature_names)
load_digits_df['target'] = digits.target

digits_features = load_digits_df.drop('target', axis=1)
digits_labels = load_digits_df['target']

X_train_bc, X_test_bc, y_train_bc, y_test_bc = train_test_split(
    digits_features, digits_labels, test_size=0.2, random_state=42
)

# --- Baseline model ---------------------------------------------------------
rf_bc = RandomForestClassifier(random_state=42)
rf_bc.fit(X_train_bc, y_train_bc)
y_pred_bc = rf_bc.predict(X_test_bc)
print('Accuracy score for load digits dataset:', accuracy_score(y_test_bc, y_pred_bc))
print('Confusion matrix for load digits dataset:\n', confusion_matrix(y_test_bc, y_pred_bc))
print('Classification report for load digits dataset:\n', classification_report(y_test_bc, y_pred_bc))

# --- Cross-validation on the full dataset -----------------------------------
cv_scores = cross_val_score(rf_bc, digits_features, digits_labels, cv=5)
print('Cross-validation scores for load digits dataset:', cv_scores)

# --- Hyper-parameter search on the training split only ----------------------
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 2, 4, 6, 8],
    'min_samples_split': [2, 4, 6, 8],
}
grid_search = GridSearchCV(rf_bc, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_bc, y_train_bc)
print('Best parameters for load digits dataset:', grid_search.best_params_)
print('Best score for load digits dataset:', grid_search.best_score_)

# --- Tuned model ------------------------------------------------------------
# best_params_ holds exactly the keys of param_grid, so it can be unpacked directly.
rf_bc_opt = RandomForestClassifier(random_state=42, **grid_search.best_params_)
rf_bc_opt.fit(X_train_bc, y_train_bc)
y_pred_bc_opt = rf_bc_opt.predict(X_test_bc)
print('Accuracy score for load digits dataset (optimized):', accuracy_score(y_test_bc, y_pred_bc_opt))
print('Confusion matrix for load digits dataset (optimized):\n', confusion_matrix(y_test_bc, y_pred_bc_opt))
print('Classification report for load digits dataset (optimized):\n', classification_report(y_test_bc, y_pred_bc_opt))

Accuracy score for load digits dataset: 0.9722222222222222
Confusion matrix for load digits dataset:
 [[32  0  0  0  1  0  0  0  0  0]
 [ 0 28  0  0  0  0  0  0  0  0]
 [ 0  0 33  0  0  0  0  0  0  0]
 [ 0  0  0 32  0  1  0  0  1  0]
 [ 0  0  0  0 46  0  0  0  0  0]
 [ 0  0  0  0  0 45  1  0  0  1]
 [ 0  0  0  0  0  1 34  0  0  0]
 [ 0  0  0  0  0  0  0 33  0  1]
 [ 0  1  0  0  0  0  0  0 29  0]
 [ 0  0  0  0  0  1  0  1  0 38]]
Classification report for load digits dataset:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98        33
           1       0.97      1.00      0.98        28
           2       1.00      1.00      1.00        33
           3       1.00      0.94      0.97        34
           4       0.98      1.00      0.99        46
           5       0.94      0.96      0.95        47
           6       0.97      0.97      0.97        35
           7       0.97      0.97      0.97        34
           8       0.97      0.9

In [16]:

# Synthetic "vacation preference" dataset: every field of every row, including
# the target, is drawn independently and uniformly from the lists below.
random.seed(42)  # fixed seed so this cell produces the same table on every run

salary_list = [50000, 60000, 70000, 80000, 90000, 100000]
city_list = ['Бишкек', 'Талас', 'Нарын', 'Астана', 'Алмата', 'Балыкчы']
age_list = list(range(30, 66))
vacation_prefer_list = ['Путешествия', 'Океанские каникулы', 'Сноубординг', 'Треккинг', 'Театральные постановки', 'Релаксация']
transport_prefer_list = ['автомобиль', 'самолёт', 'поезд', 'автобус','Лайнер']

target_list = ['Барселона', 'Рим', 'Кейптаун', 'Сидней', 'Киото', 'Бали']

n_rows = 1000

# Collect plain records and build the frame once: this avoids enlarging the
# DataFrame one row at a time and lets pandas infer integer dtypes for the
# numeric columns.
vacation_records = [
    {
        'salary': random.choice(salary_list),
        'city': random.choice(city_list),
        'age': random.choice(age_list),
        'vacation_prefer': random.choice(vacation_prefer_list),
        'transport_prefer': random.choice(transport_prefer_list),
        'target': random.choice(target_list),
    }
    for _ in range(n_rows)
]
vacation_df = pd.DataFrame(
    vacation_records,
    columns=['salary', 'city', 'age', 'vacation_prefer', 'transport_prefer', 'target'],
)
vacation_df.head()

Unnamed: 0,salary,city,age,vacation_prefer,transport_prefer,target
0,50000,Алмата,50,Релаксация,поезд,Сидней
1,50000,Нарын,50,Театральные постановки,поезд,Барселона
2,100000,Нарын,62,Релаксация,поезд,Кейптаун
3,60000,Алмата,62,Сноубординг,автомобиль,Бали
4,80000,Астана,55,Сноубординг,автомобиль,Бали


In [17]:
# Store salary as float. Only the final expression of a cell is rendered, so the
# preview below is the single output of this cell.
vacation_df = vacation_df.astype({'salary': float})
vacation_df.head()

Unnamed: 0,salary,city,age,vacation_prefer,transport_prefer,target
0,50000.0,Алмата,50,Релаксация,поезд,Сидней
1,50000.0,Нарын,50,Театральные постановки,поезд,Барселона
2,100000.0,Нарын,62,Релаксация,поезд,Кейптаун
3,60000.0,Алмата,62,Сноубординг,автомобиль,Бали
4,80000.0,Астана,55,Сноубординг,автомобиль,Бали


In [18]:
# One-hot encode every categorical column, the target included; each listed
# column is replaced by one indicator column per distinct value.
# NOTE: `vacation_df` is rebound to the encoded frame, so running this cell a
# second time fails because the source columns no longer exist.
categorical_columns = ['city', 'vacation_prefer', 'transport_prefer', 'target']
vacation_df = pd.get_dummies(vacation_df, columns=categorical_columns)
vacation_df.head()

Unnamed: 0,salary,age,city_Алмата,city_Астана,city_Балыкчы,city_Бишкек,city_Нарын,city_Талас,vacation_prefer_Океанские каникулы,vacation_prefer_Путешествия,...,transport_prefer_автобус,transport_prefer_автомобиль,transport_prefer_поезд,transport_prefer_самолёт,target_Бали,target_Барселона,target_Кейптаун,target_Киото,target_Рим,target_Сидней
0,50000.0,50,True,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,True
1,50000.0,50,False,False,False,False,True,False,False,False,...,False,False,True,False,False,True,False,False,False,False
2,100000.0,62,False,False,False,False,True,False,False,False,...,False,False,True,False,False,False,True,False,False,False
3,60000.0,62,True,False,False,False,False,False,False,False,...,False,True,False,False,True,False,False,False,False,False
4,80000.0,55,False,True,False,False,False,False,False,False,...,False,True,False,False,True,False,False,False,False,False


In [19]:
# Separate features from the one-hot target indicators.
# The indicator columns are found by their 'target_' prefix rather than listed
# by name, so the drop keeps working when the set of destinations changes
# (a hard-coded list raises KeyError as soon as one name is missing).
target_columns = [column for column in vacation_df.columns if column.startswith('target_')]
X_vacation = vacation_df.drop(columns=target_columns)

# The label is a single indicator column, i.e. a binary "Киото vs. any other
# destination" problem; the remaining target indicators are not used.
y_vacation = vacation_df['target_Киото']

X_train_vacation, X_test_vacation, y_train_vacation, y_test_vacation = train_test_split(
    X_vacation, y_vacation, test_size=0.2, random_state=42
)


KeyError: "['target_London', 'target_Moscow', 'target_Paris', 'target_New York', 'target_Dubai', 'target_Tokyo'] not found in axis"

In [None]:
# Untuned random forest for the vacation data, created with a fixed seed.
vacation_forest_kwargs = {'random_state': 42}
rf_vacation = RandomForestClassifier(**vacation_forest_kwargs)

In [None]:
# Train the baseline forest and score it on the held-out split.
rf_vacation.fit(X_train_vacation, y_train_vacation)
y_pred_vacation = rf_vacation.predict(X_test_vacation)

# Compute the three metrics first, then print them in a fixed order.
vacation_accuracy = accuracy_score(y_test_vacation, y_pred_vacation)
vacation_confusion = confusion_matrix(y_test_vacation, y_pred_vacation)
vacation_report = classification_report(y_test_vacation, y_pred_vacation)

print('Accuracy score for vacation dataset:', vacation_accuracy)
print('Confusion matrix for vacation dataset:\n', vacation_confusion)
print('Classification report for vacation dataset:\n', vacation_report)

In [None]:
# Build ONE synthetic traveller and classify it with the fitted baseline forest.
# Every column starts as False; salary and age are then filled in and exactly
# one indicator per categorical feature is switched on, so the row is a valid
# one-hot encoding. Independent coin flips per indicator column could mark
# several cities at once, or none at all.
sample = dict.fromkeys(X_vacation.columns, False)
sample['salary'] = float(random.choice(salary_list))
sample['age'] = random.choice(age_list)
for prefix, choices in (
    ('city', city_list),
    ('vacation_prefer', vacation_prefer_list),
    ('transport_prefer', transport_prefer_list),
):
    sample[f'{prefix}_{random.choice(choices)}'] = True

# `columns=` restores the training column order and discards an indicator whose
# category is not among the feature columns.
random_data = pd.DataFrame([sample], columns=X_vacation.columns)

random_pred = rf_vacation.predict(random_data)
print('Predicted class for random data:', random_pred)

In [None]:
# 5-fold cross-validation of the baseline forest on the complete vacation data.
cv_scores_vacation = cross_val_score(rf_vacation, X_vacation, y_vacation, cv=5)
print('Cross-validation scores for vacation dataset:', cv_scores_vacation)

# Exhaustive search over forest size, tree depth and split threshold, scored by
# accuracy with 5-fold CV on the training split.
param_grid_vacation = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 2, 4, 6, 8],
    'min_samples_split': [2, 4, 6, 8],
}
grid_search_vacation = GridSearchCV(
    estimator=rf_vacation,
    param_grid=param_grid_vacation,
    cv=5,
    scoring='accuracy',
)
grid_search_vacation.fit(X_train_vacation, y_train_vacation)

print('Best parameters for vacation dataset:', grid_search_vacation.best_params_)
print('Best score for vacation dataset:', grid_search_vacation.best_score_)

In [None]:
# Refit a forest with the hyper-parameters chosen by the grid search and report
# the same hold-out metrics as for the baseline model.
# best_params_ holds the three searched keys (n_estimators, max_depth,
# min_samples_split), so unpacking it passes the same arguments as naming them.
rf_vacation_opt = RandomForestClassifier(random_state=42, **grid_search_vacation.best_params_)
rf_vacation_opt.fit(X_train_vacation, y_train_vacation)
y_pred_vacation_opt = rf_vacation_opt.predict(X_test_vacation)

for heading, metric in (
    ('Accuracy score for vacation dataset (optimized):', accuracy_score),
    ('Confusion matrix for vacation dataset (optimized):\n', confusion_matrix),
    ('Classification report for vacation dataset (optimized):\n', classification_report),
):
    print(heading, metric(y_test_vacation, y_pred_vacation_opt))

## Выводы по работе