In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import time
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Загрузка данных

In [None]:
# Mount Google Drive at /content/drive; the dataset archive unpacked in the
# next cell is read from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!unzip /content/drive/MyDrive/datasets_for_colab/ensembles-competition.zip

Archive:  /content/drive/MyDrive/datasets_for_colab/ensembles-competition.zip
replace Sample_Submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Read both competition files once and keep the raw frames untouched;
# the working copies are what later cells are free to modify.
data_train_orig = pd.read_csv('train_contest.csv')
data_for_pred_orig = pd.read_csv('for_prediction.csv')

data_train, data_for_pred = data_train_orig.copy(), data_for_pred_orig.copy()

# Преобразования

In [None]:
import json
import re
import ast


# A complete quoted Python string literal following the 'name' key. Escaped
# characters inside the literal are skipped, so an embedded quote does not end
# the match, and the closing quote must be the same kind as the opening one.
_NAME_LITERAL = re.compile(
    r"""'name':\s*('(?:\\.|[^'\\])*'|"(?:\\.|[^"\\])*")"""
)


def get_emp(x):
    """Return the first quoted ``'name'`` value found in a dict-like string.

    The value is located with a regular expression rather than by parsing the
    whole string, so the rest of ``x`` does not have to be a valid literal.

    Parameters
    ----------
    x : str
        Text containing a ``'name': '<value>'`` pair (single- or
        double-quoted value).

    Returns
    -------
    str
        The unescaped name, e.g. ``O'Reilly`` for ``'name': "O'Reilly"``.

    Raises
    ------
    ValueError
        If ``x`` contains no quoted ``'name'`` value; the offending text is
        included in the message.
    """
    match = _NAME_LITERAL.search(x)
    if match is None:
        raise ValueError(f"no quoted 'name' value found in: {x!r}")
    # The captured group is a valid Python string literal; evaluating it
    # removes the quotes and resolves escapes such as \' or \\.
    return ast.literal_eval(match.group(1))

def get_name(x):
    """Return the ``'name'`` entry of a dict serialised as a string.

    The string is parsed as a Python literal first, which copes with
    apostrophes or double quotes inside values and with ``None``/``True``/
    ``False``. Input that is not a Python literal falls back to the earlier
    behaviour: single quotes are swapped for double quotes and the result is
    parsed as JSON.

    Parameters
    ----------
    x : str
        Text such as ``"{'id': 'remote', 'name': 'Удаленная работа'}"``.

    Returns
    -------
    The value stored under the ``'name'`` key.

    Raises
    ------
    json.JSONDecodeError
        If ``x`` is neither a Python literal nor JSON after quote replacement.
    KeyError
        If the parsed mapping has no ``'name'`` key.
    """
    try:
        record = ast.literal_eval(x)
    except (ValueError, SyntaxError):
        # Not a Python literal (e.g. JSON true/null): keep the old parsing path.
        record = json.loads(x.replace("'", '"'))
    return record['name']

def data_process(data):
    """Add flattened name columns to ``data`` and return the same frame.

    Each new column is produced by applying an extractor to a raw column:
    ``type_schedule`` from ``schedule``, ``city`` from ``area``,
    ``name_employer`` from ``employer`` and ``experience_tr`` from
    ``experience``. The frame is modified in place.
    """
    derived_columns = (
        ('type_schedule', 'schedule', get_name),
        ('city', 'area', get_name),
        ('name_employer', 'employer', get_emp),
        ('experience_tr', 'experience', get_name),
    )
    for target, source, extractor in derived_columns:
        data[target] = data[source].apply(extractor)
    return data


def extract_skills(skill_str):
    """Return the ``'name'`` of every entry in a stringified list of dicts.

    ``skill_str`` is evaluated as a Python literal and is expected to yield
    an iterable of mappings that each have a ``'name'`` key.
    """
    names = []
    for entry in ast.literal_eval(skill_str):
        names.append(entry['name'])
    return names


In [None]:
# Columns removed before modelling. The three groups keep the original
# naming; only their union matters for the drop below.
bool_cols = data_train_orig.select_dtypes(bool).columns
unbalanced_data = bool_cols  # every boolean column is dropped
data_with_lots_nans = ['working_days', 'working_time_intervals', 'working_time_modes', 'relations', 'sort_point_distance', 'insider_interview']
useless_data = ['department', 'response_url', 'immediate_redirect_url', 'type', 'published_at', 'created_at', 'url', 'alternate_url', "address", "contacts"]
data_train = data_train_orig.drop([*unbalanced_data, *data_with_lots_nans, *useless_data], axis=1)

# Flatten the dict-like columns into type_schedule / city / name_employer /
# experience_tr, then drop the raw source columns.
data_train = data_process(data_train)

col_to_drop1 = ['schedule',
              'area',
              'employer',
              'experience']

data_train = data_train.drop(columns=col_to_drop1)

# Parse every "key_skills" string into a list of skill names.
skill_lists = data_train['key_skills'].apply(extract_skills)

# Vocabulary of all skills seen in training. It is sorted so the order of the
# indicator columns does not depend on set iteration order and therefore stays
# the same across kernel restarts.
all_skills = sorted({skill for skills in skill_lists for skill in skills})

# Build the complete 0/1 indicator matrix first and attach it in one step:
# inserting one column per skill in a loop fragments the frame.
skill_position = {skill: idx for idx, skill in enumerate(all_skills)}
skill_matrix = np.zeros((len(data_train), len(all_skills)), dtype=np.int64)
for row_idx, skills in enumerate(skill_lists):
    for skill in skills:
        skill_matrix[row_idx, skill_position[skill]] = 1
skill_flags = pd.DataFrame(skill_matrix, index=data_train.index, columns=all_skills)

# A skill whose name equals an existing column overwrites that column in
# place (what per-column assignment does); all other flags are appended.
clashing = [skill for skill in all_skills if skill in data_train.columns]
for skill in clashing:
    data_train[skill] = skill_flags[skill]
data_train = pd.concat([data_train, skill_flags.drop(columns=clashing)], axis=1)

# Encode the experience bucket as an ordinal number.
data_train['experience_tr'] = pd.to_numeric(
    data_train['experience_tr'].replace({
        'Нет опыта': 0,
        'От 1 года до 3 лет': 1,
        'От 3 до 6 лет': 2,
        'Более 6 лет': 3
    })
)

# Remaining raw columns that are not used as features.
data_train = data_train.drop(
    columns=['key_skills', 'snippet', 'description', 'specializations', 'region']
)
data_train.head(2)

Unnamed: 0,name,mean_salary,type_schedule,city,name_employer,experience_tr,Взаимодействие с клиентами,specflow,1С Предприятие,Ведение архива,...,проведение переговоров с первыми лицами компании,Обучение и развитие персонала,Протоколирование,DevOps,Cryptocurrency,Qlik Sense,Анализ АБС,разработка нового функционала в системе 1С,Управление ресурсами,криптографические средства информационной безопасности
0,Агент по недвижимости,125000.0,Гибкий график,Санкт-Петербург,АРИН,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,IT-специалист/ техник,50000.0,Вахтовый метод,Братск,СМП-38,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Target is the mean salary; every other remaining column is a feature.
y = data_train["mean_salary"]
X = data_train.drop("mean_salary", axis=1)

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=10, test_size=0.3
)

In [None]:
# Names of the object-dtype columns; these are passed to CatBoost as
# categorical features.
categorical = [col for col, dtype in X_train.dtypes.items() if dtype == "object"]
categorical

['name', 'type_schedule', 'city', 'name_employer']

# Обучение

In [None]:
!pip install catboost



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.metrics import f1_score, accuracy_score
from sklearn.pipeline import make_pipeline
import warnings
import pandas as pd
from sklearn.model_selection import GridSearchCV
# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')


# Hyper-parameter grid explored by the search (3 x 2 combinations).
param_grid = dict(
    iterations=[300],
    learning_rate=[0.1, 0.5, 1],
    depth=[5, 10],
)

# Base estimator: MAE objective, object columns treated as categorical.
model = CatBoostRegressor(loss_function='MAE', cat_features=categorical)

# 3-fold cross-validated search scored by negative MAE.
gr = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=10,
)
gr.fit(X_train, y_train)


print(gr.best_params_)

# Тестирование

In [None]:
# Predictions of the fitted search object on both splits.
y_train_pred = gr.predict(X_train)
y_pred = gr.predict(X_test)

# (label, scorer) pairs, reported in this order for each split.
report_metrics = (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("R2", r2_score),
)
for split_name, y_true, y_hat in (("Test", y_test, y_pred), ("Train", y_train, y_train_pred)):
    for metric_name, scorer in report_metrics:
        print(f"{split_name} {metric_name} = {scorer(y_true, y_hat):.4f}")

# Обучение лучшей модели на всем объеме данных

In [None]:
# Fresh regressor configured with the best parameters from the grid search.
reg = CatBoostRegressor(
    loss_function='MAE',
    cat_features=categorical,
    **gr.best_params_,
)

In [None]:
# Fit on the full feature matrix X and target y (not only the training split).
# The call is the last expression, so its return value is shown as cell output.
reg.fit(X, y)

0:	learn: 53033.2549724	total: 83.8ms	remaining: 25.1s
1:	learn: 50867.1333765	total: 156ms	remaining: 23.2s
2:	learn: 49100.3043198	total: 228ms	remaining: 22.6s
3:	learn: 47510.8969630	total: 309ms	remaining: 22.9s
4:	learn: 46338.0263577	total: 386ms	remaining: 22.8s
5:	learn: 45148.9426508	total: 471ms	remaining: 23.1s
6:	learn: 43915.4742562	total: 547ms	remaining: 22.9s
7:	learn: 42973.6824107	total: 625ms	remaining: 22.8s
8:	learn: 42172.0011444	total: 699ms	remaining: 22.6s
9:	learn: 41409.3257665	total: 781ms	remaining: 22.7s
10:	learn: 40694.5524136	total: 856ms	remaining: 22.5s
11:	learn: 40128.8818845	total: 936ms	remaining: 22.5s
12:	learn: 39616.5512112	total: 1.01s	remaining: 22.4s
13:	learn: 39154.3228027	total: 1.09s	remaining: 22.3s
14:	learn: 38795.9272533	total: 1.17s	remaining: 22.2s
15:	learn: 38491.7488051	total: 1.25s	remaining: 22.2s
16:	learn: 38246.3430644	total: 1.33s	remaining: 22.1s
17:	learn: 37903.1478610	total: 1.4s	remaining: 21.9s
18:	learn: 37620.396

<catboost.core.CatBoostRegressor at 0x7972dcfd4a10>

# Предсказываем для файла for_prediction.csv

In [None]:
# Submission template and the rows to predict; 'Id' is removed right after
# reading, and `test` is the working copy that later cells transform.
example = pd.read_csv('Sample_Submission.csv')
test_orig = pd.read_csv('for_prediction.csv').drop(columns=['Id'])

test = test_orig.copy()

In [None]:
# Preview the first two rows of the working copy `test`.
test.head(2)

Unnamed: 0,name,area,employer,snippet,contacts,schedule,description,experience,key_skills,specializations,region,type_schedule,city,name_employer,experience_tr
0,Помощник маркетолога,"{'id': '1', 'name': 'Москва', 'url': 'https://...","{'id': '2710310', 'name': 'EKO BIKE', 'url': '...","{'requirement': None, 'responsibility': 'Разра...",,"{'id': 'remote', 'name': 'Удаленная работа'}",<p>В IT-компанию Ищем Интернет-маркетолога (уд...,"{'id': 'between1And3', 'name': 'От 1 года до 3...","[{'name': 'Контекстная реклама'}, {'name': 'Ин...","[{'id': '3.206', 'name': 'Печатная реклама', '...",Москва,Удаленная работа,Москва,EKO BIKE,От 1 года до 3 лет
1,Менеджер по персоналу,"{'id': '2', 'name': 'Санкт-Петербург', 'url': ...","{'id': '52824', 'name': 'Ваш дом, агентство не...","{'requirement': 'Высшее образование.', 'respon...","{'name': 'Корсунь Татьяна Дмитриевна', 'email'...","{'id': 'fullDay', 'name': 'Полный день'}",<p>Мы приглашаем на работу соискателей на долж...,"{'id': 'noExperience', 'name': 'Нет опыта'}","[{'name': 'Подбор персонала'}, {'name': 'Работ...","[{'id': '6.254', 'name': 'Рекрутмент', 'profar...",Санкт-Петербург,Полный день,Санкт-Петербург,"Ваш дом, агентство недвижимости",Нет опыта


In [None]:
# Raw columns that are not used as features.
class_drop = ['premium', 'department', 'has_test', 'response_letter_required',
                     'working_days', 'working_time_intervals', 'working_time_modes',
                     'immediate_redirect_url', 'published_at', 'created_at',
                     'response_url', 'sort_point_distance', 'archived', 'insider_interview',
                    'url', 'alternate_url', 'relations', 'type', 'address',
                    'accept_temporary']

# Start from the untouched `test_orig` so this cell can be re-run: dropping
# in place from `test` fails the second time because the columns are gone.
test = test_orig.drop(columns=class_drop)

test = data_process(test)

# Encode skills with the vocabulary collected from the training data
# (`all_skills`); skills that are not in that vocabulary are ignored.
skill_names = list(all_skills)
skill_position = {skill: idx for idx, skill in enumerate(skill_names)}
skill_matrix = np.zeros((len(test), len(skill_names)), dtype=np.int64)
for row_idx, row_skills in enumerate(test['key_skills'].apply(extract_skills)):
    for skill in row_skills:
        col_idx = skill_position.get(skill)
        if col_idx is not None:
            skill_matrix[row_idx, col_idx] = 1
skill_flags = pd.DataFrame(skill_matrix, index=test.index, columns=skill_names)

# Attach all indicator columns at once. A skill whose name equals an existing
# column replaces that column, which is what per-column assignment does.
test = pd.concat(
    [test.drop(columns=skill_flags.columns.intersection(test.columns)), skill_flags],
    axis=1,
)

# No rows are dropped here: the predictions are later written into `example`,
# so the number of rows must stay unchanged. (A bare `test.dropna()` discards
# its result and therefore had no effect.)
col_to_drop1 = ['schedule', 'area', 'employer', 'experience', 'contacts', 'snippet',
                'description', 'key_skills', 'specializations', 'region']

test = test.drop(columns=col_to_drop1)

# Same ordinal encoding as for the training data, converted to a numeric dtype.
test['experience_tr'] = pd.to_numeric(
    test['experience_tr'].replace({
        'Нет опыта': 0,
        'От 1 года до 3 лет': 1,
        'От 3 до 6 лет': 2,
        'Более 6 лет': 3
    })
)

# Align with the training features: same columns in the same order, columns
# missing from the prediction data filled with 0, anything extra removed.
test = test.reindex(columns=X.columns, fill_value=0)

In [None]:
# Show the first row of `test`.
test.head(1)

Unnamed: 0,name,area,employer,snippet,contacts,schedule,description,experience,key_skills,specializations,region,type_schedule,city,name_employer,experience_tr
0,Помощник маркетолога,"{'id': '1', 'name': 'Москва', 'url': 'https://...","{'id': '2710310', 'name': 'EKO BIKE', 'url': '...","{'requirement': None, 'responsibility': 'Разра...",,"{'id': 'remote', 'name': 'Удаленная работа'}",<p>В IT-компанию Ищем Интернет-маркетолога (уд...,"{'id': 'between1And3', 'name': 'От 1 года до 3...","[{'name': 'Контекстная реклама'}, {'name': 'Ин...","[{'id': '3.206', 'name': 'Печатная реклама', '...",Москва,Удаленная работа,Москва,EKO BIKE,От 1 года до 3 лет


In [None]:
# Show the first row of `test` (same statement as the previous cell).
test.head(1)

Unnamed: 0,name,type_schedule,city,name_employer,experience_tr,Взаимодействие с клиентами,specflow,1С Предприятие,Ведение архива,Проектно-конструкторская деятельность,...,проведение переговоров с первыми лицами компании,Обучение и развитие персонала,Протоколирование,DevOps,Cryptocurrency,Qlik Sense,Анализ АБС,разработка нового функционала в системе 1С,Управление ресурсами,криптографические средства информационной безопасности
0,Помощник маркетолога,Удаленная работа,Москва,EKO BIKE,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Cast every numeric column of `test` to float in a single assignment.
numeric_cols = test.select_dtypes(include=np.number).columns
test[numeric_cols] = test[numeric_cols].astype('float')

In [None]:
# Predict with the regressor fitted on the full data set.
predss = reg.predict(test)

In [None]:
# Store the predictions in the 'Predicted' column of the submission template.
example['Predicted'] = predss

In [None]:
# Write the submission to predictions.csv without the index column.
example.to_csv('predictions.csv', index = False)