In [1]:
import pandas as pd
import numpy as np
# Raise the display limit so wide frames render up to 102 columns.
pd.set_option('display.max_columns', 102)

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the two teacher-related tables; the paths are kept in named variables
# so the inputs of this notebook are easy to spot.
teachers_info_path = 'teachers_info_clean.feather'
suitable_teachers_path = '../datasets/suitable_teachers.feather'

teach_df = pd.read_feather(teachers_info_path)
teach_inf_df = pd.read_feather(suitable_teachers_path)

In [3]:
# Left join: keep every row of `teach_inf_df` and attach the columns of the
# `teach_df` row whose `id` equals `teacher_id`.
merged_teachers = teach_inf_df.merge(
    teach_df,
    how='left',
    left_on='teacher_id',
    right_on='id',
)

In [4]:
def _mode_or_nan(values):
    """Return the most frequent value of a group, or NaN when there is none.

    `Series.mode()` is evaluated once per group; when several values tie,
    the first one returned by `mode()` is used.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Collapse the teacher rows into a single row per order.
aggregated_data = merged_teachers.groupby('order_id').agg({
    # Identifiers: taken from the first row of each order group
    'teacher_id': 'first',
    'user_id': 'first',

    # Binary / categorical data
    'enable_auto_assign': 'max',
    'enable_assign': 'max',
    'is_email_confirmed': 'max',
    'is_home_lessons': 'max',
    'is_external_lessons': 'max',
    'status_id.2': _mode_or_nan,
    'status_display.2': _mode_or_nan,
    'russian_level_id.2': 'first',
    'home_country_id.2': 'first',
    'is_confirmed.2': 'max',
    'is_display': 'max',
    'is_cell_phone_confirmed': 'max',
    'sex': _mode_or_nan,
    'show_on_map': 'max',
    'send_mailing': 'max',
    'send_suitable_orders': 'max',
    'is_edited': 'max',
    'verification_status_id': 'first',
    'is_individual': 'max',
    'has_photo': 'max',

    # Numeric data
    'lesson_duration.2': 'mean',
    'lesson_cost.2': 'mean',
    'rating_id': 'mean',  # NOTE(review): averaging an identifier column - confirm this is intended
    'rating': 'mean',
    'effective_rating': 'mean',
    'area_id': 'mean',  # NOTE(review): averaging an identifier column - confirm this is intended
    'amount_to_pay': 'sum',  # sum is used instead of mean
    'is_remote_lessons': 'mean',
    'rating_for_users': 'mean',
    'rating_for_admin': 'mean',
    'orders_allowed': 'sum',  # sum is used instead of mean
    'display_days': 'mean',
    'star_rating': 'mean',
    'rating_for_users_yesterday': 'sum',  # sum is used instead of mean
    'review_num': 'sum',  # sum is used instead of mean
    'teacher_age': 'mean',
    'years_teaching': 'max',  # max is used instead of mean
    'age_at_reg': 'mean',
    'years_since_reg': 'max',  # max is used instead of mean
    'days_since_last_visit': 'mean',
    'reg_year': 'min',
    'reg_month': 'min',
    'reg_day': 'min',
    'teaching_start_year': 'min'
}).reset_index()


In [5]:
# Load the cleaned orders table.
orders_path = 'order_cleaned.feather'
order_df = pd.read_feather(orders_path)

In [6]:
# Left join: every order is kept, and orders without aggregated teacher data
# get NaN in the teacher columns. Column names present in both frames
# (e.g. `amount_to_pay`) receive the default `_x` / `_y` suffixes.
final_df = order_df.merge(
    aggregated_data,
    how='left',
    left_on='id',
    right_on='order_id',
)

In [7]:
# Maps a column name to the later columns that hold exactly the same data.
duplicates = {}
column_names = list(final_df.columns)

for position, left_col in enumerate(column_names):
    # Only look at columns to the right so every pair is compared once.
    identical = [
        right_col
        for right_col in column_names[position + 1:]
        if final_df[left_col].equals(final_df[right_col])
    ]
    if identical:
        duplicates[left_col] = identical

# Report what was found.
if not duplicates:
    print("Дублирующихся колонок не найдено.")
else:
    for key, values in duplicates.items():
        print(f"Колонка '{key}' дублируется в колонках: {values}")

Дублирующихся колонок не найдено.


In [8]:
# `duplicates` maps the first column of each identical group to the later
# columns that repeat it. Drop those later copies and keep the first
# occurrence. Passing the dict itself to `columns=` would drop its keys,
# i.e. the columns reported as the originals.
redundant_columns = list(dict.fromkeys(
    copy_name
    for copies in duplicates.values()
    for copy_name in copies
))
final_df = final_df.drop(columns=redundant_columns)

In [9]:
# Inspect the merged result; the rendered output shows only the first and last rows.
final_df

Unnamed: 0,id,order_date,subject_id,purpose,lesson_price,lesson_duration,home_metro_id,add_info,status_id,amount_to_pay_x,planned_lesson_number,creator_id,pupil_category_new_id,lessons_per_week,minimal_price,teacher_sex,teacher_experience_from,teacher_experience_to,lesson_place_new,pupil_knowledgelvl,teacher_age_from,teacher_age_to,chosen_teachers_only,no_teachers_available,source_id,original_order_id,client_id,is_display_to_teachers,student_category,exam_preparation,num_sessions_weekly,tutor_experience_years,tutor_age_from,client_orders_count,group_avg_lesson_price,group_avg_lesson_duration,unique_clients_in_group,group_min_lesson_price,group_max_lesson_price,group_avg_lessons_per_week,date_diff_within_group,price_deviation_from_group_avg,percent_of_60min_lessons_in_group,avg_lesson_cost,experience_range,order_weekday,order_month,order_year,is_weekend,day_of_year,order_id,teacher_id,user_id,enable_auto_assign,enable_assign,is_email_confirmed,is_home_lessons,is_external_lessons,status_id.2,status_display.2,russian_level_id.2,home_country_id.2,is_confirmed.2,is_display,is_cell_phone_confirmed,sex,show_on_map,send_mailing,send_suitable_orders,is_edited,verification_status_id,is_individual,has_photo,lesson_duration.2,lesson_cost.2,rating_id,rating,effective_rating,area_id,amount_to_pay_y,is_remote_lessons,rating_for_users,rating_for_admin,orders_allowed,display_days,star_rating,rating_for_users_yesterday,review_num,teacher_age,years_teaching,age_at_reg,years_since_reg,days_since_last_visit,reg_year,reg_month,reg_day,teaching_start_year
0,2294054,2022-11-01 00:01:38.237,1.0,,0,60,,,16,0.0,0,,,0,0,3,0,0,0,0,0,0,0,0,18,2294000.0,675960,1,unknown,0,,,,27,0.000000,60.0,1.0,0.0,0.0,0.000000,0.0,0.000000,100.000000,0.000000,0,1,11,2022,0,305,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2280950,2022-10-25 00:00:11.983,8.0,Разделы: высшая математика.\nДополнения: ЕГЭ.\...,1500,60,231.0,Желаемое время для занятий: Подстроюсь под реп...,16,0.0,2,,15.0,4,0,2,10,0,1,0,30,90,0,0,23,,1019824,1,unknown,1,,,,1,,,,,,,,,,25.000000,-10,1,10,2022,0,298,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1742636,2021-09-01 00:12:16.713,8.0,"Разделы: школьный курс, алгебра, алгебра логи...",1500,60,259.0,Район: м. Академическая.\nМесто проведения зан...,16,0.0,10,,9.0,3,900,3,0,0,3,0,18,80,0,0,16,,833270,1,unknown,0,,,,11,,,,,,,,,,25.000000,0,2,9,2021,0,244,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2478616,2023-02-12 10:04:36.413,10.0,Разделы: школьный курс.\nДополнения: ЕГЭ.\nКа...,1000,60,,"г. Екатеринбург, время +2 МСК.\nМесто проведен...",16,0.0,10,,11.0,2,0,2,3,0,4,0,25,0,0,0,23,,914952,1,unknown,1,,,,7,,,,,,,,,,16.666667,-3,6,2,2023,1,43,2478616.0,32161.0,33272.0,1.0,0.0,0.0,1.0,1.0,3.0,1.0,5.0,7.0,1.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,58.275862,796.551724,14.586207,5.034483,0.542430,15.344828,0.0,1.000000,1139.344849,562.956909,26.0,2009.896606,4.761719,32640.50000,405.0,42.344828,37.431896,36.241379,12.479124,580.620690,2011.0,1.0,1.0,1986.0
4,2398558,2023-01-07 00:00:12.137,19.0,Разделы: вокал.\nКатегория ученика: парень 27 ...,1200,60,1630.0,Район: г. Кондрово.\nМесто проведения занятий:...,14,300.0,10,,15.0,3,0,3,0,0,4,1,0,0,0,0,23,,1062677,1,unknown,0,,,,1,,,,,,,,,,20.000000,0,5,1,2023,1,7,2398558.0,230478.0,796778.0,1.0,0.0,1.0,1.0,1.0,5.0,1.0,5.0,7.0,1.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,54.736842,821.052632,15.368421,5.736842,1.421053,3.894737,0.0,0.947368,1839.842163,1254.684204,18.0,1899.763184,4.894326,35131.00000,359.0,39.578947,31.430527,34.368421,11.091034,546.210526,2013.0,1.0,1.0,1992.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1181887,2280502,2022-10-24 19:44:19.933,13.0,"Дополнения: ОГЭ (ГИА), подготовка к олимпиадам...",900,60,792.0,,16,0.0,10,,10.0,3,0,0,0,0,6,0,23,90,0,0,23,,1019632,1,school,1,,,,1,,,,,,,,,,15.000000,0,0,10,2022,0,297,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1181888,2280503,2022-10-24 19:45:30.277,9.0,Разделы: школьный курс.\nКатегория ученика: ...,1000,60,,Место проведения занятий: дистанционно (наприм...,6,2650.0,10,,8.0,2,800,2,3,10,4,0,21,30,0,0,23,,824363,1,unknown,0,,1.0,20.0,1,,,,,,,,,,16.666667,7,0,10,2022,0,297,2280503.0,150537.0,397801.0,1.0,0.0,0.0,1.0,1.0,3.0,1.0,5.0,7.0,1.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,59.166667,711.111111,13.222222,4.388889,0.294054,11.666667,0.0,1.000000,559.833313,263.055542,18.0,1244.277832,4.677951,10077.00000,70.0,26.722222,14.431211,22.722222,11.282683,541.277778,2012.0,1.0,3.0,2009.0
1181889,2280504,2022-10-24 19:45:40.713,1.0,Разделы: школьный курс.\nКатегория ученика: ш...,1000,120,,Место проведения занятий: дистанционно (наприм...,16,0.0,10,983555.0,6.0,2,0,2,0,0,4,0,18,45,0,0,14,2280289.0,362987,1,unknown,0,,,18.0,17,333.333333,80.0,1.0,0.0,1000.0,0.666667,0.0,666.666667,66.666667,8.333333,0,0,10,2022,0,297,2280504.0,259823.0,956472.0,1.0,0.0,0.0,1.0,1.0,5.0,1.0,5.0,7.0,1.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,3.0,0.0,1.0,59.464286,458.928571,15.071429,5.000000,0.559431,17.071429,0.0,1.000000,944.214294,481.736603,26.0,1465.785767,4.946289,26437.00000,310.0,31.250000,25.431896,26.642857,12.205339,544.535714,2011.0,2.0,2.0,1998.0
1181890,2280505,2022-10-24 19:45:44.477,2.0,Разделы: общий курс.\nКатегория ученика: 1 кла...,600,60,489.0,Район: Ленинский район.\nМесто проведения заня...,16,0.0,10,,2.0,3,500,2,0,0,2,1,21,60,1,0,23,,1019633,1,school,0,,,21.0,1,,,,,,,,,,10.000000,0,0,10,2022,0,297,2280505.0,84161.0,85495.0,1.0,0.0,1.0,1.0,1.0,5.0,1.0,5.0,7.0,1.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,60.000000,264.285714,14.000000,5.000000,0.000000,4.000000,0.0,0.714286,305.571442,122.642860,7.0,2017.000000,4.600446,2139.00000,8.0,31.285714,13.431896,25.571429,8.895277,648.000000,2015.0,1.0,4.0,2010.0


In [10]:
# Write the merged frame to disk with a fresh 0..n-1 index.
final_export = final_df.reset_index(drop=True)
final_export.to_feather('final.feather')