In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import numpy as np
import statsmodels.api as sm
from sklearn.cluster import KMeans
#import scikit_posthocs as sp
from scipy.stats import kruskal
from sklearn.utils import resample
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

pd.set_option('display.max_columns', None)  # Отображение всех столбцов
# pd.set_option('display.max_rows', None)  # Ото----бражение всех строк   

regions = pd.read_csv('regions.csv')
starts = pd.read_csv('starts.csv')
volunteers = pd.read_csv('volunteers.csv')


volunteers = volunteers[volunteers.athlete_age < 100]
starts = starts[starts.athlete_age < 80]
starts = starts[starts.finish_time < 5000]

starts['rain.1h'] = starts['rain.1h'].fillna(0)
starts['snow.1h'] = starts['snow.1h'].fillna(0)

cnt_ludei = starts.groupby(['event_name', 'event_date', 'region']).agg(
    cnt_runners = ('athlete_id', 'count')
)

ages_14 = starts[starts['athlete_age'] == 14]['athlete_id'].unique()
ages_15 = starts[starts['athlete_age'] == 15]['athlete_id'].unique()
common_athletes = set(ages_14) & set(ages_15)
starts = starts[~starts['athlete_id'].isin(common_athletes)]



starts['is_male'] = starts['athlete_gender'].apply(lambda x: 1 if x == 'мужской' else 0)
starts = starts[starts['athlete_age'] > 14]

volunteer_counts = volunteers.groupby(['event_date','event_name', 'volunteer_role_name']).size().unstack(fill_value=0)  
starts = starts.merge(volunteer_counts, on=['event_date','event_name'], how='left')  

starts = starts.dropna(subset=[col for col in starts.columns if col not in ['snow.1h', 'rain.1h']])   # удаление где 0 волонтеров в сумме было


unique_dates = (
    pd.concat([
        starts[['athlete_id', 'event_date']],
        volunteers[['athlete_id', 'event_date']]
    ])
    .drop_duplicates()
)
date_counts = unique_dates.groupby('athlete_id').size().reset_index(name='unique_event_count')
date_counts['is_repeated'] = (date_counts['unique_event_count'] > 1).astype(int)
starts = starts.merge(date_counts[['athlete_id', 'is_repeated']], on='athlete_id', how='left')


# starts = starts[~starts['region'].isin(['Республика Адыгея', 'Республика Карелия', 'Липецкая область', 'Сахалинская область', 'Архангельская область'])]  
starts = starts[starts.event_date <= '2023-11-15']


newbie = starts[starts['is_newbie'] == 1]
first_runs = newbie.sort_values(by='event_date').drop_duplicates('athlete_id')
didnt_come = first_runs[first_runs.is_repeated == 0]
come = first_runs[first_runs.is_repeated == 1]
come_with_didnt_come = pd.concat([come, didnt_come])
come_with_didnt_come['cnt_volunteers'] = come_with_didnt_come.iloc[:, 25:52].sum(axis=1)
come_with_didnt_come = come_with_didnt_come.merge(cnt_ludei,on=['event_name', 'event_date', 'region'], how="left")
print(len(come_with_didnt_come))

18114


In [2]:

unique_regions_starts = starts['region'].unique()
regions_filtered = regions[regions['region'].isin(unique_regions_starts)].reset_index(drop=True)
regions_filtered_1 = regions_filtered[regions_filtered['median_income_per_capita_rub'] < regions_filtered['median_income_per_capita_rub'].median()]
regions_filtered_2 = regions_filtered[regions_filtered['median_income_per_capita_rub'] >= regions_filtered['median_income_per_capita_rub'].median()]



In [3]:
regions_filtered_1

Unnamed: 0,region,per_capita_income_rub,median_income_per_capita_rub,milk_consumption_per_capita,sugar_consumption_per_capita,urban_population_pct,male_to_female_ratio,under_working_age_population_pct,working_age_population_pct,above_working_age_population_pct,dependency_ratio,birth_rate_per_1000,death_rate_per_1000,infant_mortality_per_1000_births,natural_increase_rate_per_1000,marriage_rate_per_1000,divorce_rate_per_1000,migration_increase_rate_per_10000,library_books_per_1000,library_users_total,milk_production_total,milk_production_per_cow,forest_coverage_pct,disease_incidence_infectious,disease_incidence_cancer,disease_incidence_blood,disease_incidence_endocrine,disease_incidence_nervous,disease_incidence_eye,disease_incidence_ear,disease_incidence_circulatory,disease_incidence_respiratory,disease_incidence_digestive,disease_incidence_skin,disease_incidence_musculoskeletal,disease_incidence_genitourinary,disease_incidence_congenital,disease_incidence_trauma,disease_incidence_all,air_pollutant_emissions,captured_air_pollutants_pct,wastewater_discharge,gross_preschool_coverage,qualified_workers_grads,mid_level_specialists_grads,bachelors_specialists_masters_grads,electronic_data_exchange_pct,personal_computer_households_pct,internet_access_households_pct,broadband_access_households_pct,daily_internet_usage_pct,bus_passenger_transport,buses_per_100000,unemployed_per_vacancy,stadiums_1500_seats,sports_facilities_fields,sports_facilities_gyms,sports_facilities_pools,alcoholism_patients_per_100k,drug_addiction_patients_per_100k,substance_abuse_patients_per_100k,disabled_population_per_1000,population_total,crimes_murder_attempts,crimes_grievous_bodily_harm,crimes_robbery,crimes_armed_robbery,crimes_hooliganism,crimes_drug_related,nominal_wages_avg_rub,poverty_rate_pct_region
1,Владимирская область,32321,27587.1,216.0,32.0,77.6,1203.0,16.2,55.5,28.3,800,6.6,16.8,4.2,-10.2,6.6,4.4,-23.0,6621,495,471.881,9008.0,51.6,22.9,13.9,1.8,8.9,9.1,22.3,25.0,26.3,567.4,20.3,40.9,30.2,37.1,1.6,99.1,115.7,73,15.5,91.32,88.628641,1.8,5.5,5.3,60.3,67.8,79.4,79.2,81.2,84.1,73.632036,0.321703,30.0,1776,847,71,1392.2,121.8,2.9,90.5,1342.2,54,168,381,28,6,1322,45677,10.4
7,Смоленская область,35094,27356.7,220.0,38.0,72.7,1211.0,15.8,56.1,28.1,783,6.2,16.8,4.2,-10.6,6.9,4.7,-52.0,9132,442,148.973,5223.0,41.8,15.7,7.3,2.2,11.6,9.1,19.0,18.9,30.1,393.2,24.7,31.8,18.6,24.6,2.7,88.5,70.5,48,70.2,44.35,78.45446,0.9,3.7,4.1,60.3,63.7,81.2,80.3,78.5,31.1,105.149701,0.309991,24.0,1320,925,67,1390.4,182.5,5.0,79.0,886.9,50,106,339,20,15,1426,41717,12.8
8,Тамбовская область,34092,27696.7,159.0,53.0,60.4,1169.0,14.2,55.7,30.1,794,6.8,16.8,4.9,-10.0,6.0,4.0,-33.0,9351,517,190.335,7472.0,10.6,16.3,6.8,2.0,9.0,10.7,16.7,20.0,29.1,387.0,12.1,21.4,18.9,23.9,1.6,62.8,106.7,62,15.7,40.37,93.270408,1.4,4.6,5.5,58.5,68.4,83.1,79.9,77.9,56.8,85.899094,0.33044,21.0,2878,656,41,1398.0,74.2,3.8,107.5,979.2,44,99,195,15,21,1575,39346,10.5
14,Калининградская область,34506,29129.9,238.0,46.0,76.6,1115.0,17.7,58.1,24.2,720,7.9,12.7,5.4,-4.8,9.4,5.5,61.0,4284,283,230.582,8427.0,18.8,28.3,18.6,2.1,10.1,12.3,22.7,15.4,47.8,349.4,29.5,39.9,33.0,33.8,2.4,44.3,96.9,27,66.8,88.12,80.648508,0.8,4.2,3.5,53.6,77.9,90.2,89.5,85.3,79.8,110.719015,0.509505,7.0,1134,470,51,761.4,116.9,1.5,66.3,1031.0,30,84,202,18,15,736,47349,12.0
16,Новгородская область,33729,27659.9,242.0,37.0,73.1,1229.0,17.7,54.5,27.8,834,7.4,17.1,4.3,-9.7,7.3,5.0,-1.0,8015,350,66.936,5302.0,64.4,24.5,11.8,1.7,11.9,7.8,17.3,14.3,35.3,503.3,12.1,21.5,17.2,45.4,2.0,77.8,76.1,62,70.9,22.24,87.566581,0.8,2.2,1.4,60.6,66.7,75.9,73.3,73.8,34.6,100.186482,0.411743,2.0,917,332,35,781.7,181.4,6.3,92.8,581.6,35,79,230,22,10,873,45247,11.9
17,Псковская область,33090,26488.6,292.0,37.0,70.9,1206.0,16.4,54.9,28.7,823,7.3,18.3,5.7,-11.0,7.9,5.0,-44.0,10766,263,208.928,7764.0,38.8,23.1,7.9,2.0,6.9,9.8,19.0,19.7,24.7,443.5,23.5,33.6,19.5,26.8,1.2,61.1,83.9,42,28.0,31.59,81.823623,0.7,2.3,1.9,54.9,54.9,73.9,73.8,78.1,33.9,140.697465,0.20752,7.0,639,339,19,1295.2,88.1,1.9,77.7,596.9,24,77,127,11,1,677,38966,14.3
20,Астраханская область,31119,25038.0,200.0,44.0,64.1,1129.0,21.2,56.3,22.5,776,10.1,12.7,5.6,-2.6,6.1,5.7,-50.0,6065,328,178.222,1712.0,1.8,13.2,7.7,3.3,15.1,17.4,27.8,27.4,41.9,375.6,26.8,12.8,35.9,34.3,2.8,47.2,66.9,104,11.8,35.11,63.033869,0.9,5.0,5.5,51.3,72.4,87.9,86.6,86.5,30.1,100.151806,0.619595,8.0,739,577,23,496.2,55.1,0.2,47.2,957.8,29,93,147,19,13,1135,47780,13.8
21,Волгоградская область,31309,26263.8,208.0,33.0,77.6,1135.0,16.3,57.9,25.8,726,7.1,13.7,4.4,-6.6,6.2,4.4,-25.0,5093,701,586.528,7801.0,4.2,17.6,8.2,1.8,12.1,8.9,21.1,22.4,24.6,358.8,17.1,33.2,19.7,25.5,0.8,75.3,69.2,217,38.1,85.92,73.702335,2.2,11.3,11.2,50.3,77.0,88.2,88.1,88.1,115.6,84.046643,0.176614,35.0,2518,1242,65,590.6,83.6,1.3,69.2,2492.8,95,242,886,79,10,1675,44242,9.6
23,Республика Северная Осетия — Алания,29235,23920.4,216.0,45.0,63.2,1142.0,20.0,57.7,22.3,734,11.0,11.2,4.1,-0.2,4.1,5.7,-65.0,4959,161,228.647,5499.0,24.4,17.8,5.6,1.8,7.3,11.9,37.8,34.1,26.3,302.1,11.9,36.8,15.7,22.5,1.4,40.0,43.0,8,32.1,88.55,71.321712,1.1,3.3,4.0,52.2,83.7,95.8,95.8,90.4,28.2,250.753583,2.893756,12.0,734,441,26,601.3,113.7,0.7,79.2,685.4,14,55,37,13,5,1407,36360,13.0
24,Ставропольский край,27626,23985.9,203.0,47.0,60.7,1121.0,18.4,58.4,23.2,712,8.7,11.6,4.7,-2.9,5.7,4.4,-10.0,5269,1133,538.476,7985.0,1.6,26.2,9.7,2.2,8.9,16.3,19.0,17.7,38.4,414.6,32.5,33.4,22.7,44.1,2.3,73.3,56.3,107,44.0,116.41,68.321138,2.2,14.2,14.2,61.3,67.6,85.5,84.6,85.4,55.6,71.803996,0.354557,40.0,2949,1196,116,687.9,121.9,5.0,78.0,2902.5,81,243,348,70,28,3326,41402,11.7


In [4]:
regions_filtered_2

Unnamed: 0,region,per_capita_income_rub,median_income_per_capita_rub,milk_consumption_per_capita,sugar_consumption_per_capita,urban_population_pct,male_to_female_ratio,under_working_age_population_pct,working_age_population_pct,above_working_age_population_pct,dependency_ratio,birth_rate_per_1000,death_rate_per_1000,infant_mortality_per_1000_births,natural_increase_rate_per_1000,marriage_rate_per_1000,divorce_rate_per_1000,migration_increase_rate_per_10000,library_books_per_1000,library_users_total,milk_production_total,milk_production_per_cow,forest_coverage_pct,disease_incidence_infectious,disease_incidence_cancer,disease_incidence_blood,disease_incidence_endocrine,disease_incidence_nervous,disease_incidence_eye,disease_incidence_ear,disease_incidence_circulatory,disease_incidence_respiratory,disease_incidence_digestive,disease_incidence_skin,disease_incidence_musculoskeletal,disease_incidence_genitourinary,disease_incidence_congenital,disease_incidence_trauma,disease_incidence_all,air_pollutant_emissions,captured_air_pollutants_pct,wastewater_discharge,gross_preschool_coverage,qualified_workers_grads,mid_level_specialists_grads,bachelors_specialists_masters_grads,electronic_data_exchange_pct,personal_computer_households_pct,internet_access_households_pct,broadband_access_households_pct,daily_internet_usage_pct,bus_passenger_transport,buses_per_100000,unemployed_per_vacancy,stadiums_1500_seats,sports_facilities_fields,sports_facilities_gyms,sports_facilities_pools,alcoholism_patients_per_100k,drug_addiction_patients_per_100k,substance_abuse_patients_per_100k,disabled_population_per_1000,population_total,crimes_murder_attempts,crimes_grievous_bodily_harm,crimes_robbery,crimes_armed_robbery,crimes_hooliganism,crimes_drug_related,nominal_wages_avg_rub,poverty_rate_pct_region
0,Белгородская область,41022,32811.0,269.0,45.0,65.4,1158.0,16.7,56.4,26.9,773,7.1,14.2,4.2,-7.1,7.4,4.3,-73.0,6044,826,721.776,8543.0,8.8,21.3,8.7,1.6,7.3,18.8,22.9,25.8,37.1,369.4,17.3,33.8,30.8,46.5,0.8,91.3,54.5,158,82.8,58.6,77.615965,1.6,6.8,8.7,57.2,60.2,78.1,77.5,73.7,102.0,92.438101,0.220984,22.0,3272,899,156,598.2,57.8,0.5,123.3,1536.5,102,131,212,21,2,1274,47638,6.1
2,Воронежская область,39319,30806.9,293.0,52.0,68.5,1165.0,16.0,56.4,27.6,773,7.5,15.3,3.6,-7.8,7.3,4.4,3.0,5967,865,1055.849,8645.0,8.3,11.8,6.4,2.0,11.2,8.0,20.5,15.2,37.0,280.6,21.2,19.0,12.1,21.5,0.7,57.9,135.8,115,52.6,121.46,76.13124,2.9,8.7,17.7,55.9,79.5,90.4,89.7,86.1,146.3,106.157577,0.511452,10.0,4296,1338,72,1284.8,217.6,4.1,93.9,2302.6,87,183,451,70,10,2091,46277,7.2
3,Калужская область,35509,31476.3,253.0,27.0,74.9,1109.0,16.8,57.0,26.2,753,7.9,14.2,3.4,-6.3,6.5,5.0,41.0,6675,409,489.544,8959.0,45.1,21.1,10.3,2.3,7.3,19.4,24.9,26.1,31.4,576.4,32.6,49.6,40.0,35.5,1.2,77.5,82.8,31,63.9,72.86,77.115963,0.6,3.9,3.7,58.1,72.6,79.2,75.8,77.1,50.1,105.42997,0.233263,15.0,930,625,76,762.1,104.7,1.0,76.8,1073.3,46,130,248,34,13,1187,53910,7.9
4,Курская область,37632,30339.6,195.0,54.0,68.5,1206.0,17.0,55.2,27.8,812,7.4,16.2,4.6,-8.8,7.0,4.7,-16.0,8195,492,436.559,9256.0,8.2,11.5,9.0,1.4,10.4,7.0,12.7,12.2,31.4,354.0,10.2,30.5,7.5,12.6,0.7,62.7,57.2,51,43.6,7.96,60.614676,1.6,5.4,7.1,59.5,64.0,83.4,83.4,82.1,72.5,106.463337,0.259202,9.0,1095,683,49,886.5,83.2,3.2,110.7,1078.2,89,100,152,19,5,855,46059,8.1
5,Липецкая область,38926,31756.8,223.0,51.0,63.0,1183.0,16.9,56.0,27.1,784,7.1,15.2,3.2,-8.1,6.6,4.3,-24.0,6908,456,308.107,8218.0,8.8,17.6,8.1,1.1,8.3,4.9,20.8,18.9,32.3,304.4,14.4,32.7,17.8,33.4,0.7,79.1,72.8,300,75.1,64.6,76.397823,1.4,4.8,4.0,59.9,64.4,77.8,77.8,76.7,88.9,108.411623,0.285507,22.0,2175,654,58,1098.2,110.1,2.0,100.9,1138.1,44,93,275,19,6,1212,46711,7.2
6,Московская область,55109,48004.7,250.0,43.0,78.3,1103.0,18.5,58.7,22.8,703,8.6,11.7,3.7,-3.1,7.4,4.3,90.0,1928,1756,731.698,8053.0,42.0,19.1,8.8,1.3,6.5,9.0,17.8,16.9,25.7,366.2,17.5,37.5,19.5,23.3,0.6,82.8,69.8,169,97.5,806.36,75.614165,5.7,21.0,15.3,54.6,83.1,89.7,88.8,89.3,672.0,113.062133,0.376523,119.0,3001,2528,348,763.6,178.3,4.3,54.433343,8542.3,344,776,1375,232,107,7257,70705,5.5
9,Тульская область,34442,29584.2,137.0,32.0,73.2,1211.0,14.6,56.0,29.4,787,6.4,16.6,4.8,-10.2,6.9,4.1,-0.208182,4444,533,201.352,8555.0,14.3,15.7,8.4,2.1,15.9,15.2,25.4,20.4,38.1,420.8,25.9,23.3,28.5,31.3,0.8,57.7,104.4,110,79.0,127.58,76.479557,1.5,6.2,5.9,53.8,84.0,90.5,88.9,83.8,63.4,131.355929,0.089644,17.0,1344,638,50,994.6,122.6,4.5,100.1,1496.7,55,153,283,47,4,1053,51218,9.4
10,Ярославская область,38060,30524.1,227.0,50.0,81.0,1226.0,17.7,55.3,27.0,809,7.7,16.2,3.7,-8.5,7.2,4.6,-7.0,6663,451,355.443,7861.0,45.7,22.4,13.3,2.0,14.4,10.4,25.5,26.0,19.8,529.0,22.6,37.0,27.5,38.6,1.8,108.5,97.5,87,14.4,146.39,89.043129,2.4,5.7,5.3,55.5,64.5,77.8,77.3,80.8,117.6,114.34742,0.407472,12.0,1299,569,33,842.0,107.1,3.5,77.7,1205.6,62,149,337,31,14,896,47388,8.8
11,Москва,95465,73458.6,233.0,42.0,100.0,1154.0,15.8,57.3,26.9,744,9.4,9.7,3.5,-0.3,7.9,3.7,71.0,6266,1371,4.38,7692.0,1.0,19.8,11.6,1.0,7.4,8.4,24.7,20.5,11.8,330.7,13.5,41.1,14.8,32.8,1.3,87.3,82.8,65,61.0,736.55,53.872487,5.7,39.2,176.1,50.6,94.4,96.7,96.2,92.4,1036.9,71.152885,0.385396,27.0,16313,4471,499,323.2,159.4,2.9,77.591836,13015.1,413,543,1537,374,206,10334,125638,5.0
12,Республика Коми,46638,34633.8,254.0,34.0,77.7,1161.0,18.8,57.5,23.7,740,8.7,14.3,4.0,-5.6,6.8,5.2,-53.0,6388,347,54.379,5232.0,72.6,41.7,17.9,3.6,10.8,10.6,25.5,20.7,22.3,589.9,19.4,67.6,21.3,53.8,2.0,126.1,139.5,352,34.2,168.12,106.899281,1.1,2.8,2.2,53.2,65.5,80.3,80.3,82.8,44.6,87.550968,0.596464,7.0,772,503,43,1120.1,107.5,2.3,77.1,734.4,70,201,296,23,8,891,68790,14.0


In [5]:
# Определяем условия
conditions = [
    come_with_didnt_come['region'].isin(regions_filtered_1['region']),
    come_with_didnt_come['region'].isin(regions_filtered_2['region'])
]

# Определяем соответствующие значения
choices = ['lower', 'higher']

# Создаем новый столбец 'median'
come_with_didnt_come['median'] = np.select(conditions, choices, default=np.nan)



In [6]:
come_with_didnt_come[(come_with_didnt_come['cnt_runners'] <= 46) & (come_with_didnt_come['cnt_runners'] >= 2)].drop_duplicates(['event_name', 'event_date'])

Unnamed: 0,record_id,athlete_id,event_name,event_date,city,region,finish_time,athlete_gender,athlete_age,is_newbie,temp,feels_like,pressure,humidity,dew_point,clouds,wind_speed,wind_deg,weather.main,weather.description,rain.1h,snow.1h,track_description_surface,is_male,Ведущий велосипед,Завершение мероприятия,Замыкающий,Инструктаж новых участников,Координатор парковки,Координация волонтёров,Лидер для слабовидящих,Маршал,Обработка результатов,Организатор,Организация финиша,Пейсер,Подготовка мероприятия,Помощь в раздаче карточек позиций,Проведение разминки,Проверка карточек позиций,Проверка трассы,Раздача карточек позиций,Разное,Связи с общественностью,Секундомер,Сканирование штрих-кодов,Сортировка карточек,Составление отчёта,Сурдопереводчик,Фотограф,Хранение и доставка оборудования,is_repeated,cnt_volunteers,cnt_runners,median
0,4790,790069326,Елагин остров,2023-01-01,Санкт-Петербург,Санкт-Петербург,1837,женский,38,1,3.94,-1.20,996,88,2.14,100,8.00,260,Clouds,overcast clouds,0.00,0.00,смешанное,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,1,15.0,41,higher
2,4782,790126900,Волгоград панорама,2023-01-01,Волгоград,Волгоградская область,1723,женский,37,1,1.74,-3.68,1025,87,-0.16,75,7.00,260,Clouds,broken clouds,0.00,0.00,твердое,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1,13.0,30,lower
4,4822,790126893,Раменское Городской парк,2023-01-01,Раменское,Московская область,2380,мужской,52,1,1.29,-2.06,1008,100,1.29,100,3.13,180,Drizzle,light intensity drizzle,0.00,0.22,твердое,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,2.0,0.0,1,11.0,19,higher
6,4819,790126890,Интернационалистов,2023-01-01,Санкт-Петербург,Санкт-Петербург,1333,мужской,36,1,3.91,-1.24,993,88,2.11,100,8.00,260,Rain,light rain,0.25,0.00,твердое,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1,13.0,11,higher
7,4794,790126885,Самара парк Гагарина,2023-01-01,Самара,Самарская область,1437,мужской,32,1,-1.91,-1.91,1017,94,-2.65,95,1.31,302,Clouds,overcast clouds,0.00,0.00,твердое,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1,12.0,40,higher
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18026,12376,790157928,Красноярск набережная,2023-11-11,Красноярск,Красноярский край,1499,мужской,45,1,-12.90,-18.56,1038,92,-13.82,76,2.68,211,Clouds,broken clouds,0.00,0.00,твердое,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,3.0,0.0,0,16.0,23,higher
18032,12395,790158116,Южноуральск,2023-11-11,Южноуральск,Челябинская область,2621,женский,35,1,4.41,1.18,1013,93,3.38,100,3.92,150,Rain,light rain,0.43,0.00,твердое,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0,11.0,24,lower
18054,12496,790157712,Волгоград панорама,2023-11-11,Волгоград,Волгоградская область,1900,женский,24,1,3.81,3.81,1026,93,2.78,0,1.00,240,Clear,clear sky,0.00,0.00,твердое,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,5.0,0.0,1.0,0.0,2.0,1.0,0.0,3.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0,21.0,44,lower
18062,12543,790158254,Владикавказ Дендрарий,2023-11-11,Владикавказ,Республика Северная Осетия — Алания,1848,мужской,20,1,10.23,9.32,1025,77,6.38,11,2.67,181,Clouds,few clouds,0.00,0.00,грунт,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0,6.0,9,lower


для медианы ниэе 

In [7]:
new_come = come_with_didnt_come[(come_with_didnt_come['cnt_runners'] <= 46) & (come_with_didnt_come['cnt_runners'] >= 2)].drop_duplicates(['event_name', 'event_date'])
new_come = come_with_didnt_come[come_with_didnt_come['median'] == 'lower']
X = new_come
y = new_come 
print(len(come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 2) & (come_with_didnt_come.cnt_runners <= 46) & (come_with_didnt_come.median == 'lower')].drop_duplicates(['event_name', 'event_date'])))


need_columns = ['finish_time', 'athlete_age', 'feels_like', 'pressure', 'humidity', 'wind_speed', 
                'Ведущий велосипед', 
                'Завершение мероприятия', 'Замыкающий', 'Инструктаж новых участников', 'Маршал', 'Обработка результатов', 'Организатор',
                'Организация финиша', 'Пейсер', 'Подготовка мероприятия', 
            'Проверка карточек позиций', 'Проверка трассы', 'Раздача карточек позиций',
                'Разное', 'Связи с общественностью', 'Секундомер', 'Сканирование штрих-кодов', 'Сортировка карточек',
                'Фотограф', 'Хранение и доставка оборудования',
                  'cnt_runners'
                  ]
                






for col1 in need_columns:
    for col2 in need_columns:
        if col1 != col2:
            try:
                coef = X[col1].corr(X[col2],method='spearman')
                if abs(coef) > 0.8:
                    print('Возможна мультиколлинеарность: ', coef, col1, col2)
            except:
                print('ошибка в цикле на проверке корреляций между ', col1, " ", col2)

for_model = X[need_columns]


for_model = sm.add_constant(for_model)
y = y['is_repeated']
model = sm.Logit(y, for_model).fit()
print('\n')
print('Модель для предсказывания is_repeated новичка, в пробежках где было 2-46 людей')
print('\n')
print(model.summary())
marginal_effects = model.get_margeff()
print('\n')
print('Предельные эффекты:')
print(marginal_effects.summary())
print('\n')

y_pred = model.predict(for_model)
y_pred_class = (y_pred > 0.5).astype(int)  
cm = confusion_matrix(y, y_pred_class)
print(cm)
accuracy = (y_pred_class == y).mean()  
print(f'Accuracy: {accuracy:.2f}')  

0
Optimization terminated successfully.
         Current function value: 0.665728
         Iterations 5


Модель для предсказывания is_repeated новичка, в пробежках где было 2-46 людей


                           Logit Regression Results                           
Dep. Variable:            is_repeated   No. Observations:                 5243
Model:                          Logit   Df Residuals:                     5215
Method:                           MLE   Df Model:                           27
Date:                Mon, 16 Dec 2024   Pseudo R-squ.:                 0.02428
Time:                        12:50:08   Log-Likelihood:                -3490.4
converged:                       True   LL-Null:                       -3577.3
Covariance Type:            nonrobust   LLR p-value:                 2.215e-23
                                       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------

In [8]:
new_come = come_with_didnt_come[(come_with_didnt_come['cnt_runners'] <= 138) & (come_with_didnt_come['cnt_runners'] >= 46)].drop_duplicates(['event_name', 'event_date'])
new_come = come_with_didnt_come[come_with_didnt_come['median'] == 'lower']
X = new_come
y = new_come 
print(len(come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 46) & (come_with_didnt_come.cnt_runners <= 138) & (come_with_didnt_come.median == 'lower')].drop_duplicates(['event_name', 'event_date'])))


need_columns = ['finish_time', 'athlete_age', 'feels_like', 'pressure', 'humidity', 'wind_speed', 
                'Ведущий велосипед', 
                'Завершение мероприятия', 'Замыкающий', 'Инструктаж новых участников', 'Маршал', 'Обработка результатов', 'Организатор',
                'Организация финиша', 'Пейсер', 'Подготовка мероприятия', 
            'Проверка карточек позиций', 'Проверка трассы', 'Раздача карточек позиций',
                'Разное', 'Связи с общественностью', 'Секундомер', 'Сканирование штрих-кодов', 'Сортировка карточек',
                'Фотограф', 'Хранение и доставка оборудования',
                  'cnt_runners'
                  ]
                






for col1 in need_columns:
    for col2 in need_columns:
        if col1 != col2:
            try:
                coef = X[col1].corr(X[col2],method='spearman')
                if abs(coef) > 0.8:
                    print('Возможна мультиколлинеарность: ', coef, col1, col2)
            except:
                print('ошибка в цикле на проверке корреляций между ', col1, " ", col2)

for_model = X[need_columns]


for_model = sm.add_constant(for_model)
y = y['is_repeated']
model = sm.Logit(y, for_model).fit()
print('\n')
print('Модель для предсказывания is_repeated новичка, в пробежках где было 2-46 людей')
print('\n')
print(model.summary())
marginal_effects = model.get_margeff()
print('\n')
print('Предельные эффекты:')
print(marginal_effects.summary())
print('\n')

y_pred = model.predict(for_model)
y_pred_class = (y_pred > 0.5).astype(int)  
cm = confusion_matrix(y, y_pred_class)
print(cm)
accuracy = (y_pred_class == y).mean()  
print(f'Accuracy: {accuracy:.2f}')  

0
Optimization terminated successfully.
         Current function value: 0.665728
         Iterations 5


Модель для предсказывания is_repeated новичка, в пробежках где было 2-46 людей


                           Logit Regression Results                           
Dep. Variable:            is_repeated   No. Observations:                 5243
Model:                          Logit   Df Residuals:                     5215
Method:                           MLE   Df Model:                           27
Date:                Mon, 16 Dec 2024   Pseudo R-squ.:                 0.02428
Time:                        12:50:11   Log-Likelihood:                -3490.4
converged:                       True   LL-Null:                       -3577.3
Covariance Type:            nonrobust   LLR p-value:                 2.215e-23
                                       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------

Значимо маршал и инструктаж

In [9]:
new_come = come_with_didnt_come[(come_with_didnt_come['cnt_runners'] <= 404) & (come_with_didnt_come['cnt_runners'] >= 138)].drop_duplicates(['event_name', 'event_date'])
new_come = come_with_didnt_come[come_with_didnt_come['median'] == 'lower']
X = new_come
y = new_come 
print(len(come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 46) & (come_with_didnt_come.cnt_runners <= 138) & (come_with_didnt_come.median == 'lower')].drop_duplicates(['event_name', 'event_date'])))


need_columns = ['finish_time', 'athlete_age', 'feels_like', 'pressure', 'humidity', 'wind_speed', 
                'Ведущий велосипед', 
                'Завершение мероприятия', 'Замыкающий', 'Инструктаж новых участников', 'Маршал', 'Обработка результатов', 'Организатор',
                'Организация финиша', 'Пейсер', 'Подготовка мероприятия', 
            'Проверка карточек позиций', 'Проверка трассы', 'Раздача карточек позиций',
                'Разное', 'Связи с общественностью', 'Секундомер', 'Сканирование штрих-кодов', 'Сортировка карточек',
                'Фотограф', 'Хранение и доставка оборудования',
                  'cnt_runners'
                  ]
                






for col1 in need_columns:
    for col2 in need_columns:
        if col1 != col2:
            try:
                coef = X[col1].corr(X[col2],method='spearman')
                if abs(coef) > 0.8:
                    print('Возможна мультиколлинеарность: ', coef, col1, col2)
            except:
                print('ошибка в цикле на проверке корреляций между ', col1, " ", col2)

for_model = X[need_columns]


for_model = sm.add_constant(for_model)
y = y['is_repeated']
model = sm.Logit(y, for_model).fit()
print('\n')
print('Модель для предсказывания is_repeated новичка, в пробежках где было 2-46 людей')
print('\n')
print(model.summary())
marginal_effects = model.get_margeff()
print('\n')
print('Предельные эффекты:')
print(marginal_effects.summary())
print('\n')

y_pred = model.predict(for_model)
y_pred_class = (y_pred > 0.5).astype(int)  
cm = confusion_matrix(y, y_pred_class)
print(cm)
accuracy = (y_pred_class == y).mean()  
print(f'Accuracy: {accuracy:.2f}')  

0
Optimization terminated successfully.
         Current function value: 0.665728
         Iterations 5


Модель для предсказывания is_repeated новичка, в пробежках где было 2-46 людей


                           Logit Regression Results                           
Dep. Variable:            is_repeated   No. Observations:                 5243
Model:                          Logit   Df Residuals:                     5215
Method:                           MLE   Df Model:                           27
Date:                Mon, 16 Dec 2024   Pseudo R-squ.:                 0.02428
Time:                        12:51:13   Log-Likelihood:                -3490.4
converged:                       True   LL-Null:                       -3577.3
Covariance Type:            nonrobust   LLR p-value:                 2.215e-23
                                       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------

In [10]:
new_come = come_with_didnt_come[(come_with_didnt_come['cnt_runners'] <= 46) & (come_with_didnt_come['cnt_runners'] >= 2)].drop_duplicates(['event_name', 'event_date'])
new_come = come_with_didnt_come[come_with_didnt_come['median'] == 'higher']
X = new_come
y = new_come 
print(len(come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 46) & (come_with_didnt_come.cnt_runners <= 138) & (come_with_didnt_come.median == 'lower')].drop_duplicates(['event_name', 'event_date'])))


need_columns = ['finish_time', 'athlete_age', 'feels_like', 'pressure', 'humidity', 'wind_speed', 
                'Ведущий велосипед', 
                'Завершение мероприятия', 'Замыкающий', 'Инструктаж новых участников', 'Маршал', 'Обработка результатов', 'Организатор',
                'Организация финиша', 'Пейсер', 'Подготовка мероприятия', 
            'Проверка карточек позиций', 'Проверка трассы', 'Раздача карточек позиций',
                'Разное', 'Связи с общественностью', 'Секундомер', 'Сканирование штрих-кодов', 'Сортировка карточек',
                'Фотограф', 'Хранение и доставка оборудования',
                  'cnt_runners'
                  ]
                






for col1 in need_columns:
    for col2 in need_columns:
        if col1 != col2:
            try:
                coef = X[col1].corr(X[col2],method='spearman')
                if abs(coef) > 0.8:
                    print('Возможна мультиколлинеарность: ', coef, col1, col2)
            except:
                print('ошибка в цикле на проверке корреляций между ', col1, " ", col2)

for_model = X[need_columns]


for_model = sm.add_constant(for_model)
y = y['is_repeated']
model = sm.Logit(y, for_model).fit()
print('\n')
print('Модель для предсказывания is_repeated новичка, в пробежках где было 2-46 людей')
print('\n')
print(model.summary())
marginal_effects = model.get_margeff()
print('\n')
print('Предельные эффекты:')
print(marginal_effects.summary())
print('\n')

y_pred = model.predict(for_model)
y_pred_class = (y_pred > 0.5).astype(int)  
cm = confusion_matrix(y, y_pred_class)
print(cm)
accuracy = (y_pred_class == y).mean()  
print(f'Accuracy: {accuracy:.2f}')  

0
Optimization terminated successfully.
         Current function value: 0.645654
         Iterations 6


Модель для предсказывания is_repeated новичка, в пробежках где было 2-46 людей


                           Logit Regression Results                           
Dep. Variable:            is_repeated   No. Observations:                12871
Model:                          Logit   Df Residuals:                    12843
Method:                           MLE   Df Model:                           27
Date:                Mon, 16 Dec 2024   Pseudo R-squ.:                 0.06779
Time:                        12:52:41   Log-Likelihood:                -8310.2
converged:                       True   LL-Null:                       -8914.5
Covariance Type:            nonrobust   LLR p-value:                1.218e-237
                                       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------

мало записей

In [12]:
new_come = come_with_didnt_come[(come_with_didnt_come['cnt_runners'] <= 404) & (come_with_didnt_come['cnt_runners'] >= 138)].drop_duplicates(['event_name', 'event_date'])
new_come = come_with_didnt_come[come_with_didnt_come['median'] == 'higher']
X = new_come
y = new_come 
print(len(come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 46) & (come_with_didnt_come.cnt_runners <= 138) & (come_with_didnt_come.median == 'lower')].drop_duplicates(['event_name', 'event_date'])))


need_columns = ['finish_time', 'athlete_age', 'feels_like', 'pressure', 'humidity', 'wind_speed', 
                'Ведущий велосипед', 
                'Завершение мероприятия', 'Замыкающий', 'Инструктаж новых участников', 'Маршал', 'Обработка результатов', 'Организатор',
                'Организация финиша', 'Пейсер', 'Подготовка мероприятия', 
            'Проверка карточек позиций', 'Проверка трассы', 'Раздача карточек позиций',
                'Разное', 'Связи с общественностью', 'Секундомер', 'Сканирование штрих-кодов', 'Сортировка карточек',
                'Фотограф', 'Хранение и доставка оборудования',
                  'cnt_runners'
                  ]
                






for col1 in need_columns:
    for col2 in need_columns:
        if col1 != col2:
            try:
                coef = X[col1].corr(X[col2],method='spearman')
                if abs(coef) > 0.8:
                    print('Возможна мультиколлинеарность: ', coef, col1, col2)
            except:
                print('ошибка в цикле на проверке корреляций между ', col1, " ", col2)

for_model = X[need_columns]


for_model = sm.add_constant(for_model)
y = y['is_repeated']
model = sm.Logit(y, for_model).fit()
print('\n')
print('Модель для предсказывания is_repeated новичка, в пробежках где было 2-46 людей')
print('\n')
print(model.summary())
marginal_effects = model.get_margeff()
print('\n')
print('Предельные эффекты:')
print(marginal_effects.summary())
print('\n')

y_pred = model.predict(for_model)
y_pred_class = (y_pred > 0.5).astype(int)  
cm = confusion_matrix(y, y_pred_class)
print(cm)
accuracy = (y_pred_class == y).mean()  
print(f'Accuracy: {accuracy:.2f}')  

0
Optimization terminated successfully.
         Current function value: 0.645654
         Iterations 6


Модель для предсказывания is_repeated новичка, в пробежках где было 2-46 людей


                           Logit Regression Results                           
Dep. Variable:            is_repeated   No. Observations:                12871
Model:                          Logit   Df Residuals:                    12843
Method:                           MLE   Df Model:                           27
Date:                Mon, 16 Dec 2024   Pseudo R-squ.:                 0.06779
Time:                        12:53:57   Log-Likelihood:                -8310.2
converged:                       True   LL-Null:                       -8914.5
Covariance Type:            nonrobust   LLR p-value:                1.218e-237
                                       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------

мало записей

In [16]:
X = come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 46) & (come_with_didnt_come.cnt_runners <= 138) & (come_with_didnt_come.federal_district == 'Северо-Западный')]
y = come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 46) & (come_with_didnt_come.cnt_runners <= 138) & (come_with_didnt_come.federal_district == 'Северо-Западный')]
print(len(come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 46) & (come_with_didnt_come.cnt_runners <= 138) & (come_with_didnt_come.federal_district == 'Северо-Западный')].drop_duplicates(['event_name', 'event_date'])))


need_columns = ['finish_time', 'athlete_age', 'feels_like', 'pressure', 'humidity', 'wind_speed', 
                'Ведущий велосипед', 
                'Завершение мероприятия', 'Замыкающий', 'Инструктаж новых участников', 'Маршал', 'Обработка результатов', 'Организатор',
                'Организация финиша', 'Пейсер', 'Подготовка мероприятия', 
            'Проверка карточек позиций', 'Проверка трассы', 'Раздача карточек позиций',
                'Разное', 'Связи с общественностью', 'Секундомер', 'Сканирование штрих-кодов', 'Сортировка карточек',
                'Фотограф', 'Хранение и доставка оборудования',
                  'cnt_runners'
                  ]
                






for col1 in need_columns:
    for col2 in need_columns:
        if col1 != col2:
            try:
                coef = X[col1].corr(X[col2],method='spearman')
                if abs(coef) > 0.8:
                    print('Возможна мультиколлинеарность: ', coef, col1, col2)
            except:
                print('ошибка в цикле на проверке корреляций между ', col1, " ", col2)

for_model = X[need_columns]


for_model = sm.add_constant(for_model)
y = y['is_repeated']
model = sm.Logit(y, for_model).fit()
print('\n')
print('Модель для предсказывания is_repeated новичка, в пробежках где было 2-46 людей')
print('\n')
print(model.summary())
marginal_effects = model.get_margeff()
print('\n')
print('Предельные эффекты:')
print(marginal_effects.summary())
print('\n')

y_pred = model.predict(for_model)
y_pred_class = (y_pred > 0.5).astype(int)  
cm = confusion_matrix(y, y_pred_class)
print(cm)
accuracy = (y_pred_class == y).mean()  
print(f'Accuracy: {accuracy:.2f}')  

152




Optimization terminated successfully.
         Current function value: 0.658461
         Iterations 6


LinAlgError: Singular matrix

мало записей

In [19]:
X = come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 2) & (come_with_didnt_come.cnt_runners <= 46) & (come_with_didnt_come.federal_district == 'Уральский')]
y = come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 2) & (come_with_didnt_come.cnt_runners <= 46) & (come_with_didnt_come.federal_district == 'Уральский')]
print(len(come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 2) & (come_with_didnt_come.cnt_runners <= 46) & (come_with_didnt_come.federal_district == 'Уральский')].drop_duplicates(['event_name', 'event_date'])))


need_columns = ['finish_time', 'athlete_age', 'feels_like', 'pressure', 'humidity', 'wind_speed', 
                'Ведущий велосипед', 
                'Завершение мероприятия', 'Замыкающий', 'Инструктаж новых участников', 'Маршал', 'Обработка результатов', 'Организатор',
                'Организация финиша', 'Пейсер', 'Подготовка мероприятия', 
            'Проверка карточек позиций', 'Проверка трассы', 'Раздача карточек позиций',
                'Разное', 'Связи с общественностью', 'Секундомер', 'Сканирование штрих-кодов', 'Сортировка карточек',
                'Фотограф', 'Хранение и доставка оборудования',
                  'cnt_runners'
                  ]
                






for col1 in need_columns:
    for col2 in need_columns:
        if col1 != col2:
            try:
                coef = X[col1].corr(X[col2],method='spearman')
                if abs(coef) > 0.8:
                    print('Возможна мультиколлинеарность: ', coef, col1, col2)
            except:
                print('ошибка в цикле на проверке корреляций между ', col1, " ", col2)

for_model = X[need_columns]


for_model = sm.add_constant(for_model)
y = y['is_repeated']
model = sm.Logit(y, for_model).fit()
print('\n')
print('Модель для предсказывания is_repeated новичка, в пробежках где было 2-46 людей')
print('\n')
print(model.summary())
marginal_effects = model.get_margeff()
print('\n')
print('Предельные эффекты:')
print(marginal_effects.summary())
print('\n')

y_pred = model.predict(for_model)
y_pred_class = (y_pred > 0.5).astype(int)  
cm = confusion_matrix(y, y_pred_class)
print(cm)
accuracy = (y_pred_class == y).mean()  
print(f'Accuracy: {accuracy:.2f}')  

167




Optimization terminated successfully.
         Current function value: 0.611533
         Iterations 6


Модель для предсказывания is_repeated новичка, в пробежках где было 2-46 людей


                           Logit Regression Results                           
Dep. Variable:            is_repeated   No. Observations:                  505
Model:                          Logit   Df Residuals:                      478
Method:                           MLE   Df Model:                           26
Date:                Sun, 15 Dec 2024   Pseudo R-squ.:                  0.1026
Time:                        00:32:37   Log-Likelihood:                -308.82
converged:                       True   LL-Null:                       -344.15
Covariance Type:            nonrobust   LLR p-value:                 5.345e-06
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------

Сканирование штрих кодов значимость

In [20]:
X = come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 46) & (come_with_didnt_come.cnt_runners <= 138) & (come_with_didnt_come.federal_district == 'Уральский')]
y = come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 46) & (come_with_didnt_come.cnt_runners <= 138) & (come_with_didnt_come.federal_district == 'Уральский')]
print(len(come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 46) & (come_with_didnt_come.cnt_runners <= 138) & (come_with_didnt_come.federal_district == 'Уральский')].drop_duplicates(['event_name', 'event_date'])))


need_columns = ['finish_time', 'athlete_age', 'feels_like', 'pressure', 'humidity', 'wind_speed', 
                'Ведущий велосипед', 
                'Завершение мероприятия', 'Замыкающий', 'Инструктаж новых участников', 'Маршал', 'Обработка результатов', 'Организатор',
                'Организация финиша', 'Пейсер', 'Подготовка мероприятия', 
            'Проверка карточек позиций', 'Проверка трассы', 'Раздача карточек позиций',
                'Разное', 'Связи с общественностью', 'Секундомер', 'Сканирование штрих-кодов', 'Сортировка карточек',
                'Фотограф', 'Хранение и доставка оборудования',
                  'cnt_runners'
                  ]
                






for col1 in need_columns:
    for col2 in need_columns:
        if col1 != col2:
            try:
                coef = X[col1].corr(X[col2],method='spearman')
                if abs(coef) > 0.8:
                    print('Возможна мультиколлинеарность: ', coef, col1, col2)
            except:
                print('ошибка в цикле на проверке корреляций между ', col1, " ", col2)

for_model = X[need_columns]


for_model = sm.add_constant(for_model)
y = y['is_repeated']
model = sm.Logit(y, for_model).fit()
print('\n')
print('Модель для предсказывания is_repeated новичка, в пробежках где было 2-46 людей')
print('\n')
print(model.summary())
marginal_effects = model.get_margeff()
print('\n')
print('Предельные эффекты:')
print(marginal_effects.summary())
print('\n')

y_pred = model.predict(for_model)
y_pred_class = (y_pred > 0.5).astype(int)  
cm = confusion_matrix(y, y_pred_class)
print(cm)
accuracy = (y_pred_class == y).mean()  
print(f'Accuracy: {accuracy:.2f}')  

104
Optimization terminated successfully.
         Current function value: 0.641088
         Iterations 6


Модель для предсказывания is_repeated новичка, в пробежках где было 2-46 людей


                           Logit Regression Results                           
Dep. Variable:            is_repeated   No. Observations:                 1096
Model:                          Logit   Df Residuals:                     1068
Method:                           MLE   Df Model:                           27
Date:                Sun, 15 Dec 2024   Pseudo R-squ.:                 0.06170
Time:                        00:40:41   Log-Likelihood:                -702.63
converged:                       True   LL-Null:                       -748.83
Covariance Type:            nonrobust   LLR p-value:                 4.390e-09
                                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------

Сканирование штрих кодов, пейсер

In [21]:
X = come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 138) & (come_with_didnt_come.cnt_runners <= 404) & (come_with_didnt_come.federal_district == 'Уральский')]
y = come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 138) & (come_with_didnt_come.cnt_runners <= 404) & (come_with_didnt_come.federal_district == 'Уральский')]
print(len(come_with_didnt_come[(come_with_didnt_come.cnt_runners >= 138) & (come_with_didnt_come.cnt_runners <= 404) & (come_with_didnt_come.federal_district == 'Уральский')].drop_duplicates(['event_name', 'event_date'])))


need_columns = ['finish_time', 'athlete_age', 'feels_like', 'pressure', 'humidity', 'wind_speed', 
                'Ведущий велосипед', 
                'Завершение мероприятия', 'Замыкающий', 'Инструктаж новых участников', 'Маршал', 'Обработка результатов', 'Организатор',
                'Организация финиша', 'Пейсер', 'Подготовка мероприятия', 
            'Проверка карточек позиций', 'Проверка трассы', 'Раздача карточек позиций',
                'Разное', 'Связи с общественностью', 'Секундомер', 'Сканирование штрих-кодов', 'Сортировка карточек',
                'Фотограф', 'Хранение и доставка оборудования',
                  'cnt_runners'
                  ]
                






for col1 in need_columns:
    for col2 in need_columns:
        if col1 != col2:
            try:
                coef = X[col1].corr(X[col2],method='spearman')
                if abs(coef) > 0.8:
                    print('Возможна мультиколлинеарность: ', coef, col1, col2)
            except:
                print('ошибка в цикле на проверке корреляций между ', col1, " ", col2)

for_model = X[need_columns]


for_model = sm.add_constant(for_model)
y = y['is_repeated']
model = sm.Logit(y, for_model).fit()
print('\n')
print('Модель для предсказывания is_repeated новичка, в пробежках где было 2-46 людей')
print('\n')
print(model.summary())
marginal_effects = model.get_margeff()
print('\n')
print('Предельные эффекты:')
print(marginal_effects.summary())
print('\n')

y_pred = model.predict(for_model)
y_pred_class = (y_pred > 0.5).astype(int)  
cm = confusion_matrix(y, y_pred_class)
print(cm)
accuracy = (y_pred_class == y).mean()  
print(f'Accuracy: {accuracy:.2f}')  

41




Optimization terminated successfully.
         Current function value: 0.648587
         Iterations 5


LinAlgError: Singular matrix