In [1]:
# все импорты
import numpy as np
import pandas as pd

# разделение выборки на тренировочную и тестовую
from sklearn.model_selection import train_test_split

# модели
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# метрики качества
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [2]:
# функция оценивания данных
def fit_score(x, y):
    models = []
    x = pd.get_dummies(x, columns=x.select_dtypes(include=['category', 'object']).columns, drop_first=True)
    x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                    test_size = 0.3, random_state=17, stratify=y)
    log_reg = LogisticRegression(random_state=17) 
    forest = RandomForestClassifier(random_state=17)
    svm = LinearSVC(random_state=17)
    models.append(log_reg)
    models.append(forest)
    models.append(svm)
    
    for i in models:
        print('MODEL:', i)
        i.fit(x_train, y_train)
        res = i.predict(x_test)
        print('ACCURACY', accuracy_score(res, y_test))
        print('RECALL', recall_score(res, y_test))
        print('PRECISION', precision_score(res, y_test))
        print('F1 - SCORE', f1_score(res, y_test))
        print('-'*30, '\n')

# функция вывода пропущенных значений
def na(df):
    df_na = df.isna().sum().reset_index()
    df_na = df_na.rename(columns={'index':'columns', 0:'total_missing'})
    df_na['%%%'] = df_na['total_missing']*100/df.shape[0] 
    return df_na


def to_bool(df):
    print(df.dtypes)
    # смотрим на категориальные признаки
    to_boolean = []
    for i in df.select_dtypes(include='object'):
        print('-'*15,'\n', i)
        print('Количество уникальных элементов:', len(df[i].unique()))
        print('Уникальные элементы:', df[i].unique())
        if len(df[i].unique()) == 2:
            to_boolean.append(i)
    print('-'*30)
    print('\n\nПризнаки с двумя категориями: ', to_boolean)
    print('-'*30)
    answer = input('Перевести признаки в булевый тип?\n')
    if answer == 'Ya':
        for i in to_boolean:
            print(i)
            dict_bool = {df[i].unique()[0]:0, df[i].unique()[1]:1}
            df[i] = df[i].map(dict_bool).astype('bool')
            print('ОБОЗНАЧЕНИЯ: ', dict_bool)
            print('-'*30)
    return df

# Датасет 1. Heart Disease.
https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease

In [3]:
# первый взгляд на данные
heart = pd.read_csv('Downloads/heart_2020_cleaned.csv')
heart.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


## Предобработка.

In [4]:
# посмотрим на общие сведения по данным
heart.describe()
# стандартное отклонение у всех числовых признаков небольшое, поэтому можно обойтись без нормализации

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,319795.0,319795.0,319795.0,319795.0
mean,28.325399,3.37171,3.898366,7.097075
std,6.3561,7.95085,7.955235,1.436007
min,12.02,0.0,0.0,1.0
25%,24.03,0.0,0.0,6.0
50%,27.34,0.0,0.0,7.0
75%,31.42,2.0,3.0,8.0
max,94.85,30.0,30.0,24.0


In [5]:
# смотрим на пропуски
na(heart)
# круто, пропусков нет

Unnamed: 0,columns,total_missing,%%%
0,HeartDisease,0,0.0
1,BMI,0,0.0
2,Smoking,0,0.0
3,AlcoholDrinking,0,0.0
4,Stroke,0,0.0
5,PhysicalHealth,0,0.0
6,MentalHealth,0,0.0
7,DiffWalking,0,0.0
8,Sex,0,0.0
9,AgeCategory,0,0.0


In [6]:
# переведем некоторые категориальные признаки в булевые
heart = to_bool(heart)

HeartDisease         object
BMI                 float64
Smoking              object
AlcoholDrinking      object
Stroke               object
PhysicalHealth      float64
MentalHealth        float64
DiffWalking          object
Sex                  object
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity     object
GenHealth            object
SleepTime           float64
Asthma               object
KidneyDisease        object
SkinCancer           object
dtype: object
--------------- 
 HeartDisease
Количество уникальных элементов: 2
Уникальные элементы: ['No' 'Yes']
--------------- 
 Smoking
Количество уникальных элементов: 2
Уникальные элементы: ['Yes' 'No']
--------------- 
 AlcoholDrinking
Количество уникальных элементов: 2
Уникальные элементы: ['No' 'Yes']
--------------- 
 Stroke
Количество уникальных элементов: 2
Уникальные элементы: ['No' 'Yes']
--------------- 
 DiffWalking
Количество уникальных элементов: 2
Уникальные элементы: ['No

In [7]:
#Все остальные объекты на самом деле категории, поменяем им тип
for i in heart.select_dtypes(include='object'):
    heart[i] = heart[i].astype('category')

In [8]:
heart.dtypes
#Прелесть:) 

HeartDisease            bool
BMI                  float64
Smoking                 bool
AlcoholDrinking         bool
Stroke                  bool
PhysicalHealth       float64
MentalHealth         float64
DiffWalking             bool
Sex                     bool
AgeCategory         category
Race                category
Diabetic            category
PhysicalActivity        bool
GenHealth           category
SleepTime            float64
Asthma                  bool
KidneyDisease           bool
SkinCancer              bool
dtype: object

## Предсказания.

In [9]:
fit_score(heart.drop('HeartDisease', axis=1), heart['HeartDisease'])

MODEL: LogisticRegression(random_state=17)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ACCURACY 0.9154358498629337
RECALL 0.5322055953155498
PRECISION 0.09961032635168046
F1 - SCORE 0.1678120832905939
------------------------------ 

MODEL: RandomForestClassifier(random_state=17)
ACCURACY 0.905106369672396
RECALL 0.3437281009110021
PRECISION 0.11945932781295665
F1 - SCORE 0.17729983733959878
------------------------------ 

MODEL: LinearSVC(random_state=17)
ACCURACY 0.9035637227821846
RECALL 0.3721101819970487
PRECISION 0.1842425718460789
F1 - SCORE 0.2464570776999511
------------------------------ 





Предсказания не очень хорошие, но это связано с тем, что целевой признак несбалансирован. Вообще я могу это исправить с помощью SMOTE, но это не входит в задание, поэтому не буду перегружать код.

# ДАТАСЕТ 2. Kickstarter.
https://www.kaggle.com/datasets/kemical/kickstarter-projects

In [10]:
kick = pd.read_csv('Downloads/archive/ks-projects-201801.csv')
kick.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [11]:
print('Количество всех строк:', kick.shape[0])
print('Количество уникальных ID:', len(kick.ID.unique()))

#айди слишком громоздко выглядит, поэтому просто избавимся от него и оставим обычные индексы
kick.drop('ID', inplace=True, axis=1)

#создадим нормальный столбец-таргет
kick['is_successful'] = kick['state'].apply(lambda x: 1 if x =='successful' else 0)
kick.drop('state', axis=1, inplace = True)
kick.head()

Количество всех строк: 378661
Количество уникальных ID: 378661


Unnamed: 0,name,category,main_category,currency,deadline,goal,launched,pledged,backers,country,usd pledged,usd_pledged_real,usd_goal_real,is_successful
0,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,0,GB,0.0,0.0,1533.95,0
1,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,15,US,100.0,2421.0,30000.0,0
2,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,3,US,220.0,220.0,45000.0,0
3,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,1,US,1.0,1.0,5000.0,0
4,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,14,US,1283.0,1283.0,19500.0,0


In [12]:
# посмотрим на пропуски
na(kick)
# пропуски есть в двух столбцах: столбец name я отброшу, потому что у каждого проекта свое уникальное название

Unnamed: 0,columns,total_missing,%%%
0,name,4,0.001056
1,category,0,0.0
2,main_category,0,0.0
3,currency,0,0.0
4,deadline,0,0.0
5,goal,0,0.0
6,launched,0,0.0
7,pledged,0,0.0
8,backers,0,0.0
9,country,0,0.0


In [13]:
kick.drop('name', axis=1, inplace=True)

In [14]:
# посмотрим подробнее на пропуски в столбце usd pledged
kick.loc[kick['usd pledged'].isna()]
# видим, что тут везде очень странные значения страны, разберемся с этим подробнее

Unnamed: 0,category,main_category,currency,deadline,goal,launched,pledged,backers,country,usd pledged,usd_pledged_real,usd_goal_real,is_successful
169,Film & Video,Film & Video,USD,2014-09-20,6500.0,2014-08-06 21:28:36,555.00,0,"N,0""",,555.00,6500.00,0
328,Music,Music,AUD,2015-08-25,4500.0,2015-08-04 12:05:17,4767.00,0,"N,0""",,3402.08,3211.53,0
632,Music,Music,USD,2015-04-09,3500.0,2015-03-10 20:06:13,3576.00,0,"N,0""",,3576.00,3500.00,0
647,Music,Music,USD,2015-11-26,6000.0,2015-11-02 22:09:19,7007.80,0,"N,0""",,7007.80,6000.00,0
749,Music,Music,USD,2016-03-21,3000.0,2016-02-23 03:09:49,3660.38,0,"N,0""",,3660.38,3000.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
378233,Film & Video,Film & Video,USD,2015-02-03,7500.0,2014-12-05 04:19:14,10.00,0,"N,0""",,10.00,7500.00,0
378303,Film & Video,Film & Video,CAD,2014-05-23,3000.0,2014-04-08 00:30:09,3102.00,0,"N,0""",,2845.61,2752.04,0
378434,Music,Music,USD,2016-02-05,5000.0,2016-01-06 21:59:23,235.00,0,"N,0""",,235.00,5000.00,0
378585,Music,Music,GBP,2015-10-19,2000.0,2015-09-21 22:33:18,2125.00,0,"N,0""",,3273.36,3080.81,0


### Работаем со странными странами:)

In [15]:
strange = kick.loc[kick['country']=='N,0"']
strange.head()

Unnamed: 0,category,main_category,currency,deadline,goal,launched,pledged,backers,country,usd pledged,usd_pledged_real,usd_goal_real,is_successful
169,Film & Video,Film & Video,USD,2014-09-20,6500.0,2014-08-06 21:28:36,555.0,0,"N,0""",,555.0,6500.0,0
328,Music,Music,AUD,2015-08-25,4500.0,2015-08-04 12:05:17,4767.0,0,"N,0""",,3402.08,3211.53,0
632,Music,Music,USD,2015-04-09,3500.0,2015-03-10 20:06:13,3576.0,0,"N,0""",,3576.0,3500.0,0
647,Music,Music,USD,2015-11-26,6000.0,2015-11-02 22:09:19,7007.8,0,"N,0""",,7007.8,6000.0,0
749,Music,Music,USD,2016-03-21,3000.0,2016-02-23 03:09:49,3660.38,0,"N,0""",,3660.38,3000.0,0


In [16]:
#замечаем, что тут есть один прикол: usd pledged везде nan, проверим кое-что:
print('Количество пустых значений в колонке "usd pledged": ', kick['usd pledged'].isna().sum())
print('Среднее количество сторонников: ', strange.backers.mean())
print('Количество успехов: ', strange.is_successful.sum())
# удивительно, но при среднем количестве сторонников = 0, тут каким-то образом есть успешные проекты

# по всей видимости, это какие-то битые данные и использовать их для предсказания не стоит, так что выбросим
kick.drop(kick.loc[kick['country']=='N,0"'].index, axis=0, inplace=True)

Количество пустых значений в колонке "usd pledged":  3797
Среднее количество сторонников:  0.0
Количество успехов:  105


In [17]:
# выбросы заменим на среднее значение успешных проектов (ибо оно больше 75%)
kick['backers'] = kick['backers'].apply(lambda x: 264 if (x > 264) else x)
# теперь все более менее норм

In [18]:
#считаем среднее количество сторонников для каждой категории
kick['mean_backers'] = kick.groupby('category')['backers'].transform('mean')

kick.head()

Unnamed: 0,category,main_category,currency,deadline,goal,launched,pledged,backers,country,usd pledged,usd_pledged_real,usd_goal_real,is_successful,mean_backers
0,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,0,GB,0.0,0.0,1533.95,0,21.865595
1,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,15,US,100.0,2421.0,30000.0,0,53.660112
2,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,3,US,220.0,220.0,45000.0,0,53.660112
3,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,1,US,1.0,1.0,5000.0,0,48.853148
4,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,14,US,1283.0,1283.0,19500.0,0,34.449588


In [19]:
#не думаю, что категории маленькие еще пригодятся, поэтому избавимся от них
kick.drop('category', axis=1, inplace=True)

In [20]:
#Создать разницу между дедлайном и запуском проекта и мб посчитать что-то, связанное с суммой, как новую фичу

kick['deadline'] = pd.to_datetime(kick['deadline']).dt.date
kick['launched'] = pd.to_datetime(kick['launched']).dt.date
kick['how_many_time'] = kick['deadline'] - kick['launched']
kick['how_many_time'] = kick['how_many_time'].dt.days
border_value = kick['how_many_time'].quantile(.99)
kick['how_many_time'] = kick['how_many_time'].apply(lambda x: border_value if (x>border_value) else x)

In [21]:
kick.dtypes

main_category        object
currency             object
deadline             object
goal                float64
launched             object
pledged             float64
backers               int64
country              object
usd pledged         float64
usd_pledged_real    float64
usd_goal_real       float64
is_successful         int64
mean_backers        float64
how_many_time       float64
dtype: object

In [22]:
kick.drop(['launched', 'deadline', 'currency'], axis=1, inplace=True)

In [23]:
fit_score(kick.drop('is_successful',axis=1), kick['is_successful'])

MODEL: LogisticRegression(random_state=17)
ACCURACY 0.967250577983283
RECALL 0.96735091361062
PRECISION 0.9400089650363582
F1 - SCORE 0.9534839663033456
------------------------------ 

MODEL: RandomForestClassifier(random_state=17)
ACCURACY 0.9940601102614263
RECALL 0.9843014128728415
PRECISION 0.999302719394362
F1 - SCORE 0.9917453413078938
------------------------------ 

MODEL: LinearSVC(random_state=17)
ACCURACY 0.8969944869286858
RECALL 0.9882767106432429
PRECISION 0.720066739715111
F1 - SCORE 0.833117238597401
------------------------------ 





# ДАТАСЕТ 3. PROMOTIONS!
https://www.kaggle.com/datasets/muhammadimran112233/employees-evaluation-for-promotion

In [24]:
# считываем данные и смотрим, как они вообще выглядят
prom = pd.read_csv('employee_promotion.csv', index_col = 'employee_id')

prom.head()

Unnamed: 0_level_0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,is_promoted
employee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,0,49.0,0
65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,60.0,0
7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,50.0,0
2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,50.0,0
48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,73.0,0


In [25]:
# форматируем данные в нужный тип

prom.is_promoted = prom.is_promoted.astype('bool')
prom.region = prom.region.astype('category')
prom.department = prom.department.astype('category')
prom['gender'] = prom.gender.map(lambda x: True if x == 'm' else False)
prom.recruitment_channel = prom.recruitment_channel.astype('category')

In [26]:
na(prom)

Unnamed: 0,columns,total_missing,%%%
0,department,0,0.0
1,region,0,0.0
2,education,2409,4.395344
3,gender,0,0.0
4,recruitment_channel,0,0.0
5,no_of_trainings,0,0.0
6,age,0,0.0
7,previous_year_rating,4124,7.524449
8,length_of_service,0,0.0
9,awards_won,0,0.0


## Обработка пропущенных значений.
Подробнее об этом я написалав отчете

In [27]:
prom.loc[prom['previous_year_rating'].isna()].describe()
prom['is_newby'] = (prom['previous_year_rating'].fillna(0) == 0)
prom['previous_year_rating'] = prom['previous_year_rating'].fillna(0)

In [28]:
prom.loc[prom['avg_training_score'].isna()].describe()
prom['avg_training_score'] = prom['avg_training_score'].fillna(0)

In [29]:
prom.loc[prom['education'].isna()].describe()
prom['education'] = prom['education'].fillna('Without')
prom.education = prom.education.astype('category')

In [30]:
prom

Unnamed: 0_level_0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,is_promoted,is_newby
employee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
65438,Sales & Marketing,region_7,Master's & above,False,sourcing,1,35,5.0,8,0,49.0,False,False
65141,Operations,region_22,Bachelor's,True,other,1,30,5.0,4,0,60.0,False,False
7513,Sales & Marketing,region_19,Bachelor's,True,sourcing,1,34,3.0,7,0,50.0,False,False
2542,Sales & Marketing,region_23,Bachelor's,True,other,2,39,1.0,10,0,50.0,False,False
48945,Technology,region_26,Bachelor's,True,other,1,45,3.0,2,0,73.0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3030,Technology,region_14,Bachelor's,True,sourcing,1,48,3.0,17,0,78.0,False,False
74592,Operations,region_27,Master's & above,False,other,1,37,2.0,6,0,56.0,False,False
13918,Analytics,region_1,Bachelor's,True,other,1,27,5.0,3,0,79.0,False,False
13614,Sales & Marketing,region_9,Without,True,sourcing,1,29,1.0,2,0,0.0,False,False


In [31]:
fit_score(prom.drop(['is_promoted', 'region'], axis=1), prom['is_promoted'])

MODEL: LogisticRegression(random_state=17)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ACCURACY 0.9185063552879645
RECALL 0.7419354838709677
PRECISION 0.06571428571428571
F1 - SCORE 0.12073490813648294
------------------------------ 

MODEL: RandomForestClassifier(random_state=17)
ACCURACY 0.9306695858419997
RECALL 0.7117263843648208
PRECISION 0.31214285714285717
F1 - SCORE 0.4339622641509434
------------------------------ 

MODEL: LinearSVC(random_state=17)
ACCURACY 0.9148573861217539
RECALL 0.0
PRECISION 0.0
F1 - SCORE 0.0
------------------------------ 



  _warn_prf(average, modifier, msg_start, len(result))
