In [1]:
# все импорты
import numpy as np
import pandas as pd

# Датасет 1. Heart Disease.

In [2]:
# первый взгляд на данные
df = pd.read_csv('Downloads/heart_2020_cleaned.csv')
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


The dataset contains 18 variables (9 booleans, 5 strings and 4 decimals). In machine learning projects, "HeartDisease" can be used as the explonatory variable, but note that the classes are heavily unbalanced.

HeartDisease
Respondents that have ever reported having coronary heart disease (CHD) or myocardial infarction (MI)

BMI
Body Mass Index (BMI)

Smoking
Have you smoked at least 100 cigarettes in your entire life? [Note: 5 packs = 100 cigarettes]

AlcoholDrinking
Heavy drinkers (adult men having more than 14 drinks per week and adult women having more than 7 drinks per week

Stroke
(Ever told) (you had) a stroke?

PhysicalHealth
Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good? (0-30 days)

MentalHealth
Thinking about your mental health, for how many days during the past 30 days was your mental health not good? (0-30 days)

DiffWalking
Do you have serious difficulty walking or climbing stairs?

Sex
Are you male or female?

AgeCategory
Fourteen-level age category

Race
Imputed race/ethnicity value

Diabetic
(Ever told) (you had) diabetes?

PhysicalActivity
Adults who reported doing physical activity or exercise during the past 30 days other than their regular job

GenHealth
Would you say that in general your health is...

SleepTime
On average, how many hours of sleep do you get in a 24-hour period?

Asthma
(Ever told) (you had) asthma?

KidneyDisease
Not including kidney stones, bladder infection or incontinence, were you ever told you had kidney disease?

SkinCancer
(Ever told) (you had) skin cancer?

### 1. Обработка пропущенных значений (если они, конечно, есть)

In [3]:
df.isna().sum()
# Пропущенных значений нет. Чудесно:>

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

### 2. Работаем с типами данных.

In [4]:
df.dtypes
# Так, вот тут будет веселье. Начнем!!

HeartDisease         object
BMI                 float64
Smoking              object
AlcoholDrinking      object
Stroke               object
PhysicalHealth      float64
MentalHealth        float64
DiffWalking          object
Sex                  object
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity     object
GenHealth            object
SleepTime           float64
Asthma               object
KidneyDisease        object
SkinCancer           object
dtype: object

In [5]:
# Посмотрим на признаки типа object, обычно с ними больше всего работы
# Так, предлагаю сразу создать список признаков с двумя возможными категориями, чтобы потом поменять их на тип boolean
to_boolean = []
for i in df.select_dtypes(include='object'):
    print('-'*15,'\n', i)
    print('Количество уникальных элементов:', len(df[i].unique()))
    print('Уникальные элементы:', df[i].unique())
    if len(df[i].unique()) == 2:
        to_boolean.append(i)
print('\n\nПризнаки с двумя категориями: ', to_boolean)

--------------- 
 HeartDisease
Количество уникальных элементов: 2
Уникальные элементы: ['No' 'Yes']
--------------- 
 Smoking
Количество уникальных элементов: 2
Уникальные элементы: ['Yes' 'No']
--------------- 
 AlcoholDrinking
Количество уникальных элементов: 2
Уникальные элементы: ['No' 'Yes']
--------------- 
 Stroke
Количество уникальных элементов: 2
Уникальные элементы: ['No' 'Yes']
--------------- 
 DiffWalking
Количество уникальных элементов: 2
Уникальные элементы: ['No' 'Yes']
--------------- 
 Sex
Количество уникальных элементов: 2
Уникальные элементы: ['Female' 'Male']
--------------- 
 AgeCategory
Количество уникальных элементов: 13
Уникальные элементы: ['55-59' '80 or older' '65-69' '75-79' '40-44' '70-74' '60-64' '50-54'
 '45-49' '18-24' '35-39' '30-34' '25-29']
--------------- 
 Race
Количество уникальных элементов: 6
Уникальные элементы: ['White' 'Black' 'Asian' 'American Indian/Alaskan Native' 'Other'
 'Hispanic']
--------------- 
 Diabetic
Количество уникальных элемен

In [6]:
# тут много булевых признаков, которые pandas считает объектами из-за того, что вместо 0 и 1 там Yes и No
# сразу создадим маленький словарик, чтобы упростить себе жизнь
yn_to_bool = {'No':0, 'Yes':1}

# пол пока что не буду переводить в булевский признак, чтобы было легче интерпретировать результаты анализа
df['Sex'] = df['Sex'].map({'Male':0, 'Female':1}).astype('bool')
to_boolean.remove('Sex')

for i in to_boolean:
    df[i] = df[i].map(yn_to_bool).astype('bool')

In [7]:
#Все остальные объекты на самом деле категории, поменяем им тип
for i in df.select_dtypes(include='object'):
    df[i] = df[i].astype('category')

In [8]:
df.dtypes
#Прелесть:) 

HeartDisease            bool
BMI                  float64
Smoking                 bool
AlcoholDrinking         bool
Stroke                  bool
PhysicalHealth       float64
MentalHealth         float64
DiffWalking             bool
Sex                     bool
AgeCategory         category
Race                category
Diabetic            category
PhysicalActivity        bool
GenHealth           category
SleepTime            float64
Asthma                  bool
KidneyDisease           bool
SkinCancer              bool
dtype: object

### 4. Предсказываем!!

In [15]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [29]:
def fit_score(x, y):
    models = []
    x = pd.get_dummies(x, columns=x.select_dtypes(include=['category', 'object']).columns, drop_first=True)
    x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                    test_size = 0.3, random_state=17, stratify=y)
    log_reg = LogisticRegression(random_state=17) 
    forest = RandomForestClassifier(random_state=17)
    svm = LinearSVC(random_state=17)
    models.append(log_reg)
    models.append(forest)
    models.append(svm)
    
    for i in models:
        print('MODEL:', i)
        i.fit(x_train, y_train)
        res = i.predict(x_test)
        print('ACCURACY', accuracy_score(res, y_test))
        print('RECALL', recall_score(res, y_test))
        print('PRECISION', precision_score(res, y_test))
        print('F1 - SCORE', f1_score(res, y_test))
        print('-'*30, '\n')

In [17]:
fit_score(df.drop('HeartDisease', axis=1), df['HeartDisease'])

MODEL: LogisticRegression(random_state=17)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ACCURACY 0.9154566964425311
RECALL 0.533689126084056
PRECISION 0.0974184120798831
F1 - SCORE 0.16476161054474306
------------------------------ 

MODEL: RandomForestClassifier(random_state=17)
ACCURACY 0.9048979038764214
RECALL 0.34
PRECISION 0.1179980516317584
F1 - SCORE 0.17519435906707648
------------------------------ 

MODEL: LinearSVC(random_state=17)




ACCURACY 0.879746505592095
RECALL 0.3303398305949587
PRECISION 0.394179249878227
F1 - SCORE 0.359447004608295
------------------------------ 



# ДАТАСЕТ 2. Kickstarter.

In [18]:
data = pd.read_csv('Downloads/archive/ks-projects-201801.csv')
data.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [19]:
print('Количество всех строк:', data.shape[0])
print('Количество уникальных ID:', len(data.ID.unique()))

#айди слишком громоздко выглядит, поэтому просто избавимся от него и оставим обычные индексы
data.drop('ID', inplace=True, axis=1)

#создадим нормальный столбец-таргет
data['is_successful'] = data['state'].apply(lambda x: 1 if x =='successful' else 0)
data.drop('state', axis=1, inplace = True)
data.head()

Количество всех строк: 378661
Количество уникальных ID: 378661


Unnamed: 0,name,category,main_category,currency,deadline,goal,launched,pledged,backers,country,usd pledged,usd_pledged_real,usd_goal_real,is_successful
0,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,0,GB,0.0,0.0,1533.95,0
1,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,15,US,100.0,2421.0,30000.0,0
2,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,3,US,220.0,220.0,45000.0,0
3,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,1,US,1.0,1.0,5000.0,0
4,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,14,US,1283.0,1283.0,19500.0,0


### Работаем со странными аномалиями

In [20]:
#Люди говорят, что в данных есть аномалии со странами, посмотрим, что там не так
print(data.country.value_counts())

strange = data.loc[data['country']=='N,0"']
print(strange.head())
#замечаем, что тут есть один прикол: usd pledged везде nan, проверим кое-что:
print('Количество пустых значений в колонке "usd pledged": ', data['usd pledged'].isna().sum())
print('Среднее количество сторонников: ', strange.backers.mean())
print('Количество успехов: ', strange.is_successful.sum())
# удивительно, но при среднем количестве сторонников = 0, тут каким-то образом есть успешные проекты

# по всей видимости, это какие-то битые данные и использовать их для предсказания не стоит, так что выбросим
data.drop(data.loc[data['country']=='N,0"'].index, axis=0, inplace=True)
print(data.country.value_counts())

US      292627
GB       33672
CA       14756
AU        7839
DE        4171
N,0"      3797
FR        2939
IT        2878
NL        2868
ES        2276
SE        1757
MX        1752
NZ        1447
DK        1113
IE         811
CH         761
NO         708
HK         618
BE         617
AT         597
SG         555
LU          62
JP          40
Name: country, dtype: int64
                                          name      category main_category  \
169              STREETFIGHTERZ WHEELIE MURICA  Film & Video  Film & Video   
328                Duncan Woods - Chameleon EP         Music         Music   
632  The Making of Ashley Kelley's Debut Album         Music         Music   
647               Butter Side Down Debut Album         Music         Music   
749                    Chase Goehring debut EP         Music         Music   

    currency    deadline    goal             launched  pledged  backers  \
169      USD  2014-09-20  6500.0  2014-08-06 21:28:36   555.00        0   
328     

In [21]:
# выбросы заменим на среднее значение успешных проектов (ибо оно больше 75%)
data['backers'] = data['backers'].apply(lambda x: 264 if (x > 264) else x)

# теперь все более менее норм

In [22]:
#считаем среднее количество сторонников для каждой категории
data['mean_backers'] = data.groupby('category')['backers'].transform('mean')

data.head()

Unnamed: 0,name,category,main_category,currency,deadline,goal,launched,pledged,backers,country,usd pledged,usd_pledged_real,usd_goal_real,is_successful,mean_backers
0,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,0,GB,0.0,0.0,1533.95,0,21.865595
1,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,15,US,100.0,2421.0,30000.0,0,53.660112
2,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,3,US,220.0,220.0,45000.0,0,53.660112
3,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,1,US,1.0,1.0,5000.0,0,48.853148
4,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,14,US,1283.0,1283.0,19500.0,0,34.449588


In [23]:
#не думаю, что категории маленькие еще пригодятся, поэтому избавимся от них
data.drop('category', axis=1, inplace=True)

In [24]:
#Создать разницу между дедлайном и запуском проекта и мб посчитать что-то, связанное с суммой, как новую фичу

data['deadline'] = pd.to_datetime(data['deadline']).dt.date
data['launched'] = pd.to_datetime(data['launched']).dt.date
data['how_many_time'] = data['deadline'] - data['launched']
data['how_many_time'] = data['how_many_time'].dt.days
border_value = data['how_many_time'].quantile(.99)
data['how_many_time'] = data['how_many_time'].apply(lambda x: border_value if (x>border_value) else x)

In [25]:
data.dtypes

name                 object
main_category        object
currency             object
deadline             object
goal                float64
launched             object
pledged             float64
backers               int64
country              object
usd pledged         float64
usd_pledged_real    float64
usd_goal_real       float64
is_successful         int64
mean_backers        float64
how_many_time       float64
dtype: object

In [26]:
data.drop(['launched', 'deadline', 'currency', 'name'], axis=1, inplace=True)

In [30]:
fit_score(data.drop('is_successful',axis=1), data['is_successful'])

MODEL: LogisticRegression(random_state=17)
ACCURACY 0.967250577983283
RECALL 0.96735091361062
PRECISION 0.9400089650363582
F1 - SCORE 0.9534839663033456
------------------------------ 

MODEL: RandomForestClassifier(random_state=17)
ACCURACY 0.9940601102614263
RECALL 0.9843014128728415
PRECISION 0.999302719394362
F1 - SCORE 0.9917453413078938
------------------------------ 

MODEL: LinearSVC(random_state=17)




ACCURACY 0.8969944869286858
RECALL 0.9882767106432429
PRECISION 0.720066739715111
F1 - SCORE 0.833117238597401
------------------------------ 



# ДАТАСЕТ 3. PROMOTIONS!

In [43]:
# считываем данные и смотрим, как они вообще выглядят
prom = pd.read_csv('employee_promotion.csv', index_col = 'employee_id')

prom.head()

Unnamed: 0_level_0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,is_promoted
employee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,0,49.0,0
65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,60.0,0
7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,50.0,0
2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,50.0,0
48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,73.0,0


In [44]:
# форматируем данные в нужный тип

prom.is_promoted = prom.is_promoted.astype('bool')
prom.region = prom.region.astype('category')
prom.department = prom.department.astype('category')
prom['gender'] = prom.gender.map(lambda x: True if x == 'm' else False)
prom.recruitment_channel = prom.recruitment_channel.astype('category')

In [49]:
prom.isna().sum()

department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
awards_won              0
avg_training_score      0
is_promoted             0
is_newby                0
dtype: int64

In [46]:
prom.loc[prom['previous_year_rating'].isna()].describe()
prom['is_newby'] = (prom['previous_year_rating'].fillna(0) == 0)
prom['previous_year_rating'] = prom['previous_year_rating'].fillna(0)

In [47]:
prom.loc[prom['avg_training_score'].isna()].describe()
prom['avg_training_score'] = prom['avg_training_score'].fillna(0)

In [48]:
prom.loc[prom['education'].isna()].describe()
prom['education'] = prom['education'].fillna('Without')
prom.education = prom.education.astype('category')

In [50]:
prom

Unnamed: 0_level_0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,is_promoted,is_newby
employee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
65438,Sales & Marketing,region_7,Master's & above,False,sourcing,1,35,5.0,8,0,49.0,False,False
65141,Operations,region_22,Bachelor's,True,other,1,30,5.0,4,0,60.0,False,False
7513,Sales & Marketing,region_19,Bachelor's,True,sourcing,1,34,3.0,7,0,50.0,False,False
2542,Sales & Marketing,region_23,Bachelor's,True,other,2,39,1.0,10,0,50.0,False,False
48945,Technology,region_26,Bachelor's,True,other,1,45,3.0,2,0,73.0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3030,Technology,region_14,Bachelor's,True,sourcing,1,48,3.0,17,0,78.0,False,False
74592,Operations,region_27,Master's & above,False,other,1,37,2.0,6,0,56.0,False,False
13918,Analytics,region_1,Bachelor's,True,other,1,27,5.0,3,0,79.0,False,False
13614,Sales & Marketing,region_9,Without,True,sourcing,1,29,1.0,2,0,0.0,False,False


In [52]:
fit_score(prom.drop(['is_promoted', 'region'], axis=1), prom['is_promoted'])

MODEL: LogisticRegression(random_state=17)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ACCURACY 0.9185063552879645
RECALL 0.7419354838709677
PRECISION 0.06571428571428571
F1 - SCORE 0.12073490813648294
------------------------------ 

MODEL: RandomForestClassifier(random_state=17)
ACCURACY 0.9306695858419997
RECALL 0.7117263843648208
PRECISION 0.31214285714285717
F1 - SCORE 0.4339622641509434
------------------------------ 

MODEL: LinearSVC(random_state=17)
ACCURACY 0.9148573861217539
RECALL 0.0
PRECISION 0.0
F1 - SCORE 0.0
------------------------------ 



  _warn_prf(average, modifier, msg_start, len(result))
