# Задание

**Задача:** необходимо создать библиотеку AutoML для решения задачи бинарной классификации.

На вход должны поступать данные с факторами и таргетом, а на выходе - модель. 

# План

1. Считываем данные
2. Сводная информация о датасете
3. Генерирование фичей.
4. Делаем перебор моделей: Dummy , Logreg и сводный отчет
  - Заполняем пропуски.
5. Рекомендации по отчету.


====== Дополнительно =========
6. Визуализация данных
7. Оптимизация датасета
8. Зафиксировать seed

In [2]:
    #Отключим предупреждения Anaconda
    import warnings
    warnings.simplefilter('ignore')

    # Подключаем графические модули:
    # будем отображать графики прямо в jupyter'e
    %matplotlib inline
    import seaborn as sns
    import matplotlib.pyplot as plt

    # будем отображать графики прямо в jupyter'e
    %pylab inline

    #графики в svg выглядят более четкими
    #%config InlineBackend.figure_format = 'svg' 

    #увеличим дефолтный размер графиков
    from pylab import rcParams
    rcParams['figure.figsize'] = 8, 5
    from matplotlib import pyplot


    #Подключаем модули для работы с ML
    import numpy as np
    import pandas as pd

    #Для моделей
    from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, TimeSeriesSplit
    from sklearn.metrics import mean_squared_error

    from scipy import stats


    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.dummy import DummyClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.svm import SVC
    from sklearn.pipeline import Pipeline

    #from sklearn.learning_curve import validation_curve
    #from sklearn.learning_curve import learning_curve
    from sklearn.metrics import accuracy_score ,roc_auc_score


    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import StratifiedKFold
    from sklearn.ensemble import RandomForestClassifier

    from collections.abc import Iterable
    from itertools import product

Populating the interactive namespace from numpy and matplotlib


In [3]:
import category_encoders as ce

## 1. Скачиваем данные

In [3]:
df = pd.read_csv('../ml_data/train.csv')

target_col = 'Survived'

In [4]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 1. 1  Формируем отчет по фичам DF

In [141]:
# Формируем отчет по фичам DF

dict_null = df.isnull().sum().to_dict()
dict_type = df.dtypes.to_dict()

dict_unique = dict()
for column in df.columns:
    dict_unique[column] = len(df[column].unique())


report_df = df.describe(include='all').T.rename_axis('features').reset_index()
report_df['feat_count_null'] = report_df['features'].map(dict_null)
report_df['feat_type'] = report_df['features'].map(dict_type)
report_df['unique'] = report_df['features'].map(dict_unique)



report_df.head(20)

Unnamed: 0,features,count,unique,top,freq,mean,std,min,25%,50%,75%,max,feat_count_null,feat_type
0,PassengerId,891,891,,,446.0,257.354,1.0,223.5,446.0,668.5,891.0,0,int64
1,Survived,891,2,,,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0,0,int64
2,Pclass,891,3,,,2.30864,0.836071,1.0,2.0,3.0,3.0,3.0,0,int64
3,Name,891,891,"Nicola-Yarred, Master. Elias",1.0,,,,,,,,0,object
4,Sex,891,2,male,577.0,,,,,,,,0,object
5,Age,714,89,,,29.6991,14.5265,0.42,20.125,28.0,38.0,80.0,177,float64
6,SibSp,891,7,,,0.523008,1.10274,0.0,0.0,0.0,1.0,8.0,0,int64
7,Parch,891,7,,,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0,0,int64
8,Ticket,891,681,347082,7.0,,,,,,,,0,object
9,Fare,891,248,,,32.2042,49.6934,0.0,7.9104,14.4542,31.0,512.329,0,float64


### 1. 2.  Формируем отчет по таргету  DF

In [142]:
# # Формируем отчет по фичам DF

target_report = df.groupby(target_col).count().reset_index()

dict_count_traget = df[target_col].value_counts().to_dict()
dict_count_traget_norm = df[target_col].value_counts(normalize=True).to_dict()

target_report['count'] = target_report[target_col].map(dict_count_traget)
target_report['count_norm'] = target_report[target_col].map(dict_count_traget_norm)

target_report.head()


Unnamed: 0,Survived,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,count,count_norm
0,0,549,549,549,549,424,549,549,549,549,68,549,549,0.616162
1,1,342,342,342,342,290,342,342,342,342,136,340,342,0.383838


In [9]:
# Посмотрим размер датасета и пропущенные значения.
print("Размер датасета:", df.shape)

Размер датасета: (891, 12)


In [None]:
# Перебор классификаторов

### Разбиение на train / test

In [87]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier



df = pd.read_csv('train.csv')

y = df['Survived']
X = df.select_dtypes(include=['number']).drop(['Survived'], axis=1)
X.fillna(-1, inplace=True)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [89]:
def get_params_combination(param_grid):
    iterator = product(*[v if isinstance(v, Iterable) else [v] for v in param_grid.values()])
    return [dict(zip(param_grid.keys(), values)) for values in iterator]
    
    

In [90]:
classifiers = [LogisticRegression,
               KNeighborsClassifier,
               #GradientBoostingClassifier(), 
               RandomForestClassifier] 
#               SVC()] # 

classifiers_name = ['LogisticRegression',
    
                    'KNeighborsClassifier',
                    #'GradientBoostingClassifier', 
                    'RandomForestClassifier'] 
#                    'SVC']

In [91]:
# Настройка параметров выбранных алгоритмов с помощью GridSearchCV 
n_folds = 5
scores = []
fits = []
logistic_params = {'penalty': ('l1', 'l2'),
                   'C': (.01,.1,1,5)}

knn_params = {'n_neighbors': list(range(3, 6, 2))}


gbm_params = {'n_estimators': [100, 300, 500],
              'learning_rate':(0.1, 0.5, 1),
              'max_depth': list(range(3, 6)), 
              'min_samples_leaf': list(range(10, 31, 10))}



forest_params = {'n_estimators': [10, 30, 50],
                 'criterion': ('gini', 'entropy')}

#svm_param = {'kernel' : ('linear', 'rbf'), 'C': (.5, 1, 2)} - очень долго считал
#params = [logistic_params, knn_params, gbm_params, forest_params]

params = [logistic_params, knn_params ,forest_params]

### Осуществляем перебор и формируем финальную таблицу

In [75]:
classifiers_name[1]

'RandomForestClassifier'

In [37]:
import time

In [92]:
%%time
np.random.seed(0)

df1 = pd.DataFrame()

skf = StratifiedKFold(n_splits=2, random_state=0)

for i , each_classifier in enumerate(classifiers):
    clf = each_classifier
    clf_params = params[i]
    clf_classifiers_name = classifiers_name[i]
    print("classifiers_name", clf_classifiers_name)
    
    for tmp_params in get_params_combination(clf_params):
        print("Параметры: ", tmp_params)
        skf_index = skf.split(X_train, y_train)
        for fold, (train_idx, test_idx) in enumerate(skf_index):
            print("Размер тренировочного / тестового датасета: ", len(train_idx), len(test_idx))
            
            # Формируем тренеровочный и валидационный датасет
            X_train_tmp, X_test_tmp = X_train.iloc[train_idx], X_train.iloc[test_idx]
            y_train_tmp, y_test_tmp = y_train.iloc[train_idx], y_train.iloc[test_idx]
            
            # Получаем модель
            tmp_clf = clf(**tmp_params)

            
            # Замеряем время fit
            start_time = time.time()
            pred = tmp_clf.fit(X_train_tmp, y_train_tmp)
            fit_time = time.time() - start_time
            
            
            # Замеряем время predict
            start_time = time.time()
            pred = tmp_clf.predict(X_test_tmp)
            predict_time = time.time() - start_time
            
            tmp_params_string = ", ".join(("{}={}".format(*i) for i in tmp_params.items()))
            
            data = {'model_name' : clf_classifiers_name, 
                    'fold' : fold,
                    'params' : tmp_params_string,
                    'fit_time' : fit_time, 
                    'predict_time' : predict_time,
                    'roc_auc':roc_auc_score(y_test_tmp, pred)
                
            }
            
            # Расширяем другими параметрами
            data.update(tmp_params)
            
            # Формируем финальный датафрейм
            df1 = df1.append(data, ignore_index=True)
            
df1.head()


classifiers_name LogisticRegression
Параметры:  {'penalty': 'l1', 'C': 0.01}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l1', 'C': 0.1}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l1', 'C': 1}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l1', 'C': 5}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l2', 'C': 0.01}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l2', 'C': 0.1}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l2', 'C': 1}
Размер тренировочного / тестового датасета:  311 312

Unnamed: 0,C,fit_time,fold,model_name,params,penalty,predict_time,roc_auc,n_neighbors,criterion,n_estimators
0,0.01,0.00231,0.0,LogisticRegression,"penalty=l1, C=0.01",l1,0.000886,0.550844,,,
1,0.01,0.002255,1.0,LogisticRegression,"penalty=l1, C=0.01",l1,0.001248,0.557964,,,
2,0.1,0.002397,0.0,LogisticRegression,"penalty=l1, C=0.1",l1,0.000935,0.607143,,,
3,0.1,0.002266,1.0,LogisticRegression,"penalty=l1, C=0.1",l1,0.000817,0.589907,,,
4,1.0,0.002677,0.0,LogisticRegression,"penalty=l1, C=1",l1,0.000769,0.609254,,,


In [93]:
df1.head(50)

Unnamed: 0,C,fit_time,fold,model_name,params,penalty,predict_time,roc_auc,n_neighbors,criterion,n_estimators
0,0.01,0.00231,0.0,LogisticRegression,"penalty=l1, C=0.01",l1,0.000886,0.550844,,,
1,0.01,0.002255,1.0,LogisticRegression,"penalty=l1, C=0.01",l1,0.001248,0.557964,,,
2,0.1,0.002397,0.0,LogisticRegression,"penalty=l1, C=0.1",l1,0.000935,0.607143,,,
3,0.1,0.002266,1.0,LogisticRegression,"penalty=l1, C=0.1",l1,0.000817,0.589907,,,
4,1.0,0.002677,0.0,LogisticRegression,"penalty=l1, C=1",l1,0.000769,0.609254,,,
5,1.0,0.003026,1.0,LogisticRegression,"penalty=l1, C=1",l1,0.000845,0.59634,,,
6,5.0,0.002978,0.0,LogisticRegression,"penalty=l1, C=5",l1,0.000843,0.630014,,,
7,5.0,0.003145,1.0,LogisticRegression,"penalty=l1, C=5",l1,0.000976,0.610426,,,
8,0.01,0.002491,0.0,LogisticRegression,"penalty=l2, C=0.01",l2,0.001118,0.594212,,,
9,0.01,0.002828,1.0,LogisticRegression,"penalty=l2, C=0.01",l2,0.000984,0.568168,,,


### Объектно-ориентированное программирование

In [4]:
df = pd.read_csv('../ml_data/train.csv')
target_col = 'Survived'

y = df['Survived']
X = df.select_dtypes(include=['number']).drop(['Survived'], axis=1)
X.fillna(-1, inplace=True)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
# Формируем отчет по фичам DF

dict_null = df.isnull().sum().to_dict()
dict_type = df.dtypes.to_dict()

dict_unique = dict()
for column in df.columns:
    dict_unique[column] = len(df[column].unique())


report_df = df.describe(include='all').T.rename_axis('features').reset_index()
report_df['feat_count_null'] = report_df['features'].map(dict_null)
report_df['feat_type'] = report_df['features'].map(dict_type)
report_df['unique'] = report_df['features'].map(dict_unique)



report_df.head(20)

In [None]:
1. Считал данные
2. Вывел информацию о датасете
3. Обучил модель
4. Выбрал лучшую и predict
5. Вывел shapvalues 

In [13]:
class Person:
    first_name = 'First'
    last_name = 'Last'

    @property
    def full_name_1(self):
        return ' '.join([self.first_name, self.last_name])

    def full_name_2(self):
        return ' '.join([self.first_name, self.last_name])
    


In [14]:
p = Person()

In [15]:
p.full_name_1

'First Last'

In [16]:
p.full_name_2()

'First Last'

In [17]:
p.full_name_2

<bound method Person.full_name_2 of <__main__.Person object at 0x7fbc9bd27b10>>

In [None]:
class Rectangle:
    def __init__(self, length, width):
        self.length = length
        self.width = width

    def area(self):
        return self.length* self.width

    def perimeter(self):
        return 2 *self.length + 2* self.width

# Here we declare that the Square class inherits from the Rectangle class
class Square(Rectangle):
    def __init__(self, length):
        super().__init__(length, length)

In [84]:
# def get_params_combination(param_grid):
#     iterator = product(*[v if isinstance(v, Iterable) else [v] for v in param_grid.values()])
#     return [dict(zip(param_grid.keys(), values)) for values in iterator]


In [21]:
class BaseAutoMlEstimator:
    """Base class for all estimators in scikit-learn.
    Notes
    -----
    All estimators should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).
    """
    
    def __init__(self, df, X_train, X_test, y_train, y_test, reports_path='/reports'):
        self.df = df
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.reports_path = reports_path

        
    @classmethod
    def get_params_combination(self, param_grid):
        iterator = product(*[v if isinstance(v, Iterable) else [v] for v in param_grid.values()])
        return [dict(zip(param_grid.keys(), values)) for values in iterator]
    
    
    def get_report_about_features(self,df, reports_path='/reports'):
        
        ### Формируем отчет по фичам DF
        dict_null = df.isnull().sum().to_dict()
        dict_type = df.dtypes.to_dict()

        dict_unique = dict()
        for column in df.columns:
            dict_unique[column] = len(df[column].unique())

        report_df = df.describe(include='all').T.rename_axis('features').reset_index()
        report_df['feat_count_null'] = report_df['features'].map(dict_null)
        report_df['feat_type'] = report_df['features'].map(dict_type)
        report_df['unique'] = report_df['features'].map(dict_unique)
        
        report_df.to_csv('..'+ reports_path + '/features_report.csv', index=False)   
        
        print("Отчет по статистикам по фичам сформирован")
        return report_df

    
    
    def optimize_types(self, df, inplace=False):
    
        np_types = [np.int8 ,np.int16 ,np.int32, np.int64,
                np.uint8 ,np.uint16, np.uint32, np.uint64]

        np_types = [np_type.__name__ for np_type in np_types]
        type_df = pd.DataFrame(data=np_types, columns=['class_type'])

        type_df['min_value'] = type_df['class_type'].apply(lambda row: np.iinfo(row).min)
        type_df['max_value'] = type_df['class_type'].apply(lambda row: np.iinfo(row).max)
        type_df['range'] = type_df['max_value'] - type_df['min_value']
        type_df.sort_values(by='range', inplace=True)


        for col in df.loc[:, df.dtypes <= np.integer]:
            col_min = df[col].min()
            col_max = df[col].max()
            temp = type_df[(type_df['min_value'] <= col_min) & (type_df['max_value'] >= col_max)]
            optimized_class = temp.loc[temp['range'].idxmin(), 'class_type']
            print("Col name : {} Col min_value : {} Col max_value : {} Optimized Class : {}".format(col, col_min, col_max, optimized_class))

            if inplace == 'True':
                df[col] = df[col].astype(optimized_class)

        #df.info()

        #return df
    
    

In [22]:
class AutoMlClassification(BaseAutoMlEstimator):    
    
    def __init__(self, df, X_train, X_test, y_train, y_test, reports_path='/reports'):
        super().__init__(df, X_train, X_test, y_train, y_test, reports_path='/reports')
        
    
    def get_report_about_target(self, df, reports_path='/reports'):
        
        ### Формируем отчет по таргету DF
        
        target_report = df.groupby(target_col).count().reset_index()

        dict_count_traget = df[target_col].value_counts().to_dict()
        dict_count_traget_norm = df[target_col].value_counts(normalize=True).to_dict()

        target_report['count'] = target_report[target_col].map(dict_count_traget)
        target_report['count_norm'] = target_report[target_col].map(dict_count_traget_norm)
        
        target_report.to_csv('..'+ reports_path + '/target_report.csv', index=False)    
        
        print("Отчет по статистикам по таргету сформирован")
        return target_report
    
    
    
    def fit_report(self, X_train, X_test, y_train, y_test, reports_path='/reports'):
        
        
        classifiers = [LogisticRegression,
               KNeighborsClassifier,
               #GradientBoostingClassifier(), 
               RandomForestClassifier] 
#               SVC()] # 

        classifiers_name = ['LogisticRegression',

                            'KNeighborsClassifier',
                            #'GradientBoostingClassifier', 
                            'RandomForestClassifier'] 
        #                    'SVC']
        
        
        # Настройка параметров выбранных алгоритмов с помощью GridSearchCV 
        n_folds = 5
        scores = []
        fits = []
        logistic_params = {'penalty': ('l1', 'l2'),
                           'C': (.01,.1,1,5)}

        knn_params = {'n_neighbors': list(range(3, 6, 2))}


        gbm_params = {'n_estimators': [100, 300, 500],
                      'learning_rate':(0.1, 0.5, 1),
                      'max_depth': list(range(3, 6)), 
                      'min_samples_leaf': list(range(10, 31, 10))}



        forest_params = {'n_estimators': [10, 30, 50],
                         'criterion': ('gini', 'entropy')}

        #svm_param = {'kernel' : ('linear', 'rbf'), 'C': (.5, 1, 2)} - очень долго считал
        #params = [logistic_params, knn_params, gbm_params, forest_params]

        params = [logistic_params, knn_params ,forest_params]        
        
        
        np.random.seed(0)

        df1 = pd.DataFrame()

        skf = StratifiedKFold(n_splits=2, random_state=0)

        for i , each_classifier in enumerate(classifiers):
            clf = each_classifier
            clf_params = params[i]
            clf_classifiers_name = classifiers_name[i]
            print("classifiers_name", clf_classifiers_name)

            for tmp_params in self.get_params_combination(clf_params):
                print("Параметры: ", tmp_params)
                skf_index = skf.split(X_train, y_train)
                for fold, (train_idx, test_idx) in enumerate(skf_index):
                    print("Размер тренировочного / тестового датасета: ", len(train_idx), len(test_idx))

                    # Формируем тренеровочный и валидационный датасет
                    X_train_tmp, X_test_tmp = X_train.iloc[train_idx], X_train.iloc[test_idx]
                    y_train_tmp, y_test_tmp = y_train.iloc[train_idx], y_train.iloc[test_idx]

                    # Получаем модель
                    tmp_clf = clf(**tmp_params)

                    # Замеряем время fit
                    start_time = time.time()
                    pred = tmp_clf.fit(X_train_tmp, y_train_tmp)
                    fit_time = time.time() - start_time


                    # Замеряем время predict
                    start_time = time.time()
                    pred = tmp_clf.predict(X_test_tmp)
                    predict_time = time.time() - start_time

                    tmp_params_string = ", ".join(("{}={}".format(*i) for i in tmp_params.items()))

                    data = {'model_name' : clf_classifiers_name, 
                            'fold' : fold,
                            'params' : tmp_params_string,
                            'fit_time' : fit_time, 
                            'predict_time' : predict_time,
                            'roc_auc':roc_auc_score(y_test_tmp, pred)

                    }

                    # Расширяем другими параметрами
                    data.update(tmp_params)

                    # Формируем финальный датафрейм
                    df1 = df1.append(data, ignore_index=True)

                    
        df1.to_csv('..'+ reports_path + '/model_report.csv', index=False)  
        print("Отчет по модели сформирован")


    
    
    
    

In [23]:
#reports_path='/reports'

au1 = AutoMlClassification(df, X_train, X_test, y_train, y_test)

au1.get_report_about_target(df)

Отчет по статистикам по таргету сформирован


Unnamed: 0,Survived,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,count,count_norm
0,0,549,549,549,549,424,549,549,549,549,68,549,549,0.616162
1,1,342,342,342,342,290,342,342,342,342,136,340,342,0.383838


In [24]:
au1.get_report_about_features(df)

Отчет по статистикам по фичам сформирован


Unnamed: 0,features,count,unique,top,freq,mean,std,min,25%,50%,75%,max,feat_count_null,feat_type
0,PassengerId,891,891,,,446.0,257.354,1.0,223.5,446.0,668.5,891.0,0,int64
1,Survived,891,2,,,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0,0,int64
2,Pclass,891,3,,,2.30864,0.836071,1.0,2.0,3.0,3.0,3.0,0,int64
3,Name,891,891,"Kantor, Mrs. Sinai (Miriam Sternin)",1.0,,,,,,,,0,object
4,Sex,891,2,male,577.0,,,,,,,,0,object
5,Age,714,89,,,29.6991,14.5265,0.42,20.125,28.0,38.0,80.0,177,float64
6,SibSp,891,7,,,0.523008,1.10274,0.0,0.0,0.0,1.0,8.0,0,int64
7,Parch,891,7,,,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0,0,int64
8,Ticket,891,681,CA. 2343,7.0,,,,,,,,0,object
9,Fare,891,248,,,32.2042,49.6934,0.0,7.9104,14.4542,31.0,512.329,0,float64


In [25]:
au1.optimize_types(df)

Col name : PassengerId Col min_value : 1 Col max_value : 891 Optimized Class : int16
Col name : Survived Col min_value : 0 Col max_value : 1 Optimized Class : int8
Col name : Pclass Col min_value : 1 Col max_value : 3 Optimized Class : int8
Col name : SibSp Col min_value : 0 Col max_value : 8 Optimized Class : int8
Col name : Parch Col min_value : 0 Col max_value : 6 Optimized Class : int8


In [27]:
%%time
au1.fit_report(X_train, X_test, y_train, y_test)

classifiers_name LogisticRegression
Параметры:  {'penalty': 'l1', 'C': 0.01}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l1', 'C': 0.1}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l1', 'C': 1}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l1', 'C': 5}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l2', 'C': 0.01}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l2', 'C': 0.1}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l2', 'C': 1}
Размер тренировочного / тестового датасета:  311 312

In [None]:
Wall time: 1.42 s

In [29]:
import category_encoders as ce

ModuleNotFoundError: No module named 'category_encoders'

In [11]:
reports_path = '/reports'

def AutoMLfunction(df, X_train, X_test, y_train, y_test, reports_path='/reports'):
    
    print("dscdscddsc")
    def get_report_about_target(df):
        
        ### Формируем отчет по таргету DF
        
        target_report = df.groupby(target_col).count().reset_index()

        dict_count_traget = df[target_col].value_counts().to_dict()
        dict_count_traget_norm = df[target_col].value_counts(normalize=True).to_dict()

        target_report['count'] = target_report[target_col].map(dict_count_traget)
        target_report['count_norm'] = target_report[target_col].map(dict_count_traget_norm)
        
        target_report.to_csv('..'+ reports_path + '/target_report.csv', index=False)    
        
        print("Отчет по статистикам по таргету сформирован")
        return target_report
    
    
    def get_report_about_features(df):
        
        ### Формируем отчет по фичам DF
        dict_null = df.isnull().sum().to_dict()
        dict_type = df.dtypes.to_dict()

        dict_unique = dict()
        for column in df.columns:
            dict_unique[column] = len(df[column].unique())

        report_df = df.describe(include='all').T.rename_axis('features').reset_index()
        report_df['feat_count_null'] = report_df['features'].map(dict_null)
        report_df['feat_type'] = report_df['features'].map(dict_type)
        report_df['unique'] = report_df['features'].map(dict_unique)
        
        report_df.to_csv('..'+ reports_path + '/features_report.csv', index=False)   
        
        print("Отчет по статистикам по фичам сформирован")
        return report_df
    
    
    def get_params_combination(param_grid):
        iterator = product(*[v if isinstance(v, Iterable) else [v] for v in param_grid.values()])
        return [dict(zip(param_grid.keys(), values)) for values in iterator]
    


    def ml_learning(X_train, X_test, y_train, y_test):
        
        
        classifiers = [LogisticRegression,
               KNeighborsClassifier,
               #GradientBoostingClassifier(), 
               RandomForestClassifier] 
#               SVC()] # 

        classifiers_name = ['LogisticRegression',

                            'KNeighborsClassifier',
                            #'GradientBoostingClassifier', 
                            'RandomForestClassifier'] 
        #                    'SVC']
        
        
        # Настройка параметров выбранных алгоритмов с помощью GridSearchCV 
        n_folds = 5
        scores = []
        fits = []
        logistic_params = {'penalty': ('l1', 'l2'),
                           'C': (.01,.1,1,5)}

        knn_params = {'n_neighbors': list(range(3, 6, 2))}


        gbm_params = {'n_estimators': [100, 300, 500],
                      'learning_rate':(0.1, 0.5, 1),
                      'max_depth': list(range(3, 6)), 
                      'min_samples_leaf': list(range(10, 31, 10))}



        forest_params = {'n_estimators': [10, 30, 50],
                         'criterion': ('gini', 'entropy')}

        #svm_param = {'kernel' : ('linear', 'rbf'), 'C': (.5, 1, 2)} - очень долго считал
        #params = [logistic_params, knn_params, gbm_params, forest_params]

        params = [logistic_params, knn_params ,forest_params]        
        
        
        np.random.seed(0)

        df1 = pd.DataFrame()

        skf = StratifiedKFold(n_splits=2, random_state=0)

        for i , each_classifier in enumerate(classifiers):
            clf = each_classifier
            clf_params = params[i]
            clf_classifiers_name = classifiers_name[i]
            print("classifiers_name", clf_classifiers_name)

            for tmp_params in get_params_combination(clf_params):
                print("Параметры: ", tmp_params)
                skf_index = skf.split(X_train, y_train)
                for fold, (train_idx, test_idx) in enumerate(skf_index):
                    print("Размер тренировочного / тестового датасета: ", len(train_idx), len(test_idx))

                    # Формируем тренеровочный и валидационный датасет
                    X_train_tmp, X_test_tmp = X_train.iloc[train_idx], X_train.iloc[test_idx]
                    y_train_tmp, y_test_tmp = y_train.iloc[train_idx], y_train.iloc[test_idx]

                    # Получаем модель
                    tmp_clf = clf(**tmp_params)

                    # Замеряем время fit
                    start_time = time.time()
                    pred = tmp_clf.fit(X_train_tmp, y_train_tmp)
                    fit_time = time.time() - start_time


                    # Замеряем время predict
                    start_time = time.time()
                    pred = tmp_clf.predict(X_test_tmp)
                    predict_time = time.time() - start_time

                    tmp_params_string = ", ".join(("{}={}".format(*i) for i in tmp_params.items()))

                    data = {'model_name' : clf_classifiers_name, 
                            'fold' : fold,
                            'params' : tmp_params_string,
                            'fit_time' : fit_time, 
                            'predict_time' : predict_time,
                            'roc_auc':roc_auc_score(y_test_tmp, pred)

                    }

                    # Расширяем другими параметрами
                    data.update(tmp_params)

                    # Формируем финальный датафрейм
                    df1 = df1.append(data, ignore_index=True)

                    
        df1.to_csv('..'+ reports_path + '/model_report.csv', index=False)  
        print("Отчет по модели сформирован")
        
        #df1.head()

    
    
    target_report = get_report_about_target(df)
#    pd.read_csv('../reports/target_report.csv').head()
    
    report_df = get_report_about_features(df)
#    pd.read_csv('../reports/features_report.csv').head()

    report_model =  ml_learning(X_train, y_train, y_train, y_test)

    
    

    
AutoMLfunction(df, X_train, X_test, y_train, y_test)        
        
    
    

dscdscddsc
Отчет по статистикам по таргету сформирован
Отчет по статистикам по фичам сформирован
classifiers_name LogisticRegression
Параметры:  {'penalty': 'l1', 'C': 0.01}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l1', 'C': 0.1}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l1', 'C': 1}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l1', 'C': 5}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l2', 'C': 0.01}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l2', 'C': 0.1}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  3

/bin/sh: cd..: command not found


In [55]:
### Формируем отчет по таргету DF
target_report = df.groupby(target_col).count().reset_index()

dict_count_traget = df[target_col].value_counts().to_dict()
dict_count_traget_norm = df[target_col].value_counts(normalize=True).to_dict()

target_report['count'] = target_report[target_col].map(dict_count_traget)
target_report['count_norm'] = target_report[target_col].map(dict_count_traget_norm)

target_report.to_csv('../reports/target_report.csv', index=False)


In [56]:
pd.read_csv("../reports/target_report.csv").head()

Unnamed: 0,Survived,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,count,count_norm
0,0,549,549,549,549,424,549,549,549,549,68,549,549,0.616162
1,1,342,342,342,342,290,342,342,342,342,136,340,342,0.383838


In [40]:
class AutoMl:
    def __init__(self, data, target_col):
        self.data = data
        self.target_col = target_col
    
    def print_df(self):
        print("Информация о DF: ")
        print(self.data.head())
        

In [41]:
au1 = AutoMl(df, 'Survived')


In [42]:
au1.print_df()

Информация о DF: 
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500  

In [None]:
class Distribution:

	def __init__(self, mu=0, sigma=1):
	
		""" Generic distribution class for calculating and 
		visualizing a probability distribution.
	
		Attributes:
			mean (float) representing the mean value of the distribution
			stdev (float) representing the standard deviation of the distribution
			data_list (list of floats) a list of floats extracted from the data file
			"""
		
		self.mean = mu
		self.stdev = sigma
		self.data = []


	def read_data_file(self, file_name):
	
		"""Function to read in data from a txt file. The txt file should have
		one number (float) per line. The numbers are stored in the data attribute.
				
		Args:
			file_name (string): name of a file to read from
		
		Returns:
			None
		
		"""
			
		with open(file_name) as file:
			data_list = []
			line = file.readline()
			while line:
				data_list.append(int(line))
				line = file.readline()
		file.close()
	
		self.data = data_list



# Черновик

In [28]:
df = pd.read_csv('../ml_data/train.csv')

target_col = 'Survived'

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
df

In [6]:
np_types = [np.int8 ,np.int16 ,np.int32, np.int64,
            np.uint8 ,np.uint16, np.uint32, np.uint64]
np_types = [np_type.__name__ for np_type in np_types]
type_df = pd.DataFrame(data=np_types, columns=['class_type'])
type_df

Unnamed: 0,class_type
0,int8
1,int16
2,int32
3,int64
4,uint8
5,uint16
6,uint32
7,uint64


In [15]:
type_df['min_value'] = type_df['class_type'].apply(lambda row: np.iinfo(row).min)
type_df['max_value'] = type_df['class_type'].apply(lambda row: np.iinfo(row).max)
type_df['range'] = type_df['max_value'] - type_df['min_value']
type_df.sort_values(by='range', inplace=True)
type_df

Unnamed: 0,class_type,min_value,max_value,range
0,int8,-128,127,255.0
4,uint8,0,255,255.0
1,int16,-32768,32767,65535.0
5,uint16,0,65535,65535.0
2,int32,-2147483648,2147483647,4294967000.0
6,uint32,0,4294967295,4294967000.0
3,int64,-9223372036854775808,9223372036854775807,1.844674e+19
7,uint64,0,18446744073709551615,1.844674e+19


In [27]:
def optimize_types(dataframe, inplace=False):
    
    np_types = [np.int8 ,np.int16 ,np.int32, np.int64,
            np.uint8 ,np.uint16, np.uint32, np.uint64]
    
    np_types = [np_type.__name__ for np_type in np_types]
    type_df = pd.DataFrame(data=np_types, columns=['class_type'])
    
    type_df['min_value'] = type_df['class_type'].apply(lambda row: np.iinfo(row).min)
    type_df['max_value'] = type_df['class_type'].apply(lambda row: np.iinfo(row).max)
    type_df['range'] = type_df['max_value'] - type_df['min_value']
    type_df.sort_values(by='range', inplace=True)


    for col in dataframe.loc[:, dataframe.dtypes <= np.integer]:
        col_min = dataframe[col].min()
        col_max = dataframe[col].max()
        temp = type_df[(type_df['min_value'] <= col_min) & (type_df['max_value'] >= col_max)]
        optimized_class = temp.loc[temp['range'].idxmin(), 'class_type']
        print("Col name : {} Col min_value : {} Col max_value : {} Optimized Class : {}".format(col, col_min, col_max, optimized_class))
        
        if inplace == 'True':
            dataframe[col] = dataframe[col].astype(optimized_class)
    
    #dataframe.info()
    
    return dataframe

#optimize_types(type_df)

In [None]:
def optimize_types(dataframe, inplace=False):
    
    np_types = [np.int8 ,np.int16 ,np.int32, np.int64,
            np.uint8 ,np.uint16, np.uint32, np.uint64]
    
    np_types = [np_type.__name__ for np_type in np_types]
    type_df = pd.DataFrame(data=np_types, columns=['class_type'])
    
    type_df['min_value'] = type_df['class_type'].apply(lambda row: np.iinfo(row).min)
    type_df['max_value'] = type_df['class_type'].apply(lambda row: np.iinfo(row).max)
    type_df['range'] = type_df['max_value'] - type_df['min_value']
    type_df.sort_values(by='range', inplace=True)


    for col in dataframe.loc[:, dataframe.dtypes <= np.integer]:
        col_min = dataframe[col].min()
        col_max = dataframe[col].max()
        temp = type_df[(type_df['min_value'] <= col_min) & (type_df['max_value'] >= col_max)]
        optimized_class = temp.loc[temp['range'].idxmin(), 'class_type']
        print("Col name : {} Col min_value : {} Col max_value : {} Optimized Class : {}".format(col, col_min, col_max, optimized_class))
        
        if inplace == 'True':
            dataframe[col] = dataframe[col].astype(optimized_class)
    
    #dataframe.info()
    
    return dataframe

#optimize_types(type_df)

In [21]:
df = pd.read_csv('../ml_data/train.csv')

target_col = 'Survived'

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [29]:
optimize_types(df)

Col name : PassengerId Col min_value : 1 Col max_value : 891 Optimized Class : int16
Col name : Survived Col min_value : 0 Col max_value : 1 Optimized Class : int8
Col name : Pclass Col min_value : 1 Col max_value : 3 Optimized Class : int8
Col name : SibSp Col min_value : 0 Col max_value : 8 Optimized Class : int8
Col name : Parch Col min_value : 0 Col max_value : 6 Optimized Class : int8


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int16
Survived       891 non-null int8
Pclass         891 non-null int8
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int8
Parch          891 non-null int8
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int16(1), int8(4), object(5)
memory usage: 54.1+ KB


In [None]:
https://stackoverflow.com/questions/57856010/automatically-optimizing-pandas-dtypes 

In [6]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(X, y)



2

In [17]:
print(skf)

StratifiedKFold(n_splits=2, random_state=None, shuffle=False)


In [12]:
StratifiedKFold(n_splits=2, random_state=None, shuffle=False)

for k, (train_index, test_index) in skf.split(X, y):
    print(k)
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]


[1 3]
TRAIN: 0 TEST: 2
[0 2]
TRAIN: 1 TEST: 3


In [None]:
https://metanit.com/python/tutorial/7.2.php 

In [14]:

df = pd.read_csv('../ml_data/train.csv')

y = df['Survived']
X = df.select_dtypes(include=['number']).drop(['Survived'], axis=1)
X.fillna(-1, inplace=True)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [16]:
# Или другой способ проверить, если пропуски в данных
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [18]:
df.fillna(-1, inplace=True)

In [19]:
# Или другой способ проверить, если пропуски в данных
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          891 non-null object
Embarked       891 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Что творится....

In [126]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, KFold
from sklearn.impute import SimpleImputer
#from sklearn.compose import ColumnTransforme
#from sklearn.compose import make_column_selector


import numpy as np 
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 

In [355]:

df = pd.read_csv('../ml_data/train.csv')

# y = df['Survived']
# X = df.select_dtypes(include=['number']).drop(['Survived'], axis=1)
# X.fillna(-1, inplace=True)

y = df['Survived']
X = df.drop(['Survived'], axis=1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
PassengerId    623 non-null int64
Pclass         623 non-null int64
Name           623 non-null object
Sex            623 non-null object
Age            499 non-null float64
SibSp          623 non-null int64
Parch          623 non-null int64
Ticket         623 non-null object
Fare           623 non-null float64
Cabin          139 non-null object
Embarked       622 non-null object

### Определение классов

In [278]:
# class ColumnSelector():
#     def __init__(self,columns = None):
#         self.columns = columns # array of column names to encode
        
#     def fit(self,X,y=None):
#         return self
    
#     def transform(self, X):                                                           
#         return X[self.columns]
    
#     def fit_transform(self, X,y=None,  **fit_params):
#         self.fit(X,y,  **fit_params)
#         return self.transform(X)

In [280]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return pd.DataFrame(X[self.columns])

In [None]:
class ModifiedSimpleImputer(SimpleImputer):
    
#     def fit_transform(self, X,y=None,  **fit_params):
#         self.fit(X,y,  **fit_params)
#         return self.transform(X)

    
    def transform(self, X):
        return pd.DataFrame(super().transform(X))

In [274]:
class ModifiedFeatureUnion(FeatureUnion):
    
#     def fit_transform(self, X,y=None,  **fit_params):
#         self.fit(X,y,  **fit_params)
#         return self.transform(X)
    def merge_dataframes_by_column(self, X):
        return pd.concat(X, axis="columns", copy=False)
    
    def transform(self, X):
        #X = self.merge_dataframes_by_column(X)
        return pd.DataFrame(super().transform(X))

In [None]:
class MyLEncoder():
    
    def transform(self, X, **fit_params):
        enc = LabelEncoder()
        enc_data = []
        for i in list(X.columns):
            X[i] = X[i].astype(str)
            encc = enc.fit(X[i])
            enc_data.append(encc.transform(X[i]))
        return np.asarray(enc_data).T
    
    def fit_transform(self, X,y=None,  **fit_params):
        self.fit(X,y,  **fit_params)
        return self.transform(X)
    def fit(self, X, y, **fit_params):
        return self 

In [283]:
# good

preprocessor3 = ModifiedFeatureUnion(
        transformer_list=[
             # categorical features
            ('categorical', Pipeline([
                 ('selector', FeatureSelector(columns = cat_features)),
                 ('imputer',  ModifiedSimpleImputer(strategy='constant', fill_value='missing'))
            ])),
            # numeric features
            ('numeric', Pipeline([
                 ('selector', FeatureSelector(columns = digits_features)),
                 ('imputer',  ModifiedSimpleImputer(strategy='constant', fill_value=-1)),
                 ('scaler', MinMaxScaler())
            ])),
        ])

preprocessor3.fit(X_train)
X_mod4 = preprocessor3.transform(X_train)

X_mod4

#pd.DataFrame(X_mod4)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,"Dodge, Master. Washington",male,33638,A34,S,0.5,0,0.0617284,0,0.333333,0.159777
1,"Mitkoff, Mr. Mito",male,349221,missing,S,0.730337,1,0,0,0,0.0154116
2,"Johnson, Miss. Eleanor Ileen",female,347742,missing,S,0.193258,1,0.0246914,0.125,0.166667,0.0217308
3,"West, Mr. Edwy Arthur",male,C.A. 34651,missing,S,0.505618,0.5,0.45679,0.125,0.333333,0.0541644
4,"Hart, Mr. Benjamin",male,F.C.C. 13529,missing,S,0.352809,0.5,0.54321,0.125,0.166667,0.0512366
...,...,...,...,...,...,...,...,...,...,...,...
618,"Salkjelsvik, Miss. Anna Kristine",female,343120,missing,S,0.119101,1,0.271605,0,0,0.0149318
619,"Cairns, Mr. Alexander",male,113798,missing,S,0.303371,0,0,0,0,0.060508
620,"Hansen, Mr. Claus Peter",male,350026,missing,S,0.966292,1,0.518519,0.25,0,0.0275376
621,"Carter, Miss. Lucile Polk",female,113760,B96 B98,S,0.488764,0,0.185185,0.125,0.333333,0.234224


In [273]:
X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
445,446,1,"Dodge, Master. Washington",male,4.0,0,2,33638,81.8583,A34,S
650,651,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S
172,173,3,"Johnson, Miss. Eleanor Ileen",female,1.0,1,1,347742,11.1333,,S
450,451,2,"West, Mr. Edwy Arthur",male,36.0,1,2,C.A. 34651,27.7500,,S
314,315,2,"Hart, Mr. Benjamin",male,43.0,1,1,F.C.C. 13529,26.2500,,S
...,...,...,...,...,...,...,...,...,...,...,...
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S
270,271,1,"Cairns, Mr. Alexander",male,,0,0,113798,31.0000,,S
860,861,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S
435,436,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0000,B96 B98,S


In [292]:
imputer_numeric = [ModifiedSimpleImputer]
imputer_numeric_name = ['ModifiedSimpleImputer'] 

simpleimputer_params_numeric = {'strategy': ('constant'),
                               'fill_value': [0, -1]}

params_numeric = [simpleimputer_params_numeric]        






imputer_cat = [ModifiedSimpleImputer]
imputer_cat_name = ['ModifiedSimpleImputer'] 

simpleimputer_params_cat = {'strategy': ('constant'),
                          'fill_value': 'missing'}
params_cat = [simpleimputer_params_cat]        



In [341]:
X_train[cat_features].fillna(-1, inplace=True)

In [340]:
X_train[cat_features]

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
445,"Dodge, Master. Washington",male,33638,A34,S
650,"Mitkoff, Mr. Mito",male,349221,,S
172,"Johnson, Miss. Eleanor Ileen",female,347742,,S
450,"West, Mr. Edwy Arthur",male,C.A. 34651,,S
314,"Hart, Mr. Benjamin",male,F.C.C. 13529,,S
...,...,...,...,...,...
106,"Salkjelsvik, Miss. Anna Kristine",female,343120,,S
270,"Cairns, Mr. Alexander",male,113798,,S
860,"Hansen, Mr. Claus Peter",male,350026,,S
435,"Carter, Miss. Lucile Polk",female,113760,B96 B98,S


In [343]:
# good
import category_encoders as ce
#ce.TargetEncoder(cols=[...])
#ce.TargetEncoder(cols=cat_features)

X_train[cat_features].fillna('missing', inplace=True)

pipeline5 = Pipeline([
    # Use FeatureUnion to combine the features
    ('union', ModifiedFeatureUnion(
        transformer_list=[
             # categorical features
            ('categorical', Pipeline([
                 ('selector', ColumnSelector(columns = cat_features)),
                 ('imputer', ModifiedSimpleImputer(strategy='constant', fill_value='missing')),
                 ('label_encoding', MyLEncoder())
            ])),
            # numeric features
            ('numeric', Pipeline([
                 ('selector', ColumnSelector(columns = digits_features)),
                 ('imputer', ModifiedSimpleImputer(strategy='constant', fill_value=-1)),
                 ('scaler', MinMaxScaler())
            ])),
        ])),
    # Use model fit
    ('model_fitting', RandomForestClassifier()),
])

pipeline5.fit(X_train, y_train)
pred = pipeline5.predict_proba(X_test)

In [344]:
pred 

array([[0.2, 0.8],
       [0.6, 0.4],
       [0.3, 0.7],
       [0. , 1. ],
       [0.3, 0.7],
       [0.1, 0.9],
       [0. , 1. ],
       [0.4, 0.6],
       [0.2, 0.8],
       [0. , 1. ],
       [0.3, 0.7],
       [0.8, 0.2],
       [0.3, 0.7],
       [0.2, 0.8],
       [0.7, 0.3],
       [0. , 1. ],
       [0.5, 0.5],
       [0.1, 0.9],
       [0.8, 0.2],
       [0.7, 0.3],
       [0.6, 0.4],
       [0.8, 0.2],
       [0.4, 0.6],
       [0.5, 0.5],
       [0.3, 0.7],
       [0.4, 0.6],
       [0.7, 0.3],
       [0.6, 0.4],
       [0.4, 0.6],
       [0.4, 0.6],
       [0.6, 0.4],
       [0.2, 0.8],
       [0.7, 0.3],
       [0.4, 0.6],
       [0.2, 0.8],
       [0.3, 0.7],
       [0.6, 0.4],
       [0. , 1. ],
       [0.2, 0.8],
       [0.5, 0.5],
       [0.4, 0.6],
       [0.1, 0.9],
       [0.3, 0.7],
       [0.8, 0.2],
       [0.3, 0.7],
       [0.2, 0.8],
       [0.3, 0.7],
       [0.5, 0.5],
       [0.2, 0.8],
       [0.4, 0.6],
       [0.1, 0.9],
       [0. , 1. ],
       [0.7,

In [287]:
def get_params_combination(param_grid):
    iterator = product(*[v if isinstance(v, Iterable) else [v] for v in param_grid.values()])
    return [dict(zip(param_grid.keys(), values)) for values in iterator]

In [289]:

df = pd.read_csv('../ml_data/train.csv')

y = df['Survived']
X = df.select_dtypes(include=['number']).drop(['Survived'], axis=1)
X.fillna(-1, inplace=True)

# y = df['Survived']
# X = df.drop(['Survived'], axis=1)



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [346]:
scaler_params_numeric = {'feature_range': [(0,1) , (2,3)]}



get_params_combination(scaler_params_numeric)

[{'feature_range': (0, 1)}, {'feature_range': (2, 3)}]

In [307]:
forest_params = {'n_estimators': [10, 30],
                 'criterion': ('gini', 'entropy')}

get_params_combination(forest_params)

[{'n_estimators': 10, 'criterion': 'gini'},
 {'n_estimators': 10, 'criterion': 'entropy'},
 {'n_estimators': 30, 'criterion': 'gini'},
 {'n_estimators': 30, 'criterion': 'entropy'}]

In [None]:
scaler_list = [MinMaxScaler]
scaler_name = ['MinMaxScaler'] 

scaler_params_numeric = {'feature_range': [(0,1) , (2,3)]}
params_scaler_list = [simplescaler_params_numeric]        



In [391]:
%%time
# cool 

classifiers = [LogisticRegression,
       #KNeighborsClassifier,
       #GradientBoostingClassifier(), 
       RandomForestClassifier] 
#               SVC()] # 

classifiers_name = ['LogisticRegression',

                    #'KNeighborsClassifier',
                    #'GradientBoostingClassifier', 
                    'RandomForestClassifier'] 
#                    'SVC']


# Настройка параметров выбранных алгоритмов с помощью GridSearchCV 
n_folds = 5
scores = []
fits = []
logistic_params = {'penalty': ('l1', 'l2'),
                   'C': (.01,5)}

knn_params = {'n_neighbors': list(range(3, 6, 2))}


gbm_params = {'n_estimators': [100, 300, 500],
              'learning_rate':(0.1, 0.5, 1),
              'max_depth': list(range(3, 6)), 
              'min_samples_leaf': list(range(10, 31, 10))}



forest_params = {'n_estimators': [10, 30],
                 'criterion': ('gini', 'entropy')}

#svm_param = {'kernel' : ('linear', 'rbf'), 'C': (.5, 1, 2)} - очень долго считал
#params = [logistic_params, knn_params, gbm_params, forest_params]

params = [logistic_params, forest_params]        



############# Заполнение NaN #############
imputer_numeric_list = [ModifiedSimpleImputer]
imputer_numeric_name = ['ModifiedSimpleImputer'] 

simpleimputer_params_numeric = {'fill_value': [0, -1],
                                'strategy': ['constant']}
params_numeric_list = [simpleimputer_params_numeric]        





imputer_cat_list = [ModifiedSimpleImputer]
imputer_cat_name = ['ModifiedSimpleImputer'] 

simpleimputer_params_cat = {'strategy': ['constant'],
                           'fill_value': ['missing']}
params_cat_list = [simpleimputer_params_cat]        

############# Заполнение NaN #############



############# Scaler ################

scaler_list = [MinMaxScaler]
scaler_name_list = ['MinMaxScaler'] 

scaler_params = {'feature_range': [(0,1) , (2,3)]}
params_scaler_list = [scaler_params]        


#####################################


np.random.seed(0)




np.random.seed(0)

df1 = pd.DataFrame()

skf = StratifiedKFold(n_splits=3, random_state=0)


# for i , each_imputer_cat in enumerate(imputer_cat_list):
#     imputer_cat = each_imputer_cat
#     imputer_cat_name = imputer_cat_name[i]
#     imputer_cat_params = params_cat_list[i]
#     print("imputer_cat_name", imputer_cat_name)    
#     for tmp_imputer_cat_params in get_params_combination(imputer_cat_params):
#         print("Параметры: ", tmp_imputer_cat_params)


for i , each_imputer_numeric in enumerate(imputer_numeric_list):
    imputer_numeric = each_imputer_numeric
    imputer_numeric_name = imputer_numeric_name[i]
    imputer_numeric_params = params_numeric_list[i]
    print("imputer_numeric_name fill_na", imputer_numeric_name)    
    for current_imputer_numeric_params in get_params_combination(imputer_numeric_params):
        print('\n', "Параметры: ", tmp_imputer_numeric_params)


        for i , each_scaler in enumerate(scaler_list):
            scaler = each_scaler
            scaler_name = scaler_name_list[i]
            scaler_params = params_scaler_list[i]
            print("scaler_name", scaler_name,)    
            for current_scaler_param in get_params_combination(scaler_params):
                print('\n', "Параметры scaler: ", current_scaler_param, '\n')

                for i , each_classifier in enumerate(classifiers):
                    clf = each_classifier
                    clf_params = params[i]
                    clf_classifiers_name = classifiers_name[i]
                    print("classifiers_name", clf_classifiers_name,)

                    for tmp_params in get_params_combination(clf_params):
                        print("Параметры classifiers: ", tmp_params)
                        skf_index = skf.split(X_train, y_train)
                        for fold, (train_idx, test_idx) in enumerate(skf_index):
                            print("Размер тренировочного / тестового датасета: ", len(train_idx), len(test_idx))

                            # Формируем тренеровочный и валидационный датасет
                            X_train_tmp, X_test_tmp = X_train.iloc[train_idx], X_train.iloc[test_idx]
                            y_train_tmp, y_test_tmp = y_train.iloc[train_idx], y_train.iloc[test_idx]

                            # Получаем модель
                            #tmp_clf = clf(**tmp_params)

                            tmp_clf = Pipeline([
                                    # Use FeatureUnion to combine the features
                                    ('union', ModifiedFeatureUnion(
                                        transformer_list=[
                                             # categorical features
                                            ('categorical', Pipeline([
                                                 ('selector', ColumnSelector(columns = cat_features)),
                                                 ('imputer', ModifiedSimpleImputer(strategy='constant', fill_value='missing')),
                                                 ('label_encoding', MyLEncoder())
                                            ])),
                                            # numeric features
                                            ('numeric', Pipeline([
                                                 ('selector', ColumnSelector(columns = digits_features)),
                                                 ('imputer', imputer_numeric(**current_imputer_numeric_params)),
                                                 ('scaler', scaler(**current_scaler_param))
                                            ])),
                                        ])),
                                    # Use model fit
                                    ('model_fitting', clf(**tmp_params)),
                                ])



                            # Замеряем время fit
                            start_time = time.time()
                            pred = tmp_clf.fit(X_train_tmp, y_train_tmp)
                            fit_time = time.time() - start_time


                            # Замеряем время predict
                            start_time = time.time()
                            pred = tmp_clf.predict(X_test_tmp)
                            predict_time = time.time() - start_time

                            clf_tmp_params_string = ", ".join(("{}={}".format(*i) for i in tmp_params.items()))
                            scale_tmp_params_string = ", ".join(("{}={}".format(*i) for i in current_scaler_param.items()))
                            imputer_numeric_params_string = ", ".join(("{}={}".format(*i) for i in current_imputer_numeric_params.items()))


                            data = {'classifier_name' : clf_classifiers_name,
                                    'classifier_params' : clf_tmp_params_string,
                                    'scaler_name': scaler_name,
                                    'scaler_params': scale_tmp_params_string,
                                    
                                    'imputer_name': imputer_numeric_name,
                                    'imputer_params': imputer_numeric_params_string,

                                    'fold' : fold,
                                    'fit_time' : fit_time, 
                                    'predict_time' : predict_time,
                                    'roc_auc':roc_auc_score(y_test_tmp, pred)

                            }

                            # Расширяем другими параметрами
                            data.update(tmp_params) # параметры классификатора
                            data.update(current_scaler_param) # параметры scale
                            data.update(current_imputer_numeric_params)

                            # Формируем финальный датафрейм
                            df1 = df1.append(data, ignore_index=True)
            
df1.head()


imputer_numeric_name fill_na ModifiedSimpleImputer

 Параметры:  {'fill_value': -1, 'strategy': 'constant'}
scaler_name MinMaxScaler

 Параметры scaler:  {'feature_range': (0, 1)} 

classifiers_name LogisticRegression
Параметры classifiers:  {'penalty': 'l1', 'C': 0.01}
Размер тренировочного / тестового датасета:  415 208
Размер тренировочного / тестового датасета:  415 208
Размер тренировочного / тестового датасета:  416 207
Параметры classifiers:  {'penalty': 'l1', 'C': 5}
Размер тренировочного / тестового датасета:  415 208
Размер тренировочного / тестового датасета:  415 208
Размер тренировочного / тестового датасета:  416 207
Параметры classifiers:  {'penalty': 'l2', 'C': 0.01}
Размер тренировочного / тестового датасета:  415 208
Размер тренировочного / тестового датасета:  415 208
Размер тренировочного / тестового датасета:  416 207
Параметры classifiers:  {'penalty': 'l2', 'C': 5}
Размер тренировочного / тестового датасета:  415 208
Размер тренировочного / тестового датасета:  4

Unnamed: 0,C,classifier_name,classifier_params,feature_range,fill_value,fit_time,fold,imputer_name,imputer_params,penalty,predict_time,roc_auc,scaler_name,scaler_params,strategy,criterion,n_estimators
0,0.01,LogisticRegression,"penalty=l1, C=0.01","(0, 1)",0.0,0.017318,0.0,ModifiedSimpleImputer,"fill_value=0, strategy=constant",l1,0.011186,0.501537,MinMaxScaler,"feature_range=(0, 1)",constant,,
1,0.01,LogisticRegression,"penalty=l1, C=0.01","(0, 1)",0.0,0.017118,1.0,ModifiedSimpleImputer,"fill_value=0, strategy=constant",l1,0.010658,0.5,MinMaxScaler,"feature_range=(0, 1)",constant,,
2,0.01,LogisticRegression,"penalty=l1, C=0.01","(0, 1)",0.0,0.016545,2.0,ModifiedSimpleImputer,"fill_value=0, strategy=constant",l1,0.011402,0.507942,MinMaxScaler,"feature_range=(0, 1)",constant,,
3,5.0,LogisticRegression,"penalty=l1, C=5","(0, 1)",0.0,0.019154,0.0,ModifiedSimpleImputer,"fill_value=0, strategy=constant",l1,0.010974,0.788441,MinMaxScaler,"feature_range=(0, 1)",constant,,
4,5.0,LogisticRegression,"penalty=l1, C=5","(0, 1)",0.0,0.019202,1.0,ModifiedSimpleImputer,"fill_value=0, strategy=constant",l1,0.0112,0.743779,MinMaxScaler,"feature_range=(0, 1)",constant,,


In [390]:
df1['roc_auc'].unique()

array([0.5       , 0.5028394 , 0.8083216 , 0.75570098, 0.51724138,
       0.54962289, 0.80401126, 0.75158339, 0.74434339, 0.6455841 ,
       0.71315439, 0.75818086, 0.7635315 , 0.74480999, 0.76022626,
       0.80163617, 0.75314996, 0.59852217, 0.56579414, 0.80418719,
       0.70971147, 0.75587844, 0.6317734 , 0.74312334, 0.72677692,
       0.75002218, 0.74859254, 0.75692103, 0.75135315, 0.74041168,
       0.747937  , 0.71727657, 0.74747116, 0.71305419, 0.74387755,
       0.76231527, 0.76428571, 0.76530612, 0.7663709 , 0.69018297,
       0.74358917, 0.74938424, 0.74822538, 0.7690007 ])

In [392]:
df1.head(20)

Unnamed: 0,C,classifier_name,classifier_params,feature_range,fill_value,fit_time,fold,imputer_name,imputer_params,penalty,predict_time,roc_auc,scaler_name,scaler_params,strategy,criterion,n_estimators
0,0.01,LogisticRegression,"penalty=l1, C=0.01","(0, 1)",0.0,0.017318,0.0,ModifiedSimpleImputer,"fill_value=0, strategy=constant",l1,0.011186,0.501537,MinMaxScaler,"feature_range=(0, 1)",constant,,
1,0.01,LogisticRegression,"penalty=l1, C=0.01","(0, 1)",0.0,0.017118,1.0,ModifiedSimpleImputer,"fill_value=0, strategy=constant",l1,0.010658,0.5,MinMaxScaler,"feature_range=(0, 1)",constant,,
2,0.01,LogisticRegression,"penalty=l1, C=0.01","(0, 1)",0.0,0.016545,2.0,ModifiedSimpleImputer,"fill_value=0, strategy=constant",l1,0.011402,0.507942,MinMaxScaler,"feature_range=(0, 1)",constant,,
3,5.0,LogisticRegression,"penalty=l1, C=5","(0, 1)",0.0,0.019154,0.0,ModifiedSimpleImputer,"fill_value=0, strategy=constant",l1,0.010974,0.788441,MinMaxScaler,"feature_range=(0, 1)",constant,,
4,5.0,LogisticRegression,"penalty=l1, C=5","(0, 1)",0.0,0.019202,1.0,ModifiedSimpleImputer,"fill_value=0, strategy=constant",l1,0.0112,0.743779,MinMaxScaler,"feature_range=(0, 1)",constant,,
5,5.0,LogisticRegression,"penalty=l1, C=5","(0, 1)",0.0,0.019096,2.0,ModifiedSimpleImputer,"fill_value=0, strategy=constant",l1,0.010259,0.776124,MinMaxScaler,"feature_range=(0, 1)",constant,,
6,0.01,LogisticRegression,"penalty=l2, C=0.01","(0, 1)",0.0,0.016302,0.0,ModifiedSimpleImputer,"fill_value=0, strategy=constant",l2,0.010101,0.538961,MinMaxScaler,"feature_range=(0, 1)",constant,,
7,0.01,LogisticRegression,"penalty=l2, C=0.01","(0, 1)",0.0,0.016115,1.0,ModifiedSimpleImputer,"fill_value=0, strategy=constant",l2,0.010059,0.519481,MinMaxScaler,"feature_range=(0, 1)",constant,,
8,0.01,LogisticRegression,"penalty=l2, C=0.01","(0, 1)",0.0,0.016003,2.0,ModifiedSimpleImputer,"fill_value=0, strategy=constant",l2,0.010725,0.532468,MinMaxScaler,"feature_range=(0, 1)",constant,,
9,5.0,LogisticRegression,"penalty=l2, C=5","(0, 1)",0.0,0.01702,0.0,ModifiedSimpleImputer,"fill_value=0, strategy=constant",l2,0.010534,0.792257,MinMaxScaler,"feature_range=(0, 1)",constant,,


In [362]:
scaler_params =  {'feature_range': (0, 1)}
    
minmax = MinMaxScaler(**scaler_params)

minmax.fit(X_train[digits_features])

MinMaxScaler(copy=True, feature_range=(0, 1))

{'n_estimators': 30, 'criterion': 'entropy'}

In [None]:
# good
tmp_clf = Pipeline([
        # Use FeatureUnion to combine the features
        ('union', ModifiedFeatureUnion(
            transformer_list=[
                 # categorical features
                ('categorical', Pipeline([
                     ('selector', ColumnSelector(columns = cat_features)),
                     ('imputer', ModifiedSimpleImputer(strategy='constant', fill_value='missing')),
                     ('label_encoding', MyLEncoder())
                ])),
                # numeric features
                ('numeric', Pipeline([
                     ('selector', ColumnSelector(columns = digits_features)),
                     ('imputer', ModifiedSimpleImputer(strategy='constant', fill_value=-1)),
                     ('scaler', MinMaxScaler())
                ])),
            ])),
        # Use model fit
        ('model_fitting', clf(**tmp_params)),
    ])



In [378]:
df = pd.read_csv('../ml_data/train.csv')

#y = df['Survived']
#X = df.select_dtypes(include=['number']).drop(['Survived'], axis=1)
#X.fillna(-1, inplace=True)

y = df['Survived']
X = df.drop(['Survived'], axis=1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [375]:



np.random.seed(0)

df1 = pd.DataFrame()

skf = StratifiedKFold(n_splits=2, random_state=0)

for i , each_classifier in enumerate(classifiers):
    clf = each_classifier
    clf_params = params[i]
    clf_classifiers_name = classifiers_name[i]
    print("classifiers_name", clf_classifiers_name)
    
    for tmp_params in get_params_combination(clf_params):
        print("Параметры: ", tmp_params)
        skf_index = skf.split(X_train, y_train)
        for fold, (train_idx, test_idx) in enumerate(skf_index):
            print("Размер тренировочного / тестового датасета: ", len(train_idx), len(test_idx))
            
            # Формируем тренеровочный и валидационный датасет
            X_train_tmp, X_test_tmp = X_train.iloc[train_idx], X_train.iloc[test_idx]
            y_train_tmp, y_test_tmp = y_train.iloc[train_idx], y_train.iloc[test_idx]
            
            # Получаем модель
            tmp_clf = clf(**tmp_params)

            
            # Замеряем время fit
            start_time = time.time()
            pred = tmp_clf.fit(X_train_tmp, y_train_tmp)
            fit_time = time.time() - start_time
            
            
            # Замеряем время predict
            start_time = time.time()
            pred = tmp_clf.predict(X_test_tmp)
            predict_time = time.time() - start_time
            
            tmp_params_string = ", ".join(("{}={}".format(*i) for i in tmp_params.items()))
            
            data = {'model_name' : clf_classifiers_name, 
                    'fold' : fold,
                    'params' : tmp_params_string,
                    'fit_time' : fit_time, 
                    'predict_time' : predict_time,
                    'roc_auc':roc_auc_score(y_test_tmp, pred)
                
            }
            
            # Расширяем другими параметрами
            data.update(tmp_params)
            
            # Формируем финальный датафрейм
            df1 = df1.append(data, ignore_index=True)
            
df1.head()


classifiers_name LogisticRegression
Параметры:  {'penalty': 'l1', 'C': 0.01}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l1', 'C': 5}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l2', 'C': 0.01}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'penalty': 'l2', 'C': 5}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
classifiers_name RandomForestClassifier
Параметры:  {'n_estimators': 10, 'criterion': 'gini'}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'n_estimators': 10, 'criterion': 'entropy'}
Размер тренировочного / тестового датасета:  311 312
Размер тренировочного / тестового датасета:  312 311
Параметры:  {'n_est

Unnamed: 0,C,fit_time,fold,model_name,params,penalty,predict_time,roc_auc,criterion,n_estimators
0,0.01,0.003029,0.0,LogisticRegression,"penalty=l1, C=0.01",l1,0.000907,0.550844,,
1,0.01,0.002496,1.0,LogisticRegression,"penalty=l1, C=0.01",l1,0.00089,0.557964,,
2,5.0,0.003381,0.0,LogisticRegression,"penalty=l1, C=5",l1,0.001553,0.627463,,
3,5.0,0.003574,1.0,LogisticRegression,"penalty=l1, C=5",l1,0.000999,0.610426,,
4,0.01,0.002604,0.0,LogisticRegression,"penalty=l2, C=0.01",l2,0.00083,0.594212,,


In [None]:
############################

In [58]:
# class ColumnSelector():
#     def __init__(self,columns = None):
#         self.columns = columns # array of column names to encode
        
#     def fit(self,X,y=None):
#         return self
    
#     def transform(self, X):                                                           
#         return X[self.columns]

In [110]:
cat_features

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [112]:
X[cat_features]

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S
...,...,...,...,...,...
886,"Montvila, Rev. Juozas",male,211536,,S
887,"Graham, Miss. Margaret Edith",female,112053,B42,S
888,"Johnston, Miss. Catherine Helen ""Carrie""",female,W./C. 6607,,S
889,"Behr, Mr. Karl Howell",male,111369,C148,C


In [203]:
class ColumnSelector():
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode
        
    def fit(self,X,y=None):
        return self
    
    def transform(self, X):                                                           
        return X[self.columns]
    
    def fit_transform(self, X,y=None,  **fit_params):
        self.fit(X,y,  **fit_params)
        return self.transform(X)

In [132]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return pd.DataFrame(X[self.columns], columns = X.columns)

In [128]:
class MyLEncoder():
    
    def transform(self, X, **fit_params):
        enc = LabelEncoder()
        enc_data = []
        for i in list(X.columns):
            X[i] = X[i].astype(str)
            encc = enc.fit(X[i])
            enc_data.append(encc.transform(X[i]))
        return np.asarray(enc_data).T
    
    def fit_transform(self, X,y=None,  **fit_params):
        self.fit(X,y,  **fit_params)
        return self.transform(X)
    def fit(self, X, y, **fit_params):
        return self 

In [210]:
class ModifiedSimpleImputer(SimpleImputer):
    
#     def fit_transform(self, X,y=None,  **fit_params):
#         self.fit(X,y,  **fit_params)
#         return self.transform(X)

    
    def transform(self, X):
        return pd.DataFrame(super().transform(X))

In [212]:
class ModifiedFeatureUnion(FeatureUnion):
    
#     def fit_transform(self, X,y=None,  **fit_params):
#         self.fit(X,y,  **fit_params)
#         return self.transform(X)
    def merge_dataframes_by_column(self, X):
        return pd.concat(X, axis="columns", copy=False)
    
    def transform(self, X):
        X = self.merge_dataframes_by_column(X)
        return pd.DataFrame(super().transform(X), columns = X.columns)

In [93]:
# class ModifiedSimpleImputer(SimpleImputer):
#     def transform(self, X):
#         return super().transform(X).flatten()

In [219]:
from mymod import parallel_run
lbview.map(parallel_run, inputs)

ModuleNotFoundError: No module named 'mymod'

In [227]:
import numpy as np
import pandas as pd
from sklearn.externals.joblib import Parallel, delayed
from sklearn.pipeline import FeatureUnion, _fit_transform_one, _transform_one
from scipy import sparse



In [262]:
class PandasFeatureUnion(FeatureUnion):
    def fit_transform(self, X, y=None, **fit_params):
        self._validate_transformers()
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_transform_one)(
                transformer=trans,
                X=X,
                y=y,
                weight=weight,
                **fit_params)
            for name, trans, weight in self._iter())

        if not result:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        Xs, transformers = zip(*result)
        self._update_transformer_list(transformers)
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

    def merge_dataframes_by_column(self, Xs):
        return pd.concat(Xs, axis="columns", copy=False)

    def transform(self, X):
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(
                transformer=trans,
                X=X,
                y=None,
                weight=weight)
            for name, trans, weight in self._iter())
        if not Xs:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        if any(sparse.issparse(f) for f in Xs):
            Xs =  self.merge_dataframes_by_column(Xs)
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

In [232]:
class MyPandasFeatureUnion(FeatureUnion):
    def fit_transform(self, X, y=None, weight=None,**fit_params):
        self._validate_transformers()
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_transform_one)(trans, weight, X, y,
                                        **fit_params)
            for name, trans, weight in self._iter())

        if not result:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        Xs, transformers = zip(*result)
        self._update_transformer_list(transformers)
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

    def merge_dataframes_by_column(self, Xs):
        return pd.concat(Xs, axis="columns", copy=False)

    def transform(self, X):
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(trans, weight, X)
            for name, trans, weight in self._iter())
        if not Xs:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

In [263]:
preprocessor2 = PandasFeatureUnion(
        transformer_list=[
             # categorical features
            ('categorical', Pipeline([
                 ('selector', ColumnSelector(columns = cat_features)),
                 ('imputer',  ModifiedSimpleImputer(strategy='constant', fill_value='missing'))
            ])),
            # numeric features
            ('numeric', Pipeline([
                 ('selector', ColumnSelector(columns = digits_features)),
                 ('imputer',  ModifiedSimpleImputer(strategy='constant', fill_value=-1))
            ])),
        ])

preprocessor2.fit(X_train)
X_mod4 = preprocessor2.transform(X_train)

X_mod4

#pd.DataFrame(X_mod4)

Unnamed: 0,0,1,2,3,4,0.1,1.1,2.1,3.1,4.1,5
0,"Dodge, Master. Washington",male,33638,A34,S,446.0,1.0,4.0,0.0,2.0,81.8583
1,"Mitkoff, Mr. Mito",male,349221,missing,S,651.0,3.0,-1.0,0.0,0.0,7.8958
2,"Johnson, Miss. Eleanor Ileen",female,347742,missing,S,173.0,3.0,1.0,1.0,1.0,11.1333
3,"West, Mr. Edwy Arthur",male,C.A. 34651,missing,S,451.0,2.0,36.0,1.0,2.0,27.7500
4,"Hart, Mr. Benjamin",male,F.C.C. 13529,missing,S,315.0,2.0,43.0,1.0,1.0,26.2500
...,...,...,...,...,...,...,...,...,...,...,...
618,"Salkjelsvik, Miss. Anna Kristine",female,343120,missing,S,107.0,3.0,21.0,0.0,0.0,7.6500
619,"Cairns, Mr. Alexander",male,113798,missing,S,271.0,1.0,-1.0,0.0,0.0,31.0000
620,"Hansen, Mr. Claus Peter",male,350026,missing,S,861.0,3.0,41.0,2.0,0.0,14.1083
621,"Carter, Miss. Lucile Polk",female,113760,B96 B98,S,436.0,1.0,14.0,1.0,2.0,120.0000


In [249]:
X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
445,446,1,"Dodge, Master. Washington",male,4.0,0,2,33638,81.8583,A34,S
650,651,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S
172,173,3,"Johnson, Miss. Eleanor Ileen",female,1.0,1,1,347742,11.1333,,S
450,451,2,"West, Mr. Edwy Arthur",male,36.0,1,2,C.A. 34651,27.7500,,S
314,315,2,"Hart, Mr. Benjamin",male,43.0,1,1,F.C.C. 13529,26.2500,,S
...,...,...,...,...,...,...,...,...,...,...,...
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S
270,271,1,"Cairns, Mr. Alexander",male,,0,0,113798,31.0000,,S
860,861,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S
435,436,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0000,B96 B98,S


In [224]:
pipeline3 = Pipeline([
    # Use FeatureUnion to combine the features
    ('union', FeatureUnion(
        transformer_list=[
             # categorical features
            ('categorical', Pipeline([
                 ('selector', ColumnSelector(columns = cat_features)),
                 ('imputer', ModifiedSimpleImputer(strategy='constant', fill_value='missing')),
            ])),
            # numeric features
            ('numeric', Pipeline([
                 ('selector', ColumnSelector(columns = digits_features)),
                 ('imputer', ModifiedSimpleImputer(strategy='constant', fill_value=-1))
            ])),
        ]))
])

pipeline3.fit(X_train)
X_mod5 = pipeline3.transform(X_train)
pd.DataFrame(X_mod5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,"Dodge, Master. Washington",male,33638,A34,S,446,1,4,0,2,81.8583
1,"Mitkoff, Mr. Mito",male,349221,missing,S,651,3,-1,0,0,7.8958
2,"Johnson, Miss. Eleanor Ileen",female,347742,missing,S,173,3,1,1,1,11.1333
3,"West, Mr. Edwy Arthur",male,C.A. 34651,missing,S,451,2,36,1,2,27.75
4,"Hart, Mr. Benjamin",male,F.C.C. 13529,missing,S,315,2,43,1,1,26.25
...,...,...,...,...,...,...,...,...,...,...,...
618,"Salkjelsvik, Miss. Anna Kristine",female,343120,missing,S,107,3,21,0,0,7.65
619,"Cairns, Mr. Alexander",male,113798,missing,S,271,1,-1,0,0,31
620,"Hansen, Mr. Claus Peter",male,350026,missing,S,861,3,41,2,0,14.1083
621,"Carter, Miss. Lucile Polk",female,113760,B96 B98,S,436,1,14,1,2,120


In [226]:
pipeline3.named_steps['union'].get_feature_names()

AttributeError: Transformer categorical (type Pipeline) does not provide get_feature_names.

In [239]:
#import pandas as pd
#from sklearn.datasets import load_iris
#from sklearn.pipeline import make_pipeline

#from pandas_feature_union import PandasFeatureUnion

In [259]:
preprocessor2 = PandasFeatureUnion(
        transformer_list=[
             # categorical features
            ('categorical', Pipeline([
                 ('selector', ColumnSelector(columns = cat_features)),
                 ('imputer', SimpleImputer(strategy='constant', fill_value='missing'))
            ])),
            # numeric features
            ('numeric', Pipeline([
                 ('selector', ColumnSelector(columns = digits_features)),
                 ('imputer', SimpleImputer(strategy='constant', fill_value=-1))
            ])),
        ])

preprocessor2.fit(X_train)
X_mod4 = preprocessor2.transform(X_train)

X_mod4

#pd.DataFrame(X_mod4)

TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid

In [205]:
X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
445,446,1,"Dodge, Master. Washington",male,4.0,0,2,33638,81.8583,A34,S
650,651,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S
172,173,3,"Johnson, Miss. Eleanor Ileen",female,1.0,1,1,347742,11.1333,,S
450,451,2,"West, Mr. Edwy Arthur",male,36.0,1,2,C.A. 34651,27.7500,,S
314,315,2,"Hart, Mr. Benjamin",male,43.0,1,1,F.C.C. 13529,26.2500,,S
...,...,...,...,...,...,...,...,...,...,...,...
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S
270,271,1,"Cairns, Mr. Alexander",male,,0,0,113798,31.0000,,S
860,861,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S
435,436,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0000,B96 B98,S


In [137]:
# Выбираем категориальный фичи ( важно, что для класса MyLEncoder они преобразуются в string, иначе он работать не будет)
# digits_features = df.select_dtypes(include=['number']).columns.values.tolist()
# digits_features.remove('Survived')

# cat_features = df.select_dtypes(include=['object', ]).columns.values.tolist()



digits_features = X.select_dtypes(include=['number']).columns.values.tolist()
cat_features = X.select_dtypes(include=['object', ]).columns.values.tolist()


In [75]:
pipeline = Pipeline([
    # Use FeatureUnion to combine the features
    ('union', FeatureUnion(
        transformer_list=[
             # categorical features
            ('categorical', Pipeline([
                 ('selector', ColumnSelector(columns = cat_features)),
                 ('imputer', ModifiedSimpleImputer(strategy='constant', fill_value='missing')),
                 ('label_encoding', MyLEncoder())
            ])),
            # numeric features
            ('numeric', Pipeline([
                 ('selector', ColumnSelector(columns = digits_features)),
                 ('imputer', ModifiedSimpleImputer(strategy='constant', fill_value=-1))
            ])),
        ])),
    # Use model fit
    ('model_fitting', RandomForestClassifier()),
])

pipeline.fit(X_train, y_train)
pred = pipeline.predict_proba(X_test)

In [None]:
import category_encoders as ce
from category_encoders import 

#encoder = ce.TargetEncoder(cols=[...])

In [149]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

In [180]:
class ModifiedColumnTransformer(ColumnTransformer):
    
#     def fit_transform(self, X,y=None,  **fit_params):
#         self.fit(X,y,  **fit_params)
#         return self.transform(X)

    
    def transform(self, X):
        return pd.DataFrame(super().transform(X), columns = X.columns)

In [None]:
mymy

In [195]:
# Define categorical pipeline
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing'))])

# Define numerical pipeline
num_pipe = Pipeline([('imputer', SimpleImputer(strategy='median')),
                     ('scaler', MinMaxScaler())])

# Combine categorical and numerical pipelines
preprocessor = ModifiedColumnTransformer(transformers=[('cat', cat_pipe, cat_features),
                                               ('num', num_pipe, digits_features)])


preprocessor.fit(X_train)
X_mod3 = preprocessor.transform(X_train)

#columns_cat_new = get_column_names_from_ColumnTransformer(preprocessor) 

#X_mod3[cat_features + digits_features]
#pd.DataFrame(X_mod3)

X_mod3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,"Dodge, Master. Washington",male,33638,A34,S,0.5,0,0.0449862,0,0.333333,0.159777
1,"Mitkoff, Mr. Mito",male,349221,missing,S,0.730337,1,0.346569,0,0,0.0154116
2,"Johnson, Miss. Eleanor Ileen",female,347742,missing,S,0.193258,1,0.00728826,0.125,0.166667,0.0217308
3,"West, Mr. Edwy Arthur",male,C.A. 34651,missing,S,0.505618,0.5,0.447097,0.125,0.333333,0.0541644
4,"Hart, Mr. Benjamin",male,F.C.C. 13529,missing,S,0.352809,0.5,0.535059,0.125,0.166667,0.0512366
...,...,...,...,...,...,...,...,...,...,...,...
618,"Salkjelsvik, Miss. Anna Kristine",female,343120,missing,S,0.119101,1,0.258608,0,0,0.0149318
619,"Cairns, Mr. Alexander",male,113798,missing,S,0.303371,0,0.346569,0,0,0.060508
620,"Hansen, Mr. Claus Peter",male,350026,missing,S,0.966292,1,0.509927,0.25,0,0.0275376
621,"Carter, Miss. Lucile Polk",female,113760,B96 B98,S,0.488764,0,0.170646,0.125,0.333333,0.234224


In [186]:
def get_column_names_from_ColumnTransformer(column_transformer):    
    col_name = []
    for transformer_in_columns in column_transformer.transformers_[:-1]:#the last transformer is ColumnTransformer's 'remainder'
        raw_col_name = transformer_in_columns[2]
        if isinstance(transformer_in_columns[1],Pipeline): 
            transformer = transformer_in_columns[1].steps[-1][1]
        else:
            transformer = transformer_in_columns[1]
        try:
            names = transformer.get_feature_names()
        except AttributeError: # if no 'get_feature_names' function, use raw column name
            names = raw_col_name
        if isinstance(names,np.ndarray): # eg.
            col_name += names.tolist()
        elif isinstance(names,list):
            col_name += names    
        elif isinstance(names,str):
            col_name.append(names)
    return col_name


get_column_names_from_ColumnTransformer(preprocessor)


['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [158]:
X_mod3.shape

(623, 11)

In [159]:
X_train.shape

(623, 11)

In [160]:
pd.DataFrame(X_mod3, columns = X_train.columns)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,"Dodge, Master. Washington",male,33638,A34,S,0.5,0,0.0449862,0,0.333333,0.159777
1,"Mitkoff, Mr. Mito",male,349221,missing,S,0.730337,1,0.346569,0,0,0.0154116
2,"Johnson, Miss. Eleanor Ileen",female,347742,missing,S,0.193258,1,0.00728826,0.125,0.166667,0.0217308
3,"West, Mr. Edwy Arthur",male,C.A. 34651,missing,S,0.505618,0.5,0.447097,0.125,0.333333,0.0541644
4,"Hart, Mr. Benjamin",male,F.C.C. 13529,missing,S,0.352809,0.5,0.535059,0.125,0.166667,0.0512366
...,...,...,...,...,...,...,...,...,...,...,...
618,"Salkjelsvik, Miss. Anna Kristine",female,343120,missing,S,0.119101,1,0.258608,0,0,0.0149318
619,"Cairns, Mr. Alexander",male,113798,missing,S,0.303371,0,0.346569,0,0,0.060508
620,"Hansen, Mr. Claus Peter",male,350026,missing,S,0.966292,1,0.509927,0.25,0,0.0275376
621,"Carter, Miss. Lucile Polk",female,113760,B96 B98,S,0.488764,0,0.170646,0.125,0.333333,0.234224


In [147]:
preprocessor =  ColumnTransformer(
        transformers=[
             # categorical features
            ('categorical', Pipeline([
                 ('selector', FeatureSelector(columns = cat_features)),
       
            ])),
            # numeric features
            ('numeric', Pipeline([
                 ('selector', FeatureSelector(columns = digits_features)),

            ])),
        ])

preprocessor.fit(X_train)
X_mod3 = preprocessor.transform(X_train)

X_mod3.head()

ValueError: not enough values to unpack (expected 3, got 2)

In [None]:
ModifiedColumnTransformer

In [None]:
# Define categorical pipeline
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing'))])

# Define numerical pipeline
num_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
                     ('scaler', MinMaxScaler())])

# Combine categorical and numerical pipelines
preprocessor = ModifiedColumnTransformer(transformers=[('cat', cat_pipe, cat_features),
                                               ('num', num_pipe, digits_features)])


preprocessor.fit(X_train)
X_mod3 = preprocessor.transform(X_train)

X_mod3
#pd.DataFrame(X_mod3, columns = X_train.columns)

In [173]:
cat_features

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [None]:
ModifiedColumnTransformer

In [181]:
# Define categorical pipeline
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing'))])

# Define numerical pipeline
num_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value=-1))
                     ])

# Combine categorical and numerical pipelines
preprocessor = ModifiedColumnTransformer(transformers=[('cat', cat_pipe, cat_features),
                                               ('num', num_pipe, digits_features)])


preprocessor.fit(X_train)
X_mod3 = preprocessor.transform(X_train)


X_mod3
#X_mod3[X_train.columns]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,"Dodge, Master. Washington",male,33638,A34,S,446,1,4,0,2,81.8583
1,"Mitkoff, Mr. Mito",male,349221,missing,S,651,3,-1,0,0,7.8958
2,"Johnson, Miss. Eleanor Ileen",female,347742,missing,S,173,3,1,1,1,11.1333
3,"West, Mr. Edwy Arthur",male,C.A. 34651,missing,S,451,2,36,1,2,27.75
4,"Hart, Mr. Benjamin",male,F.C.C. 13529,missing,S,315,2,43,1,1,26.25
...,...,...,...,...,...,...,...,...,...,...,...
618,"Salkjelsvik, Miss. Anna Kristine",female,343120,missing,S,107,3,21,0,0,7.65
619,"Cairns, Mr. Alexander",male,113798,missing,S,271,1,-1,0,0,31
620,"Hansen, Mr. Claus Peter",male,350026,missing,S,861,3,41,2,0,14.1083
621,"Carter, Miss. Lucile Polk",female,113760,B96 B98,S,436,1,14,1,2,120


In [179]:
cat_features

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [169]:
X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
445,446,1,"Dodge, Master. Washington",male,4.0,0,2,33638,81.8583,A34,S
650,651,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S
172,173,3,"Johnson, Miss. Eleanor Ileen",female,1.0,1,1,347742,11.1333,,S
450,451,2,"West, Mr. Edwy Arthur",male,36.0,1,2,C.A. 34651,27.7500,,S
314,315,2,"Hart, Mr. Benjamin",male,43.0,1,1,F.C.C. 13529,26.2500,,S
...,...,...,...,...,...,...,...,...,...,...,...
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S
270,271,1,"Cairns, Mr. Alexander",male,,0,0,113798,31.0000,,S
860,861,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S
435,436,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0000,B96 B98,S


In [198]:
preprocessor2 = FeatureUnion(
        transformer_list=[
             # categorical features
            ('categorical', Pipeline([
                 ('selector', ColumnSelector(columns = cat_features)),
                 ('imputer', ModifiedSimpleImputer(strategy='constant', fill_value='missing'))
            ])),
            # numeric features
            ('numeric', Pipeline([
                 ('selector', ColumnSelector(columns = digits_features)),
                 ('imputer', ModifiedSimpleImputer(strategy='constant', fill_value=-1))
            ])),
        ])

preprocessor2.fit(X_train)
X_mod4 = preprocessor2.transform(X_train)


X_mod4

AttributeError: 'ModifiedSimpleImputer' object has no attribute 'columns'

In [None]:
pipeline = Pipeline([
    # Use FeatureUnion to combine the features
    ('union', FeatureUnion(
        transformer_list=[
             # categorical features
            ('categorical', Pipeline([
                 ('selector', ColumnSelector(columns = cat_features)),
                 ('imputer', ModifiedSimpleImputer(strategy='constant', fill_value='missing')),
                 ('label_encoding', MyLEncoder())
            ])),
            # numeric features
            ('numeric', Pipeline([
                 ('selector', ColumnSelector(columns = digits_features)),
                 ('imputer', ModifiedSimpleImputer(strategy='constant', fill_value=-1))
            ])),
        ])),
    # Use model fit
    ('model_fitting', RandomForestClassifier()),
])

pipeline.fit(X_train, y_train)
pred = pipeline.predict_proba(X_test)

In [163]:
pipeline2 = Pipeline([
    # Use FeatureUnion to combine the features
    ('union', ModifiedColumnTransformer(
        transformers=[
             # categorical features
            ('categorical', Pipeline([
                 ('selector', FeatureSelector(columns = cat_features)),
       
            ])),
            # numeric features
            ('numeric', Pipeline([
                 ('selector', FeatureSelector(columns = digits_features)),

            ])),
        ]))
    # Use model
])

pipeline2.fit(X_train)
X_mod = pipeline2.transform(X_train)
X_mod

ValueError: not enough values to unpack (expected 3, got 2)

In [134]:
X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
445,446,1,"Dodge, Master. Washington",male,4.0,0,2,33638,81.8583,A34,S
650,651,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S
172,173,3,"Johnson, Miss. Eleanor Ileen",female,1.0,1,1,347742,11.1333,,S
450,451,2,"West, Mr. Edwy Arthur",male,36.0,1,2,C.A. 34651,27.7500,,S
314,315,2,"Hart, Mr. Benjamin",male,43.0,1,1,F.C.C. 13529,26.2500,,S
...,...,...,...,...,...,...,...,...,...,...,...
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S
270,271,1,"Cairns, Mr. Alexander",male,,0,0,113798,31.0000,,S
860,861,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S
435,436,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0000,B96 B98,S


In [135]:
X_mod

array([[nan, nan, 'Dodge, Master. Washington', ..., 81.8583, nan, nan],
       [nan, nan, 'Mitkoff, Mr. Mito', ..., 7.8958, nan, nan],
       [nan, nan, 'Johnson, Miss. Eleanor Ileen', ..., 11.1333, nan, nan],
       ...,
       [nan, nan, 'Hansen, Mr. Claus Peter', ..., 14.1083, nan, nan],
       [nan, nan, 'Carter, Miss. Lucile Polk', ..., 120.0, nan, nan],
       [nan, nan, 'White, Mr. Richard Frasar', ..., 77.2875, nan, nan]],
      dtype=object)

In [136]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 623 entries, 445 to 102
Data columns (total 11 columns):
PassengerId    623 non-null int64
Pclass         623 non-null int64
Name           623 non-null object
Sex            623 non-null object
Age            499 non-null float64
SibSp          623 non-null int64
Parch          623 non-null int64
Ticket         623 non-null object
Fare           623 non-null float64
Cabin          139 non-null object
Embarked       622 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 58.4+ KB


In [138]:
digits_features

['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [140]:
cat_features

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [143]:
fet = FeatureSelector(cat_features)

X_mod2 = fet.fit_transform(X_train)
X_mod2

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
445,,,"Dodge, Master. Washington",male,,,,33638,,A34,S
650,,,"Mitkoff, Mr. Mito",male,,,,349221,,,S
172,,,"Johnson, Miss. Eleanor Ileen",female,,,,347742,,,S
450,,,"West, Mr. Edwy Arthur",male,,,,C.A. 34651,,,S
314,,,"Hart, Mr. Benjamin",male,,,,F.C.C. 13529,,,S
...,...,...,...,...,...,...,...,...,...,...,...
106,,,"Salkjelsvik, Miss. Anna Kristine",female,,,,343120,,,S
270,,,"Cairns, Mr. Alexander",male,,,,113798,,,S
860,,,"Hansen, Mr. Claus Peter",male,,,,350026,,,S
435,,,"Carter, Miss. Lucile Polk",female,,,,113760,,B96 B98,S


In [None]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return pd.DataFrame(X[self.columns], columns = X.columns)

In [None]:
pipeline2 = Pipeline([
    # Use FeatureUnion to combine the features
    ('union', FeatureUnion(
        transformer_list=[
             # categorical features
            ('categorical', Pipeline([
                 ('selector', ColumnSelector(columns = cat_features)),
                 ('imputer', ModifiedSimpleImputer(strategy='constant', fill_value='missing')),
                 ('label_encoding', MyLEncoder())
            ])),
            # numeric features
            ('numeric', Pipeline([
                 ('selector', ColumnSelector(columns = digits_features)),
                 ('imputer', ModifiedSimpleImputer(strategy='constant', fill_value=-1))
            ])),
        ]))
    # Use model
])

X_mod = pipeline2.fit_transform(X_train, y_train)

In [56]:
X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
445,446,1,"Dodge, Master. Washington",male,4.0,0,2,33638,81.8583,A34,S
650,651,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S
172,173,3,"Johnson, Miss. Eleanor Ileen",female,1.0,1,1,347742,11.1333,,S
450,451,2,"West, Mr. Edwy Arthur",male,36.0,1,2,C.A. 34651,27.7500,,S
314,315,2,"Hart, Mr. Benjamin",male,43.0,1,1,F.C.C. 13529,26.2500,,S
...,...,...,...,...,...,...,...,...,...,...,...
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S
270,271,1,"Cairns, Mr. Alexander",male,,0,0,113798,31.0000,,S
860,861,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S
435,436,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0000,B96 B98,S


In [42]:
cat_features

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [45]:
digits_features

['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [None]:
classifiers = [LogisticRegression,
       KNeighborsClassifier,
       #GradientBoostingClassifier(), 
       RandomForestClassifier] 
#               SVC()] # 

classifiers_name = ['LogisticRegression',

                    'KNeighborsClassifier',
                    #'GradientBoostingClassifier', 
                    'RandomForestClassifier'] 
#                    'SVC']


# Настройка параметров выбранных алгоритмов с помощью GridSearchCV 
n_folds = 5
scores = []
fits = []
logistic_params = {'penalty': ('l1', 'l2'),
                   'C': (.01,.1,1,5)}

knn_params = {'n_neighbors': list(range(3, 6, 2))}


gbm_params = {'n_estimators': [100, 300, 500],
              'learning_rate':(0.1, 0.5, 1),
              'max_depth': list(range(3, 6)), 
              'min_samples_leaf': list(range(10, 31, 10))}



forest_params = {'n_estimators': [10, 30, 50],
                 'criterion': ('gini', 'entropy')}

#svm_param = {'kernel' : ('linear', 'rbf'), 'C': (.5, 1, 2)} - очень долго считал
#params = [logistic_params, knn_params, gbm_params, forest_params]

params = [logistic_params, knn_params ,forest_params]        


np.random.seed(0)



In [None]:
# cool 

classifiers = [LogisticRegression,
       KNeighborsClassifier,
       #GradientBoostingClassifier(), 
       RandomForestClassifier] 
#               SVC()] # 

classifiers_name = ['LogisticRegression',

                    'KNeighborsClassifier',
                    #'GradientBoostingClassifier', 
                    'RandomForestClassifier'] 
#                    'SVC']


# Настройка параметров выбранных алгоритмов с помощью GridSearchCV 
n_folds = 5
scores = []
fits = []
logistic_params = {'penalty': ('l1', 'l2'),
                   'C': (.01,.1,1,5)}

knn_params = {'n_neighbors': list(range(3, 6, 2))}


gbm_params = {'n_estimators': [100, 300, 500],
              'learning_rate':(0.1, 0.5, 1),
              'max_depth': list(range(3, 6)), 
              'min_samples_leaf': list(range(10, 31, 10))}



forest_params = {'n_estimators': [10, 30, 50],
                 'criterion': ('gini', 'entropy')}

#svm_param = {'kernel' : ('linear', 'rbf'), 'C': (.5, 1, 2)} - очень долго считал
#params = [logistic_params, knn_params, gbm_params, forest_params]

params = [logistic_params, knn_params ,forest_params]        


np.random.seed(0)




np.random.seed(0)

df1 = pd.DataFrame()

skf = StratifiedKFold(n_splits=2, random_state=0)

for i , each_classifier in enumerate(classifiers):
    clf = each_classifier
    clf_params = params[i]
    clf_classifiers_name = classifiers_name[i]
    print("classifiers_name", clf_classifiers_name)
    
    for tmp_params in get_params_combination(clf_params):
        print("Параметры: ", tmp_params)
        skf_index = skf.split(X_train, y_train)
        for fold, (train_idx, test_idx) in enumerate(skf_index):
            print("Размер тренировочного / тестового датасета: ", len(train_idx), len(test_idx))
            
            # Формируем тренеровочный и валидационный датасет
            X_train_tmp, X_test_tmp = X_train.iloc[train_idx], X_train.iloc[test_idx]
            y_train_tmp, y_test_tmp = y_train.iloc[train_idx], y_train.iloc[test_idx]
            
            # Получаем модель
            tmp_clf = clf(**tmp_params)

            
            # Замеряем время fit
            start_time = time.time()
            pred = tmp_clf.fit(X_train_tmp, y_train_tmp)
            fit_time = time.time() - start_time
            
            
            # Замеряем время predict
            start_time = time.time()
            pred = tmp_clf.predict(X_test_tmp)
            predict_time = time.time() - start_time
            
            tmp_params_string = ", ".join(("{}={}".format(*i) for i in tmp_params.items()))
            
            data = {'model_name' : clf_classifiers_name, 
                    'fold' : fold,
                    'params' : tmp_params_string,
                    'fit_time' : fit_time, 
                    'predict_time' : predict_time,
                    'roc_auc':roc_auc_score(y_test_tmp, pred)
                
            }
            
            # Расширяем другими параметрами
            data.update(tmp_params)
            
            # Формируем финальный датафрейм
            df1 = df1.append(data, ignore_index=True)
            
df1.head()


In [None]:

%%time
np.random.seed(42)

for i, each_classifier in enumerate(classifiers):
    clf = each_classifier
    clf_params = params[i]
    clf = Pipeline([('std', StandardScaler()),
                    ('clf', clf)])  
    grid = GridSearchCV(clf, clf_params, 
                        cv=5, 
                        n_jobs=-1, scoring="roc_auc")
    grid.fit(X_train, y_train)
    fits.append(grid.best_params_)
    clf_best_score = grid.best_score_
    scores.append(clf_best_score)
    print(classifiers_name[i], clf_best_score, "\n", grid.best_params_, "\n")

# Черновик

In [None]:
# Настройка параметров выбранных алгоритмов с помощью GridSearchCV 
n_folds = 5
scores = []
fits = []
logistic_params = {'clf__penalty': ('l1', 'l2'),
                   'clf__C': (.01,.1,1,5)}

knn_params = {'clf__n_neighbors': list(range(3, 12, 2))}


gbm_params = {'clf__n_estimators': [100, 300, 500],
              'clf__learning_rate':(0.1, 0.5, 1),
              'clf__max_depth': list(range(3, 6)), 
              'clf__min_samples_leaf': list(range(10, 31, 10))}


forest_params = {'clf__n_estimators': [100, 300, 500],
                 'clf__criterion': ('gini', 'entropy'), 
                 'clf__max_depth': list(range(3, 6)), 
                 'clf__min_samples_leaf': list(range(10, 31, 10))}

#svm_param = {'kernel' : ('linear', 'rbf'), 'C': (.5, 1, 2)} - очень долго считал
params = [logistic_params, knn_params, gbm_params, forest_params]

In [48]:
df.info(null_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [57]:
df.isnull().sum().to_dict()

{'PassengerId': 0,
 'Survived': 0,
 'Pclass': 0,
 'Name': 0,
 'Sex': 0,
 'Age': 177,
 'SibSp': 0,
 'Parch': 0,
 'Ticket': 0,
 'Fare': 0,
 'Cabin': 687,
 'Embarked': 2}

In [11]:
# Или другой способ проверить, если пропуски в данных
sum(df.isnull().sum())

866

In [16]:
df[target_col].value_counts(normalize=True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [18]:
df[target_col].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [17]:
# Посмотрим на статистические характеристики:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [68]:
df1 = df.describe(include='all').T.rename_axis('features').reset_index()

df1.head(20)
#df.describe(include='all').T


Unnamed: 0,features,count,unique,top,freq,mean,std,min,25%,50%,75%,max
0,PassengerId,891,,,,446.0,257.354,1.0,223.5,446.0,668.5,891.0
1,Survived,891,,,,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
2,Pclass,891,,,,2.30864,0.836071,1.0,2.0,3.0,3.0,3.0
3,Name,891,891.0,"Nicola-Yarred, Master. Elias",1.0,,,,,,,
4,Sex,891,2.0,male,577.0,,,,,,,
5,Age,714,,,,29.6991,14.5265,0.42,20.125,28.0,38.0,80.0
6,SibSp,891,,,,0.523008,1.10274,0.0,0.0,0.0,1.0,8.0
7,Parch,891,,,,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
8,Ticket,891,681.0,347082,7.0,,,,,,,
9,Fare,891,,,,32.2042,49.6934,0.0,7.9104,14.4542,31.0,512.329


In [65]:
dict_null = df.isnull().sum().to_dict()
dict_null

{'PassengerId': 0,
 'Survived': 0,
 'Pclass': 0,
 'Name': 0,
 'Sex': 0,
 'Age': 177,
 'SibSp': 0,
 'Parch': 0,
 'Ticket': 0,
 'Fare': 0,
 'Cabin': 687,
 'Embarked': 2}

In [83]:
df = pd.read_csv('train.csv')
target_col = 'Survived'


df1 = df.describe(include='all').T.rename_axis('features').reset_index()

dict_null = df.isnull().sum().to_dict()
df1['feat_count_null'] = df1['features'].map(dict_null)

dict_type = df.dtypes.to_dict()
df1['feat_type'] = df1['features'].map(dict_type)

df1.head(20)

Unnamed: 0,features,count,unique,top,freq,mean,std,min,25%,50%,75%,max,feat_count_null,feat_type
0,PassengerId,891,,,,446.0,257.354,1.0,223.5,446.0,668.5,891.0,0,int64
1,Survived,891,,,,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0,0,int64
2,Pclass,891,,,,2.30864,0.836071,1.0,2.0,3.0,3.0,3.0,0,int64
3,Name,891,891.0,"Nicola-Yarred, Master. Elias",1.0,,,,,,,,0,object
4,Sex,891,2.0,male,577.0,,,,,,,,0,object
5,Age,714,,,,29.6991,14.5265,0.42,20.125,28.0,38.0,80.0,177,float64
6,SibSp,891,,,,0.523008,1.10274,0.0,0.0,0.0,1.0,8.0,0,int64
7,Parch,891,,,,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0,0,int64
8,Ticket,891,681.0,347082,7.0,,,,,,,,0,object
9,Fare,891,,,,32.2042,49.6934,0.0,7.9104,14.4542,31.0,512.329,0,float64


Unnamed: 0,features,count,unique,top,freq,mean,std,min,25%,50%,75%,max,feat_count_null,feat_type
0,PassengerId,891,891,,,446.0,257.354,1.0,223.5,446.0,668.5,891.0,0,int64
1,Survived,891,2,,,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0,0,int64
2,Pclass,891,3,,,2.30864,0.836071,1.0,2.0,3.0,3.0,3.0,0,int64
3,Name,891,891,"Nicola-Yarred, Master. Elias",1.0,,,,,,,,0,object
4,Sex,891,2,male,577.0,,,,,,,,0,object
5,Age,714,89,,,29.6991,14.5265,0.42,20.125,28.0,38.0,80.0,177,float64
6,SibSp,891,7,,,0.523008,1.10274,0.0,0.0,0.0,1.0,8.0,0,int64
7,Parch,891,7,,,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0,0,int64
8,Ticket,891,681,347082,7.0,,,,,,,,0,object
9,Fare,891,248,,,32.2042,49.6934,0.0,7.9104,14.4542,31.0,512.329,0,float64


Unnamed: 0,Survived,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,count,count_norm
0,0,549,549,549,549,424,549,549,549,549,68,549,549,0.616162
1,1,342,342,342,342,290,342,342,342,342,136,340,342,0.383838


In [115]:
df[target_col].value_counts().to_dict()

{0: 549, 1: 342}

In [116]:
df[target_col].value_counts(normalize=True).to_dict()

{0: 0.6161616161616161, 1: 0.3838383838383838}

Unnamed: 0,Survived,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,count,count_norm
0,0,549,549,549,549,424,549,549,549,549,68,549,549,0.616162
1,1,342,342,342,342,290,342,342,342,342,136,340,342,0.383838


In [128]:
df2 = df.copy(deep=True)

#df2.columns = df2.columns.droplevel()
df2.groupby(target_col).count()

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,549,549,549,549,424,549,549,549,549,68,549
1,342,342,342,342,290,342,342,342,342,136,340


In [130]:
df2.columns 

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [135]:
df2.reset_index()

Unnamed: 0,Survived,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,549,549,549,549,424,549,549,549,549,68,549
1,1,342,342,342,342,290,342,342,342,342,136,340


In [129]:
df2 = df.groupby(target_col).count()

df2.columns = df2.columns.droplevel()
df2

ValueError: Cannot remove 1 levels from an index with 1 levels: at least one level must be left.

In [None]:
d = {'key': 'value'}
print(d)
# {'key': 'value'}
d['mynewkey'] = 'mynewvalue'
print(d)
# {'key': 'value', 'mynewkey': 'mynewvalue'}

In [107]:
x = lambda x: (for i in x : print (i))

SyntaxError: invalid syntax (<ipython-input-107-71550d77ad43>, line 1)

In [104]:
lambda d: for column in df.columns d[column] = len(df[column].unique())
d

SyntaxError: invalid syntax (<ipython-input-104-115ef4b0588b>, line 1)

{'PassengerId': 891,
 'Survived': 2,
 'Pclass': 3,
 'Name': 891,
 'Sex': 2,
 'Age': 89,
 'SibSp': 7,
 'Parch': 7,
 'Ticket': 681,
 'Fare': 248,
 'Cabin': 148,
 'Embarked': 4}

In [93]:
len(df['PassengerId'].unique())

891

In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 14 columns):
features           12 non-null object
count              12 non-null object
unique             5 non-null object
top                5 non-null object
freq               5 non-null object
mean               7 non-null object
std                7 non-null object
min                7 non-null object
25%                7 non-null object
50%                7 non-null object
75%                7 non-null object
max                7 non-null object
feat_count_null    0 non-null float64
feat_type          0 non-null object
dtypes: float64(1), object(13)
memory usage: 1.4+ KB


In [79]:
df.dtypes.to_dict()

{'features': dtype('O'),
 'count': dtype('O'),
 'unique': dtype('O'),
 'top': dtype('O'),
 'freq': dtype('O'),
 'mean': dtype('O'),
 'std': dtype('O'),
 'min': dtype('O'),
 '25%': dtype('O'),
 '50%': dtype('O'),
 '75%': dtype('O'),
 'max': dtype('O'),
 'feat_count_null': dtype('float64'),
 'feat_type': dtype('O')}

In [None]:
df['D'] = df['U'].map(d)

In [62]:
df1

Unnamed: 0,index,count,unique,top,freq,mean,std,min,25%,50%,75%,max
0,PassengerId,891,,,,446.0,257.354,1.0,223.5,446.0,668.5,891.0
1,Survived,891,,,,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
2,Pclass,891,,,,2.30864,0.836071,1.0,2.0,3.0,3.0,3.0
3,Name,891,891.0,"Nicola-Yarred, Master. Elias",1.0,,,,,,,
4,Sex,891,2.0,male,577.0,,,,,,,
5,Age,714,,,,29.6991,14.5265,0.42,20.125,28.0,38.0,80.0
6,SibSp,891,,,,0.523008,1.10274,0.0,0.0,0.0,1.0,8.0
7,Parch,891,,,,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
8,Ticket,891,681.0,347082,7.0,,,,,,,
9,Fare,891,,,,32.2042,49.6934,0.0,7.9104,14.4542,31.0,512.329


In [1]:
df.describe().T , df.isnull().sum()

In [43]:
type(df)

pandas.core.frame.DataFrame

In [None]:
'pandas.core.frame.DataFrame

In [32]:
df.isnull().sum()[::1]

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [35]:
df.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


NoneType

In [46]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')