In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from scipy import stats
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression

%matplotlib inline
warnings.filterwarnings('ignore')

In [71]:
train_data = pd.read_csv('train.csv')

train_data.sample(10)

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
5609,3904,11,3.0,60.451104,37.239782,5.0,4,9.0,1974,0.012339,B,B,35,5776,1,2078.0,2,4,B,230175.180494
7146,239,6,1.0,39.705912,,0.0,16,17.0,1977,0.243205,B,B,5,1564,0,540.0,0,0,B,114672.768248
2044,12762,90,2.0,45.94929,29.817769,5.0,4,5.0,1964,0.265089,B,B,37,5288,0,1937.0,3,2,B,194240.534503
1455,12507,54,2.0,79.810535,79.578961,78.0,10,15.0,2014,0.006076,B,B,30,5285,0,645.0,6,6,B,438708.707579
6676,2545,55,3.0,78.294289,45.140193,10.0,14,14.0,1997,0.041365,B,B,31,5569,0,,1,4,B,287328.830996
3965,10178,57,1.0,36.081011,17.438633,7.0,4,5.0,1952,0.133215,B,B,49,11395,3,1406.0,3,4,A,160887.096225
3397,2030,10,3.0,62.541787,43.153632,6.0,8,12.0,1972,0.08904,B,B,33,7976,5,,0,11,B,254343.037767
7973,15070,84,1.0,35.472919,23.296678,8.0,2,5.0,1960,0.149666,B,B,22,4789,2,4087.0,4,1,B,154431.437589
4791,5704,62,1.0,53.810968,,0.0,4,0.0,1977,0.072158,B,B,2,629,1,,0,0,A,125797.832599
6393,2965,45,3.0,76.554751,,0.0,25,24.0,1977,0.195781,B,B,23,5212,6,,3,2,B,410641.458036


### Описание датасета
* Id - идентификационный номер квартиры
* DistrictId - идентификационный номер района
* Rooms - количество комнат
* Square - площадь
* LifeSquare - жилая площадь
* KitchenSquare - площадь кухни
* Floor - этаж
* HouseFloor - количество этажей в доме
* HouseYear - год постройки дома
* Ecology_1, Ecology_2, Ecology_3 - экологические показатели местности
* Social_1, Social_2, Social_3 - социальные показатели местности
* Healthcare_1, Helthcare_2 - показатели местности, связанные с охраной здоровья
* Shops_1, Shops_2 - показатели, связанные с наличием магазинов, торговых центров
* Price - цена квартиры

In [72]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             10000 non-null  int64  
 1   DistrictId     10000 non-null  int64  
 2   Rooms          10000 non-null  float64
 3   Square         10000 non-null  float64
 4   LifeSquare     7887 non-null   float64
 5   KitchenSquare  10000 non-null  float64
 6   Floor          10000 non-null  int64  
 7   HouseFloor     10000 non-null  float64
 8   HouseYear      10000 non-null  int64  
 9   Ecology_1      10000 non-null  float64
 10  Ecology_2      10000 non-null  object 
 11  Ecology_3      10000 non-null  object 
 12  Social_1       10000 non-null  int64  
 13  Social_2       10000 non-null  int64  
 14  Social_3       10000 non-null  int64  
 15  Healthcare_1   5202 non-null   float64
 16  Helthcare_2    10000 non-null  int64  
 17  Shops_1        10000 non-null  int64  
 18  Shops_2

## Exploratory data analysis

In [73]:
train_data.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,10000.0,10000.0,10000.0,10000.0,7887.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,5202.0,10000.0,10000.0,10000.0
mean,8383.4077,50.4008,1.8905,56.315775,37.199645,6.2733,8.5267,12.6094,3990.166,0.118858,24.687,5352.1574,8.0392,1142.90446,1.3195,4.2313,214138.857399
std,4859.01902,43.587592,0.839512,21.058732,86.241209,28.560917,5.241148,6.775974,200500.3,0.119025,17.532614,4006.799803,23.831875,1021.517264,1.493601,4.806341,92872.293865
min,0.0,0.0,0.0,1.136859,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,59174.778028
25%,4169.5,20.0,1.0,41.774881,22.769832,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,350.0,0.0,1.0,153872.633942
50%,8394.5,36.0,2.0,52.51331,32.78126,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0,192269.644879
75%,12592.5,75.0,2.0,65.900625,45.128803,9.0,12.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,1548.0,2.0,6.0,249135.462171
max,16798.0,209.0,19.0,641.065193,7480.592129,2014.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,633233.46657


In [74]:
# кол-во значений в признаках
# переведем признаки с небльшим кол-во значений в категориальные
pd.DataFrame({item:len(train_data[item].unique()) for item in train_data[train_data.columns.tolist()].select_dtypes(include=['int64', 'float64']).columns.tolist()}.items(), \
columns = ['Name', 'Count']).sort_values(by='Count').head(13)

Unnamed: 0,Name,Count
14,Helthcare_2,7
2,Rooms,9
15,Shops_1,16
12,Social_3,30
6,Floor,33
7,HouseFloor,44
10,Social_1,51
5,KitchenSquare,58
13,Healthcare_1,80
8,HouseYear,97


### класс конвейера

In [78]:
class DataPipeline:     
    def __init__(self, **kwargs):
        """Инициализация класса"""        
        # уровень стат значимости
        self.alpha = 0.05
        # пороговое значение в три сигмы
        self.threashold = 3              
        # параметры класса
        for key, value in kwargs.items():
            setattr(self, key, value)    
            
        assert self.train_file is not None
        assert self.test_file is not None                
    
    def run_transform(func):
        """Декоратор трансформаций"""
        def wrapper(self):      
            data = pd.read_csv(self.train_file)
            train_data = func(self, data)            
            data = pd.read_csv(self.test_file)
            test_data = func(self, data)             
            
        return wrapper    
    
    @run_transform
    def transform(self, data):       
        """Трансформация данных"""  
        # удаляем id
        data.drop("Id", axis=1, inplace=True)            
        # значений не много переводим в категориальный тип
        data.DistrictId = data.DistrictId.astype("category")
        
        data.Social_1 = data.Social_1.astype("category")
        data.Social_2 = data.Social_2.astype("category")
        data.Social_3 = data.Social_3.astype("category")

        data.Shops_1 = data.Shops_1.astype("category")
        data.Shops_2 = data.Shops_2.astype("category")
        
        data.Healthcare_1 = data.Healthcare_1.astype("category")
        data.Helthcare_2  = data.Helthcare_2.astype("category")

        # обработка категориальных признаков
        for item in self.property_cat(data):        
            data = self._transform_cat_(data, item)            
        # обработка количественных признаков
        for item in self.property_num(data):
            data = self._transform_num_(data, item)
            
        return data
            
    def _transform_cat_(self, data, column):
        """Трансформация категориальных признаков"""                
        # заменяем пропуски модой                                   
        data[column] = data[column].fillna(data[column].mode()[0])                    
        # кодируем признак
        data[column] = LabelEncoder().fit_transform(data[column])
        
        return data
        
    def _transform_num_(self, data, column):
        """Трансформация количественных признаков"""                      
        # запомнинание пропусков
        data['isna'] = np.where(data[column].isna(), True, False)                        
        # заполнение пропусков
        data[column] = data[column].fillna(data[column].mean())
        # стандартизация признака
        if column != self.target:
            data[column] = StandardScaler().fit_transform(data[[column]])                                          
        # обработка выбрасов, вычисляем z-Score        
        z = np.abs(stats.zscore(data[column]))                         
        # среднее если данные имеют нормальное распределение, иначе медиана                                    
        if stats.shapiro(data[column].values.reshape(-1))[1] < self.alpha:                        
            data.loc[z > self.threashold, column] = data[column].median()            
        else:            
            data.loc[z > self.threashold, column] = data[column].mean()                        
        
        # обработка пропусков             
        if True in data['isna'].values:
            # используем простую модель
            model = LinearRegression()
            # формируем обучающие и тестовые наборы
            X_test = data[data['isna']==True][[item for item in data.columns if item != column]]
            X_train = data[data['isna']==False][[item for item in data.columns if item != column]]        
            y_train = data[data['isna']==False][[column]]        
            # обучение модели
            model.fit(X_train, y_train)
            # выполняем предсказание прзнака
            data[data['isna']][column] = model.predict(X_test)            
            
        # винсоризация
        #self.data[column] = winsorize(self.data[column], limits=[0.1, 0.1])                  
        
        return data

    def fit(self):
        """Обучение модели"""    
        display(self.train_data.sample(10))
        display(self.test_data.sample(10))
        
    @property    
    def property_num(self, data):
        """количественные признаки"""
        return data[data.columns.tolist()].select_dtypes(include=['int64', 'float64']).columns.tolist()
        
    @property    
    def property_cat(self, data):
        """номинативные признаки"""
        return data[data.columns.tolist()].select_dtypes(include=['object', 'category']).columns.tolist()
        
    def plot(self, data):      
        """график plot"""
        plt.figure(figsize = (15, 8))                
        for column in self.property_num(data):
            sns.distplot(data[column], bins=50)

            target_mean = round(data[column].mean(), 2)
            target_median = data[column].median()
            target_mode = data[column].mode()[0]        

            y = np.linspace(0, 0.000005, 10)
            plt.plot([target_mean] * 10, y, label='mean', linestyle=':', linewidth=4)
            plt.plot([target_median] * 10, y, label='median', linestyle='--', linewidth=4)
            plt.plot([target_mode] * 10, y, label='mode', linestyle='-.', linewidth=4)

            plt.title(f'Distribution of {column}')
            plt.legend()
            plt.show()        

    def boxplot(self, data, column):
        """график box-plot"""
        plt.figure(figsize = (16, 8))                 
        data.boxplot(column=list(column))              

In [76]:
# создамие конвейера
pipeline = DataPipeline(train_file="train.csv", test_file="test.csv", target='Price')

In [80]:
# распределение количественных признаков в исходных данных и осноные статистические данные в них
pipeline.plot(pipeline.train_data)

AttributeError: 'DataPipeline' object has no attribute 'train_data'

In [81]:
# подготовка признаков
pipeline.transform()

TypeError: property_cat() missing 1 required positional argument: 'data'

In [None]:
pipeline.boxplot((item for item in pipeline.property_num if item != pipeline.target))

In [None]:
# распределение количественных признаков после нормализации данных и осноные статистические данные в них
for item in pipeline.property_num:
    pipeline.plot(item)