In [1]:
import re
import pandas as pd
import os
import numpy as np

In [2]:
class InvalidExtension(Exception):
    """Error raised for a file whose extension has no matching reader.

    The exception message embeds the offending extension (without the
    leading dot) in a fixed Russian-language template.
    """

    def __init__(self, extension):
        # Build the user-facing message directly in the base-class call.
        super().__init__(f'Расширение .{extension} не поддерживается')
        
def read_file(file_path, separator=';', encoding='cp1251'):
    """Load a tabular file into a DataFrame, choosing the reader by extension.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path to the file to read.
    separator : str, default ';'
        Field delimiter; used only for ``.csv`` files.
    encoding : str, default 'cp1251'
        Text encoding; used only for ``.csv`` files.

    Returns
    -------
    pandas.DataFrame
        Contents of the file.

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``file_path``.
    InvalidExtension
        If the extension is neither ``xlsx`` nor ``csv`` (case-insensitive).
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f'Файл по пути {file_path} не существует')

    # Look only at the last path component so that a dot inside a directory
    # name (e.g. "data.v1/table") is never mistaken for a file extension.
    filename = os.path.basename(file_path)
    _, dot, suffix = filename.rpartition('.')
    file_extension = suffix.lower() if dot else ''

    if file_extension == 'xlsx':
        return pd.read_excel(file_path)
    if file_extension == 'csv':
        return pd.read_csv(file_path, sep=separator, encoding=encoding)
    raise InvalidExtension(file_extension)

def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as an 'X.XX MB' string.

    A DataFrame's per-column usage is summed; any other object is assumed to
    be a Series, whose ``memory_usage`` already yields a single number.
    """
    total_bytes = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pd.DataFrame):
        # For a frame the call above gives one value per column (plus index).
        total_bytes = total_bytes.sum()
    # Bytes -> mebibytes, two decimal places.
    return f"{total_bytes / 1024 ** 2:03.2f} MB"

def frame_memory_worker(mw_df):
    """Shrink a DataFrame's memory footprint by downcasting its columns.

    The frame is modified in place and also returned:

    * ``float64`` columns become ``float32`` (this reduces precision);
    * ``int64`` columns become ``int32`` only when every value fits the
      int32 range, otherwise they are left untouched;
    * ``object`` columns become ``category`` when fewer than half of their
      values are distinct.

    Parameters
    ----------
    mw_df : pandas.DataFrame
        Frame to optimise; mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same ``mw_df`` object.
    """
    for float64_column in mw_df.select_dtypes(include="float64").columns:
        mw_df[float64_column] = mw_df[float64_column].astype('float32')

    int32_limits = np.iinfo(np.int32)
    for int64_column in mw_df.select_dtypes(include="int64").columns:
        column = mw_df[int64_column]
        # Downcast only if no element falls outside the int32 range;
        # a blind cast would silently corrupt larger values.
        if column.between(int32_limits.min, int32_limits.max).all():
            mw_df[int64_column] = column.astype('int32')

    for object_column in mw_df.select_dtypes(include="object").columns:
        column = mw_df[object_column]
        num_total_values = len(column)
        if num_total_values == 0:
            # Nothing to measure; also avoids dividing by zero below.
            continue
        num_unique_values = len(column.unique())
        if num_unique_values / num_total_values < 0.5:
            mw_df[object_column] = column.astype("category")

    return mw_df

def outlier_worker(outlier_df, search_column):
    """Drop rows whose value in ``search_column`` is an IQR outlier.

    A row is removed when its value is at or beyond the fences
    ``Q1 - 1.5 * IQR`` / ``Q3 + 1.5 * IQR`` (the comparison is inclusive).
    Rows are removed in place and the same frame object is returned.

    Parameters
    ----------
    outlier_df : pandas.DataFrame
        Frame to filter; mutated in place.
    search_column : hashable
        Label of the numeric column to inspect.

    Returns
    -------
    pandas.DataFrame
        The same ``outlier_df`` object with outlier rows removed.

    Notes
    -----
    Rows are dropped by index label, so every row sharing a label with an
    outlier is removed when the index contains duplicates. Because the
    fences are inclusive, a column whose IQR is zero loses every row equal
    to its quartile value.
    """
    values = outlier_df[search_column]

    # First and third quartiles and the interquartile range.
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # Lower and upper fences.
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    # Translate the boolean mask into index *labels*: drop() works on labels,
    # so raw positions are only correct for a default 0..n-1 index. Using one
    # combined mask also avoids dropping the same label twice.
    is_outlier = ((values >= upper) | (values <= lower)).to_numpy()
    outlier_df.drop(index=outlier_df.index[is_outlier], inplace=True)
    return outlier_df

In [None]:

# Numeric branch: impute with strategy='median', then scale.
# NOTE(review): SimpleImputer, StandardScaler, Pipeline and OneHotEncoder are
# not imported in the visible cells and features_train is not defined in them —
# presumably these are scikit-learn classes and a train split created
# elsewhere; confirm, otherwise this cell fails on a fresh kernel.
simple_imputer = SimpleImputer(strategy='median')
std_scaler = StandardScaler()
pipe_num = Pipeline([('imputer', simple_imputer), ('scaler', std_scaler)])

# Fit and apply the numeric pipeline to every column except 'ocean_proximity'.
res_num = pipe_num.fit_transform(features_train.drop(['ocean_proximity'], axis=1))

# Categorical branch: fill gaps with the constant 'unknown', then encode;
# handle_unknown='ignore' is passed so unseen categories do not raise.
# NOTE(review): if OneHotEncoder is scikit-learn's, newer releases renamed the
# `sparse` argument to `sparse_output` — confirm against the installed version.
s_imputer = SimpleImputer(strategy='constant', fill_value='unknown')
ohe_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
pipe_cat = Pipeline([('imputer', s_imputer), ('encoder', ohe_encoder)])

# Fit and apply the categorical pipeline to the 'ocean_proximity' column only.
res_cat = pipe_cat.fit_transform(features_train[['ocean_proximity']])

# Wrap the encoded output in a DataFrame labelled with the pipeline's output
# feature names, then display it as the cell result.
# NOTE(review): no index is passed, so the frame presumably gets a fresh
# default index rather than features_train's — verify before joining back.
res_df_cat = pd.DataFrame(res_cat, columns=pipe_cat.get_feature_names_out())
res_df_cat



In [None]:
# Route columns by dtype: non-object columns go through pipe_num, object
# columns go through pipe_cat.
# NOTE(review): the column lists are taken from `features`, while the cell
# above fits on `features_train`; neither is defined in the visible cells —
# confirm both exist and share the same columns. ColumnTransformer is likewise
# not imported in the visible cells.
col_transformer = ColumnTransformer([('num_preproc', pipe_num, [x for x in features.columns if features[x].dtype!='object']),
                                     ('cat_preproc', pipe_cat, [x for x in features.columns if features[x].dtype=='object'])])

In [None]:
# Estimator with default settings.
# NOTE(review): Ridge is not imported in the visible cells — presumably the
# scikit-learn linear model; confirm the import exists.
model = Ridge()

# End-to-end pipeline: column-wise preprocessing followed by the model.
# The pipeline is only constructed here; nothing is fitted in this cell.
final_pipe = Pipeline([('preproc', col_transformer),
                       ('model', model)])

In [4]:
# Load 'battles.csv' from the working directory; ',' overrides read_file's
# default ';' separator.
df = read_file('battles.csv',',')

In [10]:
# Show the deep memory footprint of the loaded frame, formatted in MB.
mem_usage(df)

'0.02 MB'