In [1]:
import pandas as pd
import numpy as np
import dill

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from datetime import datetime


In [10]:
def filter_data(df):
        df_copy = df.copy()
        columns_to_drop = [
                'id',
                'url',
                'region',
                'region_url',
                'price',
                'manufacturer',
                'image_url',
                'description',
                'posting_date',
                'lat',
                'long'
            ]
        return df_copy.drop(columns=columns_to_drop, axis=1)



def remove_outliers(df):
    df_copy = df.copy()
    def calculate_outliers(data):
        Q1 = np.quantile(data, 0.25)
        Q3 = np.quantile(data, 0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
    return lower_bound, upper_bound
    
    df = df.copy()
    
    lower_bound, upper_bound = calculate_outliers(df)
    df[df < lower_bound] = lower_bound
    df[df > upper_bound] = upper_bound
    return df_copy.round()


def create_features(df):
    df_copy = df.copy()
    def short_model(x):
        if not pd.isna(x):
            return x.lower().split(' ')[0]
        else:
            return x
    df=df.copy()
    df['short_model'] = df['model'].apply(short_model)
    df['age_category'] = df['year'].apply(lambda x: 'new' if x > 2013 else ('old' if x < 2006 else 'average'))
    df.drop('model', axis=1, inplace=True)
    return df_copy
    
    
df = pd.read_csv('homework.csv')
df = pd.DataFrame(df)

# filtering = Pipeline(steps=[
#     ('filter_data', FunctionTransformer(filter_data)),
#     ('create_features', FunctionTransformer(create_features))
#     ])
    
X = df.drop('price_category', axis=1)
y = df['price_category']

    

numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns


numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('outliers', FunctionTransformer(remove_outliers)),
        ('scaler', StandardScaler())
    ])

categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

preprocessor = ColumnTransformer(transformers=[
        ('numerical', numerical_transformer, numerical_features),
        ('categorical', categorical_transformer, categorical_features)
    ])
    
    
models = (
        LogisticRegression(random_state=42, max_iter=1000),
        RandomForestClassifier(random_state=42, n_estimators=1000),
        MLPClassifier(random_state=42, max_iter=1000)
    )
    
best_score = .0
best_pipe = None
for model in models:
    pipe = Pipeline(steps=[
#             ('filter', filtering),
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
    score = cross_val_score(pipe, X, y, cv=4, scoring='accuracy', error_score='raise')
    print(f'model: {type(model).__name__}, acc_mean: {score.mean():.4f}, acc_std: {score.std():.4f}')

    if score.mean() > best_score:
            best_score = score.mean()
            best_pipe = pipe

print(f'best model: {type(best_pipe.named_steps["classifier"]).__name__}, accuracy: {best_score:.4f}')
    
metadata = {
        'name': 'Car price prediction model',
        'author': 'Akty',
        'version': 1,
        'date': datetime.now(),
        'type': type(best_pipe.named_steps["classifier"]).__name__,
        'accuracy': best_score
    }

ValueError: A given column is not a column of the dataframe