In [1]:
# Use the explicit %pip magic so the package is installed into the environment
# of the running kernel, without relying on IPython's automagic being enabled.
%pip install pytest

Note: you may need to restart the kernel to use updated packages.


In [9]:
# model.py
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pickle

def load_data(filepath):
    """Read the CSV file at ``filepath`` and return it as a pandas DataFrame."""
    return pd.read_csv(filepath)

def preprocess_data(data):
    """Clean the numeric columns and label-encode the categorical columns.

    The input frame is NOT modified; all work happens on a copy, so the
    function can safely be called more than once on the same raw frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Year', 'Capacity (cc)', 'Mileage (Km)',
        'Price (Rs)', 'Brand', 'Model', 'Condition', 'Transmission' and
        'Fuel'; a missing column raises ``KeyError``.

    Returns
    -------
    tuple
        ``(processed, label_encoders)`` where ``processed`` is the cleaned
        copy of ``data`` and ``label_encoders`` maps each categorical column
        name to the ``LabelEncoder`` that was fitted on it.
    """
    def convert_to_numeric(value):
        # Strip every character except digits and '.', then parse. Anything
        # that still cannot be parsed (including an empty string) becomes NaN.
        # Note that a leading minus sign is stripped as well.
        cleaned = re.sub(r'[^\d.]', '', str(value))
        return pd.to_numeric(cleaned, errors='coerce')

    # Work on a copy so the caller's DataFrame keeps its raw values. Mutating
    # it in place would make a second call fit the encoders on already-encoded
    # integers and lose the original category labels.
    data = data.copy()

    numeric_columns = ['Year', 'Capacity (cc)', 'Mileage (Km)', 'Price (Rs)']
    for column in numeric_columns:
        data[column] = data[column].apply(convert_to_numeric)

    label_encoders = {}
    categorical_columns = ['Brand', 'Model', 'Condition', 'Transmission', 'Fuel']

    for column in categorical_columns:
        # One encoder per column, returned so callers can reuse the mapping.
        encoder = LabelEncoder()
        data[column] = encoder.fit_transform(data[column])
        label_encoders[column] = encoder

    return data, label_encoders

def split_data(data, target_column='Price (Rs)', test_size=0.2, random_state=42):
    """Split ``data`` into train/test features and targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns and ``target_column``.
    target_column : str, optional
        Name of the column to predict. Defaults to 'Price (Rs)'.
    test_size : float or int, optional
        Passed straight to ``train_test_split``. Defaults to 0.2.
    random_state : int or None, optional
        Seed passed to ``train_test_split``. Defaults to 42.

    Returns
    -------
    list
        ``[x_train, x_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    features = data.drop(columns=[target_column])
    target = data[target_column]
    return train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

def train_model(x_train, y_train):
    """Fit a ``RandomForestRegressor`` (seeded with 42) and return it."""
    # ``fit`` returns the estimator itself, so construction and fitting can be chained.
    return RandomForestRegressor(random_state=42).fit(x_train, y_train)

def evaluate_model(model, x_test, y_test):
    """Score ``model`` on the held-out data and return the metrics as a dict.

    The returned dict has the keys:
      - "mae": mean absolute error of the predictions
      - "mse": mean squared error of the predictions
      - "rmse": square root of "mse"
      - "r2": the value of ``model.score(x_test, y_test)``
      - "accuracy_percentage": ``100 * (1 - mae / mean(y_test))``
    """
    predictions = model.predict(x_test)

    metrics = {"mae": mean_absolute_error(y_test, predictions)}
    metrics["mse"] = mean_squared_error(y_test, predictions)
    metrics["rmse"] = metrics["mse"] ** 0.5
    metrics["r2"] = model.score(x_test, y_test)
    # MAE expressed relative to the mean target value, as a percentage.
    metrics["accuracy_percentage"] = 100 * (1 - (metrics["mae"] / y_test.mean()))

    return metrics

def save_model(model, filename):
    """Pickle ``model`` to the file at ``filename``, overwriting any existing file."""
    with open(filename, 'wb') as handle:
        pickle.dump(model, handle)

def load_model(filename):
    """Unpickle and return the object stored in the file at ``filename``.

    WARNING: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(filename, 'rb') as handle:
        model = pickle.load(handle)
    return model


In [11]:
# test_model.py
import pytest
import pandas as pd
from model import load_data, preprocess_data, split_data, train_model, evaluate_model

def test_load_data():
    """The CSV loads into a non-empty DataFrame."""
    frame = load_data('srilanka_vehicle_data.csv')
    assert isinstance(frame, pd.DataFrame)
    assert not frame.empty

def test_preprocess_data():
    """Preprocessing keeps the expected columns and makes 'Year' numeric."""
    raw = load_data('srilanka_vehicle_data.csv')
    processed, _encoders = preprocess_data(raw)
    for expected_column in ('Brand', 'Price (Rs)'):
        assert expected_column in processed.columns
    assert pd.api.types.is_numeric_dtype(processed['Year'])

def test_split_data():
    """Each of the four pieces returned by the split has at least one row."""
    raw = load_data('srilanka_vehicle_data.csv')
    processed, _encoders = preprocess_data(raw)
    x_train, x_test, y_train, y_test = split_data(processed)
    for part in (x_train, x_test, y_train, y_test):
        assert part.shape[0] > 0

def test_train_model():
    """Training on the train split returns a model object."""
    raw = load_data('srilanka_vehicle_data.csv')
    processed, _encoders = preprocess_data(raw)
    # Only the training half of the split is needed here.
    x_train, _x_test, y_train, _y_test = split_data(processed)
    fitted = train_model(x_train, y_train)
    assert fitted is not None

def test_evaluate_model():
    """Evaluation reports every expected metric key."""
    raw = load_data('srilanka_vehicle_data.csv')
    processed, _encoders = preprocess_data(raw)
    x_train, x_test, y_train, y_test = split_data(processed)
    fitted = train_model(x_train, y_train)
    metrics = evaluate_model(fitted, x_test, y_test)
    for key in ('mae', 'mse', 'rmse', 'r2', 'accuracy_percentage'):
        assert key in metrics


ModuleNotFoundError: No module named 'model'