In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

from preprocessor import Preprocessor
from data_plotter import DataPlotter

from models import BaseLinearModel, LinearModel, Ridge, Lasso

In [None]:
# Absolute path of the directory the kernel is currently running in.
CWD = os.path.abspath(os.curdir)

In [None]:
# Both CSV files are read from the `data` directory located in the parent of CWD.
data_dir = Path(CWD).parent / "data"
train_data_file = data_dir / "train.csv"
test_data_file = data_dir / "test.csv"

train_data = pd.read_csv(train_data_file)
test_data = pd.read_csv(test_data_file)

# One project Preprocessor per split, each wrapping its own frame.
train_preprocessor = Preprocessor(train_data)
test_preprocessor = Preprocessor(test_data)

# Plotting helper shared by the cells below.
data_plotter = DataPlotter()

In [None]:
# Number of missing values per column, computed once and reused for the
# column selection and the chart below.
nan_counts = train_data.isnull().sum()

# A column "contains NaN values" as soon as it has at least one missing entry,
# so the threshold is > 0 (the previous > 1 skipped columns with exactly one NaN).
nan_columns = train_data.columns[nan_counts > 0]

print(f'Columns that contain nan values - {nan_columns.to_list()}')

data_plotter.plot_barchart(data=nan_counts, title="Nan values by columns", xlabel="Number", ylabel="Columns")

In [None]:
# For every column flagged in `nan_columns`, print its distinct values and
# chart how often each value occurs.
for column in nan_columns:
    column_values = train_data[column]
    chart_title = f'Unique values for {column}'

    print(f'Unique values for {column} - {column_values.unique()}')
    data_plotter.plot_barchart(
        data=column_values.value_counts(),
        title=chart_title,
        xlabel="Values",
        ylabel="Number",
        figsize=(10, 6),
    )

In [None]:
# Column groups handed to the two project Preprocessor helpers; what exactly
# gets substituted is defined inside Preprocessor (not visible here).
nan_to_value_columns = ["accident", "clean_title"]
value_to_nan_columns = ["fuel_type"]

train_data = train_preprocessor.replace_nan_with_value(columns=nan_to_value_columns)
train_data = train_preprocessor.replace_value_with_nan(columns=value_to_nan_columns)

# Re-plot the per-column missing-value counts to see the effect of the two steps.
nan_counts_after = train_data.isnull().sum()
data_plotter.plot_barchart(
    data=nan_counts_after,
    title="Nan values by columns after processing",
    xlabel="Number",
    ylabel="Columns",
)

In [None]:
# Project helper; presumably imputes the missing `fuel_type` entries -- TODO confirm
# the fill strategy against Preprocessor.fill_na_values.
fuel_column = "fuel_type"
train_data = train_preprocessor.fill_na_values(column=fuel_column)

In [None]:
# Frequency of each distinct `engine` value, shown as the cell output.
engine_counts = train_data["engine"].value_counts()
engine_counts

In [None]:
# Derive additional columns from the `engine` column via the project Preprocessor
# (presumably including `fuel_type_from_engine`, used further down -- TODO confirm).
engine_column = "engine"
train_data = train_preprocessor.create_new_features(column=engine_column)

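# NOTE: roughly 22% is too large a share to keep, so the affected rows are dropped for now
# (presumably this refers to the fuel-type mismatch rate computed in the next cell -- TODO confirm).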

In [None]:
# Flag rows where `fuel_type` and `fuel_type_from_engine` differ, ignoring case.
declared_fuel = train_data['fuel_type'].str.lower()
engine_fuel = train_data['fuel_type_from_engine'].str.lower()
train_data['fuel_mismatch'] = declared_fuel != engine_fuel

mismatch_count = train_data['fuel_mismatch'].sum()
total_count = len(train_data)

print(f'Mismatched fuel types: {mismatch_count} out of {total_count} ({mismatch_count / total_count:.2%})')

# Print each disagreeing (fuel_type, fuel_type_from_engine) pair, skipping rows
# whose engine-derived value is None. Iterating only the pre-filtered rows avoids
# a positional .iloc lookup per row, and the f-string no longer reuses its own
# quote character, which is a SyntaxError before Python 3.12.
mismatched_rows = train_data.loc[train_data['fuel_mismatch'], ['fuel_type', 'fuel_type_from_engine']]
for declared, from_engine in zip(mismatched_rows['fuel_type'], mismatched_rows['fuel_type_from_engine']):
    if from_engine is not None:
        print(f'{declared}, {from_engine}')

In [None]:
# Keep only the rows not flagged as a mismatch, renumber the index, drop the
# helper flag, then overwrite `fuel_type` with `fuel_type_from_engine` and
# remove the now-redundant source column.
train_data = (
    train_data.loc[~train_data["fuel_mismatch"]]
    .reset_index(drop=True)
    .drop(columns=["fuel_mismatch"])
    .assign(fuel_type=lambda frame: frame["fuel_type_from_engine"])
    .drop(columns=["fuel_type_from_engine"])
)

In [None]:
# Drop every column that still contains at least one missing value.
train_data = train_data.dropna(axis=1)

# Only the last expression of a cell is rendered, so the null counts are printed
# explicitly; as a bare expression in the middle of the cell they were discarded.
print(train_data.isnull().sum())

# Column dtypes, shown as the cell output.
train_data.dtypes

In [None]:
# Columns whose distributions are plotted by the project DataPlotter.
distribution_columns = ["price", "milage", "model_year"]
data_plotter.plot_numerical_distribution(data=train_data, columns=distribution_columns)

In [None]:
# Columns with at most this many distinct values get the "one_hot" mode;
# all other non-int64 columns get the "default" mode.
ONE_HOT_MAX_UNIQUE = 10

# Pick a mode for every column that is not already int64. What each mode does
# is defined by Preprocessor.transform_ctg_to_num (not visible here).
ctg_modes = {
    column: "one_hot" if len(train_data[column].unique()) <= ONE_HOT_MAX_UNIQUE else "default"
    for column in train_data.columns
    if train_data[column].dtype != "int64"
}

# A fresh Preprocessor is built because train_data was modified outside the
# original `train_preprocessor` in the cells above.
train_data = Preprocessor(train_data).transform_ctg_to_num(categories=ctg_modes)

data_plotter.default_configs.color = "red"

# Names of the int64 columns after the transformation. Not used in this cell;
# kept for any later cell that reads `columns`.
columns = [column for column in train_data.columns if train_data[column].dtype == "int64"]

# Preview of the transformed frame (cell output). The earlier mid-cell
# `train_data.head()` was removed because its result was never displayed.
train_data.head()