In [281]:
import pandas as pd
from pathlib import Path

In [282]:
def get_data(type_data: str) -> pd.DataFrame:
    file_path = Path().parent / f'data/{type_data}.csv'

    return pd.read_csv(file_path)

In [283]:
data = get_data("train")

In [284]:
def to_numerical(column: pd.Series) -> pd.Series:
    unique_values = column.dropna().unique()  
    mapping = {value: idx for idx, value in enumerate(unique_values, 1)}

    return column.map(mapping)

In [285]:
def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    categorical_columns = [
        "has_pool",
        "orientation",
        "is_furnished",
        "accepts_pets",
        "has_ac",
        "neighborhood"
    ]

    for column in categorical_columns:
        data[column] = to_numerical(column=data[column])

    data["door_a"], data["door_b"] = zip(*data["door"].apply(lambda x: x.split(" - ") if isinstance(x, str) else (float("nan"), float("nan"))))

    data["door_a"] = to_numerical(data["door_a"])
    data["door_b"] = to_numerical(data["door_b"])

    data = data.drop(columns=["door"])

    interpolated_data = data.interpolate(method="linear").ffill().bfill()
    interpolated_data.isna().sum()

    print(interpolated_data.isna().sum())

    return interpolated_data


In [286]:
from sklearn.linear_model import LinearRegression

data = preprocess_data(data=data)

X = data.drop(columns=["price"])
y = data["price"]

model = LinearRegression()

model.fit(X, y)

coefficients = model.coef_
intercept = model.intercept_

coefficients, intercept

id                  0
num_rooms           0
num_baths           0
square_meters       0
orientation         0
year_built          0
is_furnished        0
has_pool            0
neighborhood        0
num_crimes          0
has_ac              0
accepts_pets        0
num_supermarkets    0
price               0
door_a              0
door_b              0
dtype: int64


(array([ 7.62339154e-05,  8.23437103e-01,  9.49561665e+00,  4.44750574e+00,
        -6.73845355e-01, -3.25014854e-02,  2.76519964e+00,  5.52186904e+00,
        -2.75706423e-02, -2.19765842e+01, -5.64376455e+00, -5.28048194e-02,
         1.26235237e+00,  3.30838565e+00, -1.99114694e+00]),
 705.8186574391548)

In [287]:
from sklearn.metrics import mean_squared_error

test_data = get_data("test")
print(len(test_data))
test_data = preprocess_data(data=test_data)
print(len(test_data))

y_pred = model.predict(test_data)

test_predictions_submit = pd.DataFrame({"id": test_data["id"], "price": y_pred})
test_predictions_submit.to_csv("test_predictions_submit.csv", index = False)

2000
id                  0
num_rooms           0
num_baths           0
square_meters       0
orientation         0
year_built          0
is_furnished        0
has_pool            0
neighborhood        0
num_crimes          0
has_ac              0
accepts_pets        0
num_supermarkets    0
door_a              0
door_b              0
dtype: int64
2000
