In [88]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

#regresja liniowa
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor

In [89]:
# Load the raw train and test splits of the rent dataset from the local
# "raw data" directory (paths are relative to the notebook's working dir).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "raw data/pzn-rent-train.csv",
        "raw data/pzn-rent-test.csv",
    )
)

In [90]:
# Split the training columns by dtype: float64/int64 columns are treated as
# numerical, object columns as categorical.
numerical_columns = list(
    train_data.select_dtypes(include=['float64', 'int64']).columns
)
categorical_columns = list(
    train_data.select_dtypes(include=['object']).columns
)

# 'price' is the prediction target (and is not imputed together with the
# features), so take it out of the numerical feature list.
numerical_columns.remove('price')

In [91]:
# Numerical features: fill missing values with the per-column median.
# The imputer is fitted on the training set only and then applied to both
# frames, so the test set is filled with training-set statistics.
numerical_imputer = SimpleImputer(strategy="median")
numerical_imputer.fit(train_data[numerical_columns])
train_data[numerical_columns] = numerical_imputer.transform(train_data[numerical_columns])
test_data[numerical_columns] = numerical_imputer.transform(test_data[numerical_columns])

# Categorical (object-dtype) features: fill missing values with the most
# frequent value of each column, again learned from the training set only.
categorical_imputer = SimpleImputer(strategy="most_frequent")
categorical_imputer.fit(train_data[categorical_columns])
train_data[categorical_columns] = categorical_imputer.transform(train_data[categorical_columns])
test_data[categorical_columns] = categorical_imputer.transform(test_data[categorical_columns])

In [92]:
# Replace the three raw date columns with two day-count features.
# The same steps are applied, in the same order, to the train and test frames.
date_columns = ['date_activ', 'date_modif', 'date_expire']

for frame in (train_data, test_data):
    # Parse the date columns into datetime values.
    for date_col in date_columns:
        frame[date_col] = pd.to_datetime(frame[date_col])

    # Whole days between activation and last modification.
    frame['active_duration'] = (frame['date_modif'] - frame['date_activ']).dt.days
    # Whole days between last modification and expiry.
    frame['remaining_days'] = (frame['date_expire'] - frame['date_modif']).dt.days

    # The raw dates are no longer needed once the durations exist.
    frame.drop(date_columns, axis=1, inplace=True)

In [93]:
# Encode categorical variables
categorical_columns = ['individual', 'flat_furnished', 'flat_for_students', 'flat_balcony',
                       'flat_garage', 'flat_basement', 'flat_garden', 'flat_tarrace', 'flat_lift',
                       'flat_two_level', 'flat_kitchen_sep', 'flat_air_cond', 'flat_nonsmokers',
                       'flat_washmachine', 'flat_dishwasher', 'flat_internet', 'flat_television',
                       'flat_anti_blinds', 'flat_monitoring', 'flat_closed_area', 'quarter']

# One-hot encode 'quarter'.
#
# The set of levels is fixed from the TRAINING data and imposed on both frames
# before calling get_dummies. Encoding each frame independently with
# drop_first=True drops "the first level present in that frame", so if the
# test set happened to lack the first training level, a different baseline
# would be dropped there and the later zero-filling of missing columns would
# silently mislabel those rows as the training baseline.
# With a shared categorical dtype both frames get exactly the same dummy
# columns and the same dropped baseline. A quarter that appears only in the
# test set becomes NaN in the categorical and is encoded as all zeros.
quarter_levels = sorted(train_data['quarter'].dropna().unique())
train_data['quarter'] = pd.Categorical(train_data['quarter'], categories=quarter_levels)
test_data['quarter'] = pd.Categorical(test_data['quarter'], categories=quarter_levels)

train_data = pd.get_dummies(train_data, columns=['quarter'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['quarter'], drop_first=True)

# Label-encode the remaining categorical columns. The encoder is re-fitted on
# each training column and the resulting mapping is reused for the test column,
# so both frames share the same integer codes.
encoder = LabelEncoder()

for col in categorical_columns:
    if col != 'quarter':
        train_data[col] = encoder.fit_transform(train_data[col])
        test_data[col] = encoder.transform(test_data[col])

In [94]:
# Snapshot the column sets of both frames before any alignment happens.
train_columns = train_data.columns
test_columns = test_data.columns

# Columns present in exactly one of the two frames, in their original order.
missing_in_test = [name for name in train_columns if name not in test_columns]
missing_in_train = [name for name in test_columns if name not in train_columns]

# Add the columns missing from the test set, filled with zeros.
for column in missing_in_test:
    test_data[column] = 0

# Add the columns missing from the training set, filled with zeros.
for column in missing_in_train:
    train_data[column] = 0

In [95]:
# X = train_data.drop(columns=["id", "ad_title", "price"])
# y = train_data["price"]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# model = RandomForestRegressor(n_estimators=1000, random_state=42)

# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)

# mse = mean_squared_error(y_test, y_pred)
# print(f'Mean Squared Error: {mse}')

In [96]:
# #Dla zbioru testowego
# X_test = test_data.drop(columns=["id", "ad_title"])
# # Upewnij się, że kolumny w X_test są w tej samej kolejności co w X_train
# X_test = X_test[X_train.columns]

# y_test_pred = model.predict(X_test)

In [97]:
# submission = pd.DataFrame({
#     'ID': test_data.index + 1,
#     'TARGET': y_test_pred
# })
# print(submission.head())
# submission.to_csv('pzn-solution.csv', index=False)

In [None]:
# Libraries needed for this cell
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Features are every column except the identifier, the free-text ad title and
# the target; the target is the rent price.
y = train_data["price"]
X = train_data.drop(columns=["id", "ad_title", "price"])

# Hold out 20% of the training data for validation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Gradient-boosted tree regressor: many trees with a small learning rate,
# with row and column subsampling applied per tree.
model_xgb = xgb.XGBRegressor(
    n_estimators=3000,
    learning_rate=0.005,
    max_depth=7,
    min_child_weight=5,
    subsample=0.7,
    colsample_bytree=0.7,
    gamma=0.1,
    random_state=42,
)

# Fit on the training split
model_xgb.fit(X_train, y_train)

# Predict on the held-out validation split
y_pred_xgb = model_xgb.predict(X_test)

# Report the validation mean squared error
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f'Mean Squared Error XGBoost Regressor: {mse_xgb}')


Mean Squared Error XGBoost Regressor: 87743.15190173253


In [111]:
# Build the final test feature matrix: drop the non-feature columns, then
# select the columns in exactly the order the model was trained on.
X_test_final = test_data.drop(columns=["id", "ad_title"])[X_train.columns]

# Predict prices for the real test set
y_test_predictions = model_xgb.predict(X_test_final)

# Assemble the submission: IDs are the 1-based row positions of the test frame.
submission_ids = test_data.index + 1
submission = pd.DataFrame({'ID': submission_ids, 'TARGET': y_test_predictions})

print(submission.head())
submission.to_csv('pzn-solution2.csv', index=False)


   ID       TARGET
0   1  1883.428223
1   2  1222.000122
2   3  1338.853394
3   4  1699.945435
4   5  3130.990967
