In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

#### Configuring pandas to display a large amount of data

In [None]:
# Let wide/long frames render up to 100 rows and 100 columns before truncating.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

#### Reading data

In [None]:
# Directory that holds the Kaggle CSV files. Set the HOUSE_PRICES_DIR environment
# variable to point somewhere else; the fallback is the original local folder.
path = os.environ.get(
    "HOUSE_PRICES_DIR",
    r"C:\Users\user\PycharmProjects\Kaggle\House Prices - Advanced Regression Techniques",
)

# os.path.join picks the right separator for the current platform.
sample_submission = pd.read_csv(os.path.join(path, "sample_submission.csv"))
train = pd.read_csv(os.path.join(path, "train.csv"), index_col="Id")
test = pd.read_csv(os.path.join(path, "test.csv"), index_col="Id")

# EDA

#### Familiarization with the data

In [None]:
# Print the columns, their dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Descriptive statistics of the training frame (pandas `describe` defaults).
train.describe()

#### Evaluation of the distribution of our data

In [None]:
# Histogram of the target, labelled so the figure can be read on its own.
ax = train["SalePrice"].plot(kind="hist", figsize=(10, 5), bins=100, color="red", alpha=0.5)
ax.set(title="Distribution of SalePrice", xlabel="SalePrice", ylabel="Number of houses");

#### Distribution of the differences between consecutive SalePrice values

In [None]:
# Histogram of the difference between each SalePrice and the one in the previous row.
ax = train["SalePrice"].diff().plot(kind="hist", figsize=(10, 5), bins=100, color="blue", alpha=0.5)
ax.set(
    title="Distribution of row-to-row differences in SalePrice",
    xlabel="SalePrice difference between consecutive rows",
    ylabel="Number of rows",
);

#### Functions for EDA

In [None]:
def corelation(data, column_name="SalePrice", percent=0.7, maps=False):
    """Correlate the numeric columns of ``data`` and report them against one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to analyse. Only numeric and boolean columns take part, so frames
        that still contain text columns do not make ``corr`` fail.
    column_name : str
        Column whose correlations are reported (must be numeric).
    percent : float
        Lower bound: only correlations strictly greater than this value are kept.
        Negative correlations are therefore never returned for ``percent >= 0``.
    maps : bool
        When true, return the full correlation matrix as a colour-graded Styler
        instead; ``column_name`` and ``percent`` are not used in that case.

    Returns
    -------
    pandas.Series or pandas Styler
        Correlations with ``column_name`` above ``percent`` (the column itself is
        included with value 1), or the styled matrix when ``maps`` is true.
    """
    # Restrict to numeric/bool columns explicitly instead of relying on the
    # version-dependent default of DataFrame.corr for non-numeric data.
    numeric = data.select_dtypes(include=["number", "bool"])
    correlation_map = numeric.corr()

    if maps:
        return correlation_map.style.background_gradient(cmap='Blues')

    above_threshold = correlation_map[column_name] > percent
    return correlation_map.loc[above_threshold, column_name]
    
# Every correlation with SalePrice that is greater than 0.
cor = corelation(train, percent=0)
# Styled correlation matrix (stored here, not displayed by this cell).
cor_map = corelation(train, percent=0.5, maps=True)
cor

In [None]:
def missing_data(data):
    """Summarise missing values per column.

    Returns a frame indexed by the columns of ``data`` with two columns:
    ``total`` (number of missing entries) and ``percent`` (share of rows that
    are missing, in percent), sorted by ``percent`` from highest to lowest.
    """
    nan_counts = data.isna().sum()
    nan_share = nan_counts / len(data) * 100
    summary = pd.DataFrame({'total': nan_counts, 'percent': nan_share})
    return summary.sort_values(by="percent", ascending=False)

mising = missing_data(train)
# Columns where more than 20% of the rows are missing.
heavy_losses = mising.loc[mising["percent"] > 20.0]
heavy_losses

In [None]:
def uniques(data):
    """Collect the distinct values of every non-numeric column.

    Returns a dict mapping each non-numeric column name (in frame order) to the
    array produced by ``Series.unique()`` for that column.
    """
    categorical_columns = data.select_dtypes(exclude="number").columns
    return {column: data[column].unique() for column in categorical_columns}
# Distinct values of each non-numeric training column, keyed by column name.
all_uniq = uniques(train)
all_uniq

In [None]:
def hot_cod(data_transform ,column_transform) -> pd.DataFrame:
    """Replace category values with integer codes (ordinal encoding).

    Despite the name this is not one-hot encoding: each value is replaced by its
    position in the sequence given for its column.

    Parameters
    ----------
    data_transform : pandas.DataFrame
        Frame to encode. It is not modified; a copy is returned.
    column_transform : dict
        Maps a column name to a sequence of category values. A missing value
        (NaN/None) in the sequence gives missing cells that position as code.
        Columns named here but absent from the frame are skipped.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the listed columns encoded. Values that do not
        appear in a column's sequence are left unchanged.
    """
    df = data_transform.copy()

    for column, categories in column_transform.items():
        if column not in df.columns:
            continue

        # NaN is unreliable as a dict key, so keep its code separately.
        codes = {}
        nan_code = None
        for code, category in enumerate(categories):
            if pd.isna(category):
                nan_code = code
            else:
                codes[category] = code

        def encode(value, codes=codes, nan_code=nan_code):
            if pd.isna(value):
                return value if nan_code is None else nan_code
            return codes.get(value, value)

        # Mapping one column at a time avoids rewriting the whole frame per
        # column and lets pandas infer an integer dtype for the encoded values.
        df[column] = df[column].astype(object).map(encode)
    return df
# Encoded copy of the full training frame, built from the values collected in all_uniq.
all_data_encoded = hot_cod(train, all_uniq)
all_data_encoded

In [None]:
# Remove the columns flagged in heavy_losses, then show the remaining shape.
train = train.drop(columns=heavy_losses.index)
train.shape

In [None]:
# Collect the category values of the remaining non-numeric columns and encode them.
# NOTE(review): uniques() only looks at non-numeric columns, so re-running this
# cell after `train` has been encoded would presumably leave train_uniq empty;
# train_uniq is needed later to encode the test frame — run once per fresh kernel.
train_uniq = uniques(train)
train = hot_cod(train, train_uniq)
train.dtypes

In [None]:
# Correlations with SalePrice above 0, recomputed on the encoded frame;
# the earlier result `cor` is displayed below it for comparison.
cor_2 = corelation(train,percent= 0)
print(cor_2)
cor

In [None]:
# Keep only the columns listed in cor_2 (the ones that passed the correlation filter).
train = train.loc[:, cor_2.index]
print(train.shape)
train

# Model preparation

In [None]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Separate the target from the features.
y = train["SalePrice"]
X = train.drop(columns="SalePrice")

# Hold out 20% of the rows, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# CatBoost containers for the two parts of the split.
train_dataset = Pool(X_train, y_train)
test_dataset = Pool(X_test, y_test)

# Hyper-parameter values handed to the grid search.
grid = {
    'iterations': [100, 150, 200],
    'learning_rate': [.01, .03, .05, .07, .09],
    'depth': [2, 3, 4, 5],
    'l2_leaf_reg': [0.2, 0.5, 1, 3],
}

model = CatBoostRegressor(loss_function="RMSE")



In [None]:
# Run CatBoost's grid search over `grid` on the full X / y with cv=5.
# The returned dict is read later through its "params" entry.
# NOTE(review): train_size presumably only applies when
# search_by_train_test_split=True — confirm against the CatBoost docs.
sarch_res = model.grid_search(grid, X, y= y,
                 cv= 5,
                 partition_random_seed= 0,
                 search_by_train_test_split = False,
                 refit = True,
                 stratified = False,
                 train_size = .3,
                 verbose = 10
                 )

In [None]:
# Build a new regressor from the parameter set returned by the grid search
# and fit it on every row of X / y.
model_2 = CatBoostRegressor(**sarch_res["params"])
model_2.fit(X,y)

# Predict

In [None]:
# Keep exactly the feature columns the model was trained on. Dropping the target
# by name does not depend on SalePrice being the last label in cor_2, and
# selecting by name already leaves out the columns removed from `train`, so the
# cell can be re-run without a KeyError.
feature_columns = cor_2.index.drop("SalePrice")
# Encode with the category codes collected from the training frame.
test = hot_cod(test[feature_columns], train_uniq)
test

In [None]:
# Predictions of model_2 for the prepared test features.
predict = model_2.predict(test)
predict

In [None]:
# NOTE(review): sample_submission.csv is expected to hold Kaggle's example
# predictions, not the true prices of the test houses — confirm. Scoring against
# it would only measure agreement with that example, so the metrics below use the
# held-out part of the training data, which has real SalePrice labels.
# The parameters were tuned with cross-validation on all rows, so these numbers
# are still slightly optimistic.
holdout_model = CatBoostRegressor(**sarch_res["params"], verbose=False)
holdout_model.fit(X_train, y_train)
holdout_predict = holdout_model.predict(X_test)

rmse = mean_squared_error(y_test, holdout_predict)**.5
r2 = r2_score(y_test, holdout_predict)
print("Testing performance")
print(f"RMSE: {rmse:.2f}")
print(f"R2: {r2:.2f}")

In [None]:
#sample_submission["SalePrice"] = predict
#sample_submission.to_csv(path_or_buf = r"/kaggle/working/'submission_hous_price.csv'", index = False)
#/kaggle/working/haus_price