<a href="https://colab.research.google.com/github/Datascientisit/ML_regression/blob/main/Tash_house_value_predicting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ustunlar ta'rifi
- `location` - sotilayotgan uy manzili
- `district` - uy joylashgan tuman
- `rooms` - xonalar soni
- `size` - uy maydoni (kv.m)
- `level` - uy joylashgan qavat
- `max_levels` - jami qavatlar soni
- `price` - uy narxi

## Vazifani CRISP-DM metodologiyasi yordamida bajaring.
<img src="https://i.imgur.com/dzZnnYi.png" alt="CRISP-DM" width="800"/>

*Biz uchun zarur kutubxonalar*

In [None]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Remote CSV holding the housing dataset (file name dated 08-02-2021).
DATA_URL = 'https://raw.githubusercontent.com/anvarnarz/praktikum_datasets/main/housing_data_08-02-2021.csv'

df = pd.read_csv(DATA_URL)
df.head()

In [None]:
# Summary statistics for the columns pandas parsed as numeric on load.
df.describe()

In [None]:
# Column dtypes and non-null counts before any cleaning.
df.info()

In [None]:
# Coerce columns that may contain non-numeric entries; unparseable values
# become NaN and are imputed in the next step.
# `downcast` is intentionally not used: it makes the resulting dtype depend on
# the data (int8/int16/int32 when every value happens to be a small integer),
# and such columns would be silently skipped by the later
# select_dtypes(include=['int64', 'float64']) feature selection.
for col in ('price', 'size'):
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [None]:
# Replace missing values with the column mean.
# Assign the result back instead of calling fillna(..., inplace=True) on
# df[col]: that form operates on a temporary Series, emits a FutureWarning in
# pandas 2.x and leaves `df` unchanged once Copy-on-Write is enabled.
# NOTE(review): `price` is the prediction target and the means are computed
# before the train/test split - consider dropping rows without a price instead.
df['size'] = df['size'].fillna(df['size'].mean())
df['price'] = df['price'].fillna(df['price'].mean())

In [None]:
# Re-check dtypes and non-null counts after coercion and imputation.
df.info()

In [None]:
# Bucket prices into five quantile bins; the bins drive the stratified split.
price_bins = pd.qcut(df['price'], q=5)
df['price_cat'] = price_bins
df['price_cat'].value_counts()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(df, df['price_cat']))
strat_train_set = df.iloc[train_index]
strat_test_set = df.iloc[test_index]

In [None]:
# `price_cat` was only needed to stratify the split.
# Reassign instead of dropping in place: both sets are iloc slices of `df`, so
# an in-place drop triggers SettingWithCopyWarning, and errors='ignore' keeps
# the cell safe to re-run after the column is already gone.
strat_train_set = strat_train_set.drop(columns='price_cat', errors='ignore')
strat_test_set = strat_test_set.drop(columns='price_cat', errors='ignore')

In [None]:
# Work on an independent copy of the training split and keep the target aside.
df_train = strat_train_set.copy()
df_train_labels = df_train['price'].copy()

# Price against district.
df_train.plot.scatter(x='price', y='district', figsize=(7, 5))
plt.show()

In [None]:
# Price against size; marker area encodes the building's total number of
# floors and colour encodes price.
df_train.plot(
    kind='scatter',
    x='price',
    y='size',
    alpha=1,
    # Dividing a floor count by 50 gives marker areas below 1 pt^2 for any
    # building under 50 floors, i.e. practically invisible points; scale up.
    s=df_train['max_levels'] * 4,
    label='max_levels',  # legend label previously misspelled as 'max_lebels'
    figsize=(7, 5),
    c='price',
    cmap=plt.get_cmap('jet'),
    colorbar=True,
)
plt.legend()
plt.show()

In [None]:
# Keep only the int64/float64 columns of the training frame.
# Note: `price` has not been removed from df_train, so it is part of df_num.
df_num = df_train.select_dtypes(include=['int64', 'float64'])
df_num.head()

In [None]:
# Pairwise scatter plots / distributions of the numeric training columns.
sns.pairplot(df_num, height=5)
plt.show()

In [None]:
# Keep only the object-dtype (string) columns of the training frame.
df_obj = df_train.select_dtypes(include='object')
df_obj.head()

In [None]:
# Split every address on the city prefix; each cell becomes a list of parts.
CITY_PREFIX = 'город Ташкент, '
df_obj['location'] = df_obj['location'].map(lambda address: address.split(CITY_PREFIX))
df_obj['location']

In [None]:
def removeEl(lst):
    """Return a new list containing every element of ``lst`` except the first.

    The argument itself is not modified, so calling this repeatedly on the
    same object does not keep eating elements. An empty list yields an empty
    list instead of raising ``IndexError``.
    """
    return lst[1:]

# Strip the leading element from each split-address list via removeEl.
df_obj['location'] = df_obj['location'].map(removeEl)
df_obj['location']

In [None]:
# `location` is not used as a model feature. Reassigning with errors='ignore'
# (instead of an in-place drop) lets the cell be re-run without a KeyError.
df_obj = df_obj.drop(columns='location', errors='ignore')

In [None]:
# Number of training rows per district.
df_obj['district'].value_counts()

In [None]:
# One-hot encode the district column: one indicator column per district value.
df_encoded = pd.get_dummies(df_obj['district'])
df_encoded

In [None]:
# Turn the indicator columns into 0/1 integers.
# astype is explicit about the target dtype, safe to re-run, and avoids the
# silent-downcasting FutureWarning that replace({True: 1, False: 0}) raises on
# recent pandas.
df_encoded = df_encoded.astype('int64')
df_encoded

In [None]:
# Count the positions at which both frames carry the same index label.
(df_num.index == df_encoded.index).sum()

In [None]:
# Assemble the training feature matrix: numeric columns + one-hot districts.
# `price` is the prediction target and is still present in df_num; it must be
# excluded here, otherwise the model is trained with the label as an input
# (target leakage) while the test matrix can only supply a constant for it.
df_tr = pd.concat(
    [df_num.drop(columns='price', errors='ignore'), df_encoded],
    axis=1,
)
df_tr

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn per-column mean and variance from the training features, then
# standardise those same features.
scaler = StandardScaler()
scaler.fit(df_tr)
df_str = scaler.transform(df_tr)
df_str

**ML model**

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline fitted on the standardised training matrix.
LR_model = LinearRegression().fit(df_str, df_train_labels)
LR_model

In [None]:
# Separate the held-out target from the held-out features.
test_labels = strat_test_set['price'].copy()
test_data = strat_test_set.drop(columns='price')

In [None]:
# One-hot encode the test districts as 0/1 integers.
# astype replaces the former in-place replace({True: 1, False: 0}), which
# raises a silent-downcasting FutureWarning on recent pandas.
test_data_encoded = pd.get_dummies(test_data['district']).astype('int64')

In [None]:
# Numeric (int64/float64) columns of the test features; `price` was already
# dropped from test_data, so it is not among them.
test_data_num = test_data.select_dtypes(include=['int64', 'float64'])
test_data_num

In [None]:
# Test feature frame: numeric columns side by side with the one-hot districts.
test_data_tr = pd.concat([test_data_num, test_data_encoded], axis=1)
test_data_tr

In [None]:
# Give the test frame exactly the training columns, in training order:
# columns missing from the test frame are created and filled with 0, and
# columns that never appeared in training are discarded.
test_data_tr = test_data_tr.reindex(columns=df_tr.columns, fill_value=0)

In [None]:
# Standardise the test features with the statistics learned on the training
# set. Use transform(), not fit_transform(): refitting on the test data would
# put the two sets on different scales and leak test information.
# Note: this rebinds test_data_tr from a DataFrame to a NumPy array, so the
# cell must not be run twice without re-running the previous one.
test_data_tr = scaler.transform(test_data_tr)
test_data_tr

In [None]:
# Linear-regression predictions for the held-out rows.
test_predictions = LR_model.predict(test_data_tr)

In [None]:
# Actual prices and linear-model predictions side by side.
pd.DataFrame({'price': test_labels, 'predictions': test_predictions})

In [None]:
from sklearn.metrics import mean_squared_error
# Root-mean-squared error of the linear model on the test set.
lin_mse = mean_squared_error(y_true=test_labels, y_pred=test_predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

In [None]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the linear model on the test set.
lin_mae = mean_absolute_error(y_true=test_labels, y_pred=test_predictions)
print(lin_mae)

Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest on the same standardised training matrix.
# A fixed random_state (same seed as the train/test split) makes the fitted
# forest and every metric derived from it reproducible across runs.
RF_model = RandomForestRegressor(random_state=42)
RF_model.fit(df_str, df_train_labels)

In [None]:
# Random-forest predictions for the held-out rows.
predictions = RF_model.predict(test_data_tr)

In [None]:
from sklearn.metrics import mean_squared_error
# Root-mean-squared error of the random forest on the test set.
# Stored under rf_* names so the linear-regression metrics (lin_mse/lin_rmse)
# are not overwritten and both models stay comparable.
rf_mse = mean_squared_error(test_labels, predictions)
rf_rmse = np.sqrt(rf_mse)
print(rf_rmse)

In [None]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the random forest on the test set.
# Stored as rf_mae so the linear-regression value in lin_mae is preserved.
rf_mae = mean_absolute_error(test_labels, predictions)
print(rf_mae)

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the random forest on the training matrix.
# The scorer reports negated MSE, so flip the sign before taking the root.
scores = cross_val_score(
    estimator=RF_model,
    X=df_str,
    y=df_train_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)
rmse_scores = np.sqrt(-scores)

In [None]:
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (e.g. a NumPy
    array). Nothing is returned.
    """
    print('Scores:', scores)
    average = scores.mean()
    print('Mean:', average)
    spread = scores.std()
    print('Standard deviation:', spread)
# Report the per-fold cross-validation RMSE values with their mean and spread.
display_scores(rmse_scores)