### import required librarist

In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.pandas.set_option('display.max_columns' , None)

## Downloading Data

In [11]:
file_path = "housePrice.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1850000000.0,61666.67
1,60,1,True,True,True,Shahran,1850000000.0,61666.67
2,79,2,True,True,True,Pardis,550000000.0,18333.33
3,95,2,True,True,True,Shahrake Qods,902500000.0,30083.33
4,123,2,True,True,True,Shahrake Gharb,7000000000.0,233333.33


In [104]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split (df.drop(columns = ['Price','Price(USD)']), df['Price'], test_size=0.2, random_state=0)

In [106]:
x_train.shape , x_test.shape 

((2783, 6), (696, 6))

### Missing value

In [18]:
nan_columns = [col for col in df.columns if df[col].isnull().sum() > 1 and df[col].dtypes=='O']

for col in nan_columns:
    print("{}:{}% missing values".format(col, np.round(df[col].isnull().mean(),4)))

Address:0.0066% missing values


In [20]:
def replace_cat_feature(df , nan_columns):
    data = df.copy()
    data.loc[:,nan_columns]= data[nan_columns].fillna('Missing')
    return data

df = replace_cat_feature(df,nan_columns)

df[nan_columns].isnull().sum()

Address    0
dtype: int64

In [22]:
df.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1850000000.0,61666.67
1,60,1,True,True,True,Shahran,1850000000.0,61666.67
2,79,2,True,True,True,Pardis,550000000.0,18333.33
3,95,2,True,True,True,Shahrake Qods,902500000.0,30083.33
4,123,2,True,True,True,Shahrake Gharb,7000000000.0,233333.33


In [108]:
df['Area']= pd.to_numeric(df['Area'], errors ='coerce')
df['Area'].fillna(df['Area'].median())

0       4.158883
1       4.110874
2       4.382027
3       4.564348
4       4.820282
          ...   
3474    4.465908
3475    4.430817
3476    4.330733
3477    4.663439
3478    4.418841
Name: Area, Length: 3479, dtype: float64

In [110]:
print(df.isnull().sum())
nan_numerical = [col for col in df.columns if df[col].isnull().sum() > 1 and df[col].dtypes!='O']

for col in nan_numerical:
    print("{}:{}% missing values".format(col, np.round(df[col].isnull().mean(),4)))

Area          6
Price         0
Price(USD)    0
Room          0
Parking       0
Warehouse     0
Elevator      0
Address       0
dtype: int64
Area:0.0017% missing values


### skewness

In [123]:
numerical_features = df.select_dtypes(include=['int64','float64']).columns
numerical_features = numerical_features.drop(['Address'], errors='ignore')
skewness = df[numerical_features].skew()
print(skewness)

Area          0.547783
Price        -0.111987
Price(USD)   -0.298397
Room          1.052171
Parking       0.000000
Warehouse     0.000000
Elevator      0.000000
dtype: float64


In [125]:
high_skew_columns = skewness[abs(skewness) > 0.5].index
print(f"{list(high_skew_columns)}")

['Area', 'Room']


In [127]:
import numpy as np

for column in numerical_features :
    df[column] = np.log1p(df[column])

In [129]:
df.head()

Unnamed: 0,Area,Price,Price(USD),Room,Parking,Warehouse,Elevator,Address
0,0.971052,1.412525,1.249146,,0.302394,0.236332,0.349646,-0.3783061417466838
1,0.967505,1.412525,1.249146,,0.302394,0.236332,0.349646,-0.3783061417466838
2,0.98696,1.398835,1.218194,0.016568,0.302394,0.236332,0.349646,-1.8668009021365424
3,0.9993,1.404539,1.231348,0.016568,0.302394,0.236332,0.349646,-1.7179514260975564
4,1.015719,1.426518,1.278788,0.016568,0.302394,0.236332,0.349646,1.7055865227991178


### category values

In [134]:
category_columns = [col for col in df.columns if df[col].dtypes=='O']

In [136]:
category_columns

['Address']

### handelling category columns

In [117]:
for column in category_columns :
    temp = df.groupby(column)['Price'].count()/len(df)
    temp_df=temp[temp>0.01].index
    df[column]=np.where(df[column].isin(temp_df), df[column], 'Other')

In [119]:
df.head(100)

Unnamed: 0,Area,Price,Price(USD),Room,Parking,Warehouse,Elevator,Address
0,4.158883,21.338451,11.029515,-1.605332,0.423464,0.305512,0.51978,-0.3783061417466838
1,4.110874,21.338451,11.029515,-1.605332,0.423464,0.305512,0.51978,-0.3783061417466838
2,4.382027,20.125429,9.816531,0.016846,0.423464,0.305512,0.51978,-1.8668009021365424
3,4.564348,20.620679,10.311760,0.016846,0.423464,0.305512,0.51978,-1.7179514260975564
4,4.820282,22.669176,12.360228,0.016846,0.423464,0.305512,0.51978,1.7055865227991178
...,...,...,...,...,...,...,...,...
95,4.948760,22.109560,11.800615,1.167800,0.423464,0.305512,0.51978,0.2170917624092595
96,4.709530,22.109560,11.800615,0.016846,0.423464,0.305512,0.51978,0.2170917624092595
97,4.262680,21.224041,10.915107,0.016846,0.423464,-3.273195,0.51978,0.2170917624092595
98,4.859812,22.625373,12.316425,1.167800,0.423464,0.305512,0.51978,0.2170917624092595


### converting categorical data to numerical

In [48]:
df['Parking'] = df['Parking'].replace({'True': True , 'False': False}).astype(bool).astype(int)
df['Warehouse'] = df['Warehouse'].replace({'True': True , 'False': False}).astype(bool).astype(int)
df['Elevator'] = df['Elevator'].replace({'True': True , 'False': False}).astype(bool).astype(int)

In [50]:
for category in category_columns:
    labels_ordered = df.groupby([category])['Price'].mean().sort_values().index
    labels_ordered = {k:i for i,k in enumerate(labels_ordered,0)}
    df[category] = df[category].map(labels_ordered)

In [52]:
df.head(10)

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,4.158883,0.693147,1,1,1,11,21.338451,11.029515
1,4.110874,0.693147,1,1,1,11,21.338451,11.029515
2,4.382027,1.098612,1,1,1,1,20.125429,9.816531
3,4.564348,1.098612,1,1,1,2,20.620679,10.31176
4,4.820282,1.098612,1,1,1,25,22.669176,12.360228
5,4.26268,1.098612,1,1,0,15,21.441106,11.132168
6,4.477337,1.098612,1,1,1,1,20.21244,9.903538
7,4.094345,0.693147,1,1,1,11,21.488734,11.179795
8,4.007333,1.098612,1,1,0,4,20.01602,9.707128
9,4.276666,0.693147,1,1,1,10,21.586156,11.277216


In [54]:
scaling_feature = [col for col in df.columns if col not in [ 'Area','Price', 'Price(USD)']]
len(scaling_feature)

5

In [56]:
scaling_feature

['Room', 'Parking', 'Warehouse', 'Elevator', 'Address']

In [58]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit_transform(df[scaling_feature])

array([[-1.60533196,  0.42346432,  0.3055119 ,  0.51978033, -0.37830614],
       [-1.60533196,  0.42346432,  0.3055119 ,  0.51978033, -0.37830614],
       [ 0.01684632,  0.42346432,  0.3055119 ,  0.51978033, -1.8668009 ],
       ...,
       [ 0.01684632, -2.36147402, -3.27319488, -1.92388964, -2.01565038],
       [ 0.01684632,  0.42346432,  0.3055119 ,  0.51978033,  0.21709176],
       [ 0.01684632, -2.36147402,  0.3055119 ,  0.51978033, -2.01565038]])

In [60]:
data = pd.concat([df[['Area', 'Price', 'Price(USD)']].reset_index(drop = True),
                 pd.DataFrame(scaler.transform(df[scaling_feature]),columns = scaling_feature)],axis = 1)

In [62]:
data.head()

Unnamed: 0,Area,Price,Price(USD),Room,Parking,Warehouse,Elevator,Address
0,4.158883,21.338451,11.029515,-1.605332,0.423464,0.305512,0.51978,-0.378306
1,4.110874,21.338451,11.029515,-1.605332,0.423464,0.305512,0.51978,-0.378306
2,4.382027,20.125429,9.816531,0.016846,0.423464,0.305512,0.51978,-1.866801
3,4.564348,20.620679,10.31176,0.016846,0.423464,0.305512,0.51978,-1.717951
4,4.820282,22.669176,12.360228,0.016846,0.423464,0.305512,0.51978,1.705587


In [64]:
data.to_csv('X_train.csv',index = False)

In [66]:
df = pd.read_csv('X_train.csv')
df.head()

Unnamed: 0,Area,Price,Price(USD),Room,Parking,Warehouse,Elevator,Address
0,4.158883,21.338451,11.029515,-1.605332,0.423464,0.305512,0.51978,-0.378306
1,4.110874,21.338451,11.029515,-1.605332,0.423464,0.305512,0.51978,-0.378306
2,4.382027,20.125429,9.816531,0.016846,0.423464,0.305512,0.51978,-1.866801
3,4.564348,20.620679,10.31176,0.016846,0.423464,0.305512,0.51978,-1.717951
4,4.820282,22.669176,12.360228,0.016846,0.423464,0.305512,0.51978,1.705587


In [68]:
X = df.drop(columns=['Price', 'Price(USD)'])

y = df['Price']

In [70]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2 , random_state = 0)

In [72]:
X_train.shape , X_test.shape

((2783, 6), (696, 6))

In [74]:
print(X_train.dtypes)
print(y_train.dtypes)

Area         float64
Room         float64
Parking      float64
Warehouse    float64
Elevator     float64
Address      float64
dtype: object
float64


In [76]:
Address_avg_Price = df.groupby('Address')['Price'].mean().sort_values().index
labels_ordered = {k: i for i, k in enumerate(Address_avg_Price, 0)}

X_train['Address'] = X_train['Address'].map(labels_ordered)
X_test['Address'] = X_test['Address'].map(labels_ordered)

In [78]:
X_train['Area'] = pd.to_numeric(X_train['Area'], errors='coerce')
X_test['Area'] = pd.to_numeric(X_test['Area'], errors='coerce')

In [80]:
print(X_train.isnull().sum())
X_train.fillna(X_train.mean(), inplace=True)
X_test.fillna(X_test.mean(), inplace=True)

Area         5
Room         0
Parking      0
Warehouse    0
Elevator     0
Address      0
dtype: int64


In [82]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X_train, y_train)

### Predict

In [85]:
y_pre = model.predict(X_test)

In [87]:
Address_avg_Price = df.groupby('Address')['Price'].mean().sort_values().index
labels_ordered = {k: i for i, k in enumerate(Address_avg_Price, 0)}
##df['Address_Num'] = df['Address'].map(labels_ordered)
reverse_labels = {v: k for k, v in labels_ordered.items()}
print("مقادیر عددی معادل هر منطقه")
for num, name in sorted(reverse_labels.items()):
    print(f"{name}: {num}")

مقادیر عددی معادل هر منطقه
-2.015650378175528: 0
-1.8668009021365424: 1
-1.7179514260975564: 2
-1.5691019500585703: 3
-1.4202524740195843: 4
-1.2714029979805987: 5
-1.122553521941613: 6
-0.9737040459026272: 7
-0.8248545698636413: 8
-0.6760050938246555: 9
-0.5271556177856697: 10
-0.3783061417466838: 11
-0.2294566657076979: 12
-0.0806071896687121: 13
0.0682422863702736: 14
0.2170917624092595: 15
0.3659412384482453: 16
0.5147907144872311: 17
0.663640190526217: 18
0.8124896665652028: 19
0.9613391426041886: 20
1.1101886186431744: 21
1.2590380946821604: 22
1.4078875707211462: 23
1.556737046760132: 24
1.7055865227991178: 25
1.854435998838104: 26
2.0032854748770896: 27


In [89]:
print("X_train:", X_train.columns.tolist())

X_train: ['Area', 'Room', 'Parking', 'Warehouse', 'Elevator', 'Address']


In [91]:
print("new_house:", ['Area', 'Room', 'Parking', 'Warehouse', 'Elevator', 'Address'])

new_house: ['Area', 'Room', 'Parking', 'Warehouse', 'Elevator', 'Address']


In [99]:
new_house = pd.DataFrame(columns=X_train.columns, data = [[120, 2, 1, 1, 1, 20]])
predicted_Price = model.predict(new_house)
print(f"Predict Price : {predicted_Price[0]:,.0f}")

Predict Price : 145
