<a href="https://colab.research.google.com/github/EmilSeyfullayev/Home-prices-analytics/blob/main/Analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports

In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

In [None]:
!pip install gsheetsdb

In [66]:
from gsheetsdb import connect

In [69]:
# Open a gsheetsdb connection; it is handed to pd.read_sql when the sheet is loaded.
conn = connect()

In [80]:
# URL of the Google Sheet holding the scraped ads; it is interpolated into the SQL query as the table name.
sheet_url = 'https://docs.google.com/spreadsheets/d/1J1Lw1vGLEIee31gMLiE8Oihy9LuWSKhvuDdf9qSEKMo/edit?usp=sharing'

In [81]:
# Read every column of the sheet into a DataFrame through the gsheetsdb connection.
# The sheet URL is double-quoted inside the query and acts as the table name.
query = f'SELECT * FROM "{sheet_url}"'
df = pd.read_sql(query, conn)

In [82]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

(16218, 26)

In [83]:
# Row count after de-duplicating on `ids`; it equals the total row count
# when no id is repeated (i.e. there are no duplicate ids).
df.drop_duplicates(subset='ids').shape[0]

16218

# Analytics

## Preprocessing

In [84]:
# Snapshot of the raw data so preprocessing can be restarted without reloading the sheet.
# An explicit deep copy is required: `df.iloc[:, :]` does not produce an independent frame
# (depending on the pandas version it is the same object or a view sharing the data).
checkpoint_1 = df.copy()

In [95]:
# Restart preprocessing from the raw snapshot. Copying keeps `checkpoint_1` untouched
# by whatever is done to `df` afterwards, so this cell can be re-run at any time.
df = checkpoint_1.copy()

In [96]:
# Show the column labels of the raw frame as a NumPy array.
df.columns.to_numpy()

array(['ids', 'vipped_featured', 'prices', 'texts', 'categories',
       'floors', 'areas_m2', 'rooms', 'documents', 'credits', 'address',
       'district', 'latitudes', 'longitudes', 'ownerships', 'names',
       'emails', 'phone_numbers_appened', 'ad_number', 'watches',
       'ad_refreshed_date', 'agency_titles', 'photo_counts',
       'phot_pseudo_links', 'date_of_parsing', 'deal_ended'], dtype=object)

In [97]:
# Columns kept for the analysis. Compared with the raw frame, `texts`, `names`,
# `emails`, `phone_numbers_appened`, `ad_number`, `photo_counts` and
# `phot_pseudo_links` are dropped.
# NOTE(review): the saved outputs further down show 18 columns without `ids`,
# while this list has 19 entries including `ids` - the outputs look stale;
# re-run the notebook top to bottom to confirm.
necessary_columns = [
    "ids",
    "vipped_featured",
    "prices",
    "categories",
    "floors",
    "areas_m2",
    "rooms",
    "documents",
    "credits",
    "address",
    "district",
    "latitudes",
    "longitudes",
    "ownerships",
    "watches",
    "ad_refreshed_date",
    "agency_titles",
    "date_of_parsing",
    "deal_ended",
]

# `.copy()` makes the subset an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
df = df[necessary_columns].copy()

In [98]:
# Preview the first rows of the frame after the column selection.
df.head()

Unnamed: 0,vipped_featured,prices,categories,floors,areas_m2,rooms,documents,credits,address,district,latitudes,longitudes,ownerships,watches,ad_refreshed_date,agency_titles,date_of_parsing,deal_ended
0,featuredvipped,54 000,Köhnə tikili,2 / 5,50 m²,2,var,var,"Ünvan: Bakı şəhəri, R.Şahsuvarov küç 31",Suraxanı r.; Hövsan q.,40.35582129,50.06357475,mülkiyyətçi,3363,01 Yanvar 2022,0,2022-01-03 11:36:55,0
1,vipped,289 000,Yeni tikili,5 / 17,180 m²,4,var,var,"Ünvan: Bakı şəhəri, Affiyədin Cəlilov küç.",Şah İsmayıl Xətai m.; Xətai r.,40.37891689,49.875682,mülkiyyətçi,7348,22 Dekabr 2021,0,2022-01-03 11:36:55,0
2,vipped,159 000,Yeni tikili,12 / 19,58 m²,2,var,var,"Ünvan: Bakı şəhəri, Azadlıq prospekti",ASAN Xidmət №1 ; TQDK ; Neftçi bazası ; Nəsimi r.,40.39841488,49.83920251,vasitəçi (agent),197,28 Dekabr 2021,0,2022-01-03 11:36:55,0
3,vipped,175 000,Yeni tikili,7 / 7,117 m²,3,var,var,"Ünvan: Bakı şəhəri, Xəqani Rüstəmov küçəsi 6",Əhmədli m.; Xətai r.,40.3816452,49.9584505,mülkiyyətçi,376,25 Dekabr 2021,0,2022-01-03 11:36:55,0
4,0,104 000,Yeni tikili,11 / 12,65 m²,2,var,xeyr,"Ünvan: Bakı şəhəri, S.S.Axundov küç.",Azadlıq Prospekti m.; Binəqədi r.; 8-ci mikror...,40.42705637,49.84432445,vasitəçi (agent),28,Bugün 15:20,New House 20 Yanvar daşınmaz əmlak agentliyi,2022-01-03 11:36:55,0


In [99]:
# Dimensions (rows, columns) after keeping only the necessary columns.
df.shape

(16218, 18)

### Ad type: vipped, featured, and ordinary ads

In [100]:
# Frequency of each distinct value in `vipped_featured`.
df['vipped_featured'].value_counts()

0                 15872
vipped              249
featuredvipped       77
featured             20
Name: vipped_featured, dtype: int64

In [101]:
# One-hot encode the ad type. `drop_first=True` removes the first category,
# so `ad_type_0` is the reference category and gets no column of its own.
ad_type = pd.get_dummies(
    df['vipped_featured'],
    prefix='ad_type',
    drop_first=True,
)
ad_type

Unnamed: 0,ad_type_featured,ad_type_featuredvipped,ad_type_vipped
0,0,1,0
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,0
...,...,...,...
16213,0,0,0
16214,0,0,0
16215,0,0,0
16216,0,0,0


In [102]:
# Append the ad-type dummy columns to the frame.
# Any dummy columns left over from a previous run of this cell are dropped first
# (errors='ignore' makes that a no-op on the first run), so re-running the cell
# never produces duplicated column labels.
df = pd.concat(
    [df.drop(columns=ad_type.columns, errors='ignore'), ad_type],
    axis=1,
)
df.columns.values

array(['vipped_featured', 'prices', 'categories', 'floors', 'areas_m2',
       'rooms', 'documents', 'credits', 'address', 'district',
       'latitudes', 'longitudes', 'ownerships', 'watches',
       'ad_refreshed_date', 'agency_titles', 'date_of_parsing',
       'deal_ended', 'ad_type_featured', 'ad_type_featuredvipped',
       'ad_type_vipped'], dtype=object)

### Price

In [103]:
# Prices arrive as text with spaces as thousands separators (e.g. "54 000"):
# cast every value to str, delete the spaces, then parse the result as numbers.
prices_without_spaces = df['prices'].astype(str).str.replace(" ", "", regex=False)
df['prices'] = pd.to_numeric(prices_without_spaces)

### Categories

In [104]:
# Frequency of each distinct value in `categories`.
df['categories'].value_counts()

Yeni tikili     10900
Köhnə tikili     5312
0                   6
Name: categories, dtype: int64

In [105]:
# One-hot encode the building category. `drop_first=True` removes the first
# category, so `0` is the reference category and gets no column of its own.
categories = pd.get_dummies(
    df['categories'],
    drop_first=True,
)
categories

Unnamed: 0,Köhnə tikili,Yeni tikili
0,1,0
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
16213,0,1
16214,0,1
16215,0,1
16216,0,1


In [106]:
# Append the category dummy columns to the frame.
# Any dummy columns left over from a previous run of this cell are dropped first
# (errors='ignore' makes that a no-op on the first run), so re-running the cell
# never produces duplicated column labels.
df = pd.concat(
    [df.drop(columns=categories.columns, errors='ignore'), categories],
    axis=1,
)
df.columns.values

array(['vipped_featured', 'prices', 'categories', 'floors', 'areas_m2',
       'rooms', 'documents', 'credits', 'address', 'district',
       'latitudes', 'longitudes', 'ownerships', 'watches',
       'ad_refreshed_date', 'agency_titles', 'date_of_parsing',
       'deal_ended', 'ad_type_featured', 'ad_type_featuredvipped',
       'ad_type_vipped', 'Köhnə tikili', 'Yeni tikili'], dtype=object)

### Floors

In [107]:
# Frequency of each distinct "<apartment floor> / <building floors>" string.
df['floors'].value_counts()

5 / 5      592
4 / 5      589
3 / 5      484
2 / 5      458
1 / 5      312
          ... 
23 / 26      1
22 / 27      1
7 / 26       1
9 / 24       1
15 / 28      1
Name: floors, Length: 341, dtype: int64

In [116]:
# `floors` holds text such as "2 / 5": the part before " / " is the apartment's
# floor, the part after it is the number of floors in the building.
# Splitting is vectorised; the previous loop relied on a bare `except:` that
# would also have hidden unrelated errors.
floor_parts = df['floors'].astype(str).str.split(" / ")

# First piece of every value (the whole text when there is no " / " separator).
# Values stay strings; no numeric conversion happens here.
apartment_floor = floor_parts.str[0]

# Second piece, or NaN where the value has no " / " separator.
# Kept as a plain list, as before.
buildings_floor = floor_parts.str.get(1).tolist()

In [117]:
# Store the two halves of the `floors` text as separate columns.
df['apartment_floor'] = apartment_floor
df['buildings_floor'] = buildings_floor

### Area (m²)

In [127]:
# Areas arrive as text such as "50 m²": remove the unit and any spaces, then
# parse as float.
# Casting to str first keeps the cell re-runnable: on a second run the values
# are already floats, and the previous `x.rstrip(...)` raised AttributeError.
# The literal "m²" is removed explicitly, because `rstrip(" m²")` strips a
# character set rather than a suffix.
df['areas_m2'] = pd.to_numeric(
    df['areas_m2']
    .astype(str)
    .str.replace("m²", "", regex=False)
    .str.replace(" ", "", regex=False)
).astype(float)

In [128]:
# Show the column labels after the preprocessing steps above, as a NumPy array.
df.columns.to_numpy()

array(['vipped_featured', 'prices', 'categories', 'floors', 'areas_m2',
       'rooms', 'documents', 'credits', 'address', 'district',
       'latitudes', 'longitudes', 'ownerships', 'watches',
       'ad_refreshed_date', 'agency_titles', 'date_of_parsing',
       'deal_ended', 'ad_type_featured', 'ad_type_featuredvipped',
       'ad_type_vipped', 'Köhnə tikili', 'Yeni tikili', 'apartment_floor',
       'buildings_floor'], dtype=object)