In [1]:
"""
Initial setup:
1) create virtualenv project using pycharm

2) install the following libraries to virtualenv:
pip install numpy
pip install pandas
pip install requests
pip install beautifulsoup4
pip install lxml
"""

'\nInitial setup:\n1) create virtualenv project using pycharm\n\n2) install the following libraries to virtualenv:\npip install numpy\npip install pandas\npip install requests\npip install beautifulsoup4\npip install lxml\n'

In [2]:
import requests  # library to send requests to web site(krisha.kz)
from bs4 import BeautifulSoup as bs  # library to copy all html-code
import csv  # library to write info to csv
import pandas as pd  # to convert csv to pandas DataFrame
import numpy as np  # to work np. arrays
import time
import math
import re

In [6]:
# HTTP headers sent with every request made by the scraper functions below.
# The User-Agent value is a desktop Chrome identification string.
headers = {
    'accept': '*/*',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}

# Number of products per catalog page; get_all_pages() divides the total item
# count by this value to work out how many pages to request.
items_per_page = 12

def _build_page_urls(catalog_page_url, total_items, per_page):
    """Return the first-page URL followed by the '?page=N' URLs for pages 2..last."""
    pages = math.ceil(total_items / per_page)
    # range() excludes its stop value, so pages + 1 is needed to reach the last page.
    return [catalog_page_url] + [
        catalog_page_url + '?page={i}'.format(i=i) for i in range(2, pages + 1)
    ]


def get_all_pages(catalog_page_url):
    """Build the list of page URLs for a catalog listing.

    The first page is fetched and the total item count is read from the
    active category tree element; that count divided by ``items_per_page``
    gives the number of pages.

    Args:
        catalog_page_url: URL of the first catalog page.

    Returns:
        A list starting with ``catalog_page_url`` followed by
        ``catalog_page_url + '?page=N'`` for every page from 2 up to and
        including the last one. If the response status is not 200, or the
        item count cannot be read, only ``[catalog_page_url]`` is returned.
    """
    urls = [catalog_page_url]
    session = requests.Session()
    request = session.get(catalog_page_url, headers=headers)

    if request.status_code != 200:
        return urls

    soup = bs(request.content, 'lxml')
    try:
        pagination = soup.select('li.tree__item._expanded._active')
        # Keep only the digits of the element text to obtain the item count.
        total_items = int(re.sub(r'\D', '', pagination[0].text))
        urls = _build_page_urls(catalog_page_url, total_items, items_per_page)
    except Exception as e:
        # Best effort: report the problem and fall back to the first page only.
        # str(e) is required; concatenating the exception object itself raises TypeError.
        print("exception while getting all page urls: " + str(e))
    return urls

def _extract_rating(card):
    """Return the first run of digits in the card's rating link text, or "" if unavailable."""
    rating_div = card.find('div', attrs={'class': 'item-card__rating'})
    if rating_div is None:
        return ""
    rating_element = rating_div.find('a', attrs={'class': 'ddl_product_link'})
    if rating_element is None:
        return ""
    match = re.search(r'\d+', str(rating_element.string))
    return match.group() if match else ""


def parse_kaspi_pages(urls):
    """Download every catalog page and collect the product cards found on it.

    Args:
        urls: Iterable of catalog page URLs.

    Returns:
        A list of dicts with the keys ``'title'``, ``'price'`` and
        ``'rating'`` (all strings). ``'rating'`` is the first run of digits
        found in the card's rating link text, or ``""`` when the card has no
        rating container, no rating link, or no digits in the link text.

    Raises:
        AttributeError: if a card lacks the title or price element; those two
            fields are treated as mandatory and are not guarded.
    """
    items = []
    session = requests.Session()
    for url in urls:
        time.sleep(1)  # pause one second before each request
        print("parsing: " + url)

        request = session.get(url, headers=headers)
        if request.status_code != 200:
            # Mirror get_all_pages(): only parse successful responses.
            print("skipping {url}: HTTP {code}".format(url=url, code=request.status_code))
            continue

        soup = bs(request.content, 'lxml')
        for div in soup.find_all('div', attrs={'class': 'item-card ddl_product'}):
            title = div.find('a', attrs={'class': 'item-card__name ddl_product_link'}).string.strip()
            price = div.find('span', attrs={'class': 'item-card__prices-price'}).string.strip()
            items.append({
                'title': title,
                'price': price,
                'rating': _extract_rating(div),
            })
    print(len(items))
    return items


def files_writer(flats, path="Kaspi.csv"):
    """Write the scraped items to a CSV file with a header row.

    Args:
        flats: Iterable of dicts, each holding the keys ``'title'``,
            ``'price'`` and ``'rating'``.
        path: Destination file; defaults to ``"Kaspi.csv"`` in the current
            working directory. An existing file is overwritten.
    """
    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by an empty line on platforms that translate '\n'.
    with open(path, "w", encoding='utf-8', newline='') as file:
        a_pen = csv.writer(file)
        a_pen.writerow(('title', 'price', 'rating'))
        a_pen.writerows(
            (flat['title'], flat['price'], flat['rating']) for flat in flats
        )

# Run the scraper end to end: build the page URL list for the category,
# parse every page, then write the collected items to Kaspi.csv.
# parse_kaspi_pages() sleeps one second per page, so this cell takes at least
# len(urls) seconds to finish.
base_url = 'https://kaspi.kz/shop/c/notebooks/'
urls = get_all_pages(base_url)
items = parse_kaspi_pages(urls)
files_writer(items)

parsing: https://kaspi.kz/shop/c/notebooks/
parsing: https://kaspi.kz/shop/c/notebooks/?page=2
parsing: https://kaspi.kz/shop/c/notebooks/?page=3
parsing: https://kaspi.kz/shop/c/notebooks/?page=4
parsing: https://kaspi.kz/shop/c/notebooks/?page=5
parsing: https://kaspi.kz/shop/c/notebooks/?page=6
parsing: https://kaspi.kz/shop/c/notebooks/?page=7
parsing: https://kaspi.kz/shop/c/notebooks/?page=8
parsing: https://kaspi.kz/shop/c/notebooks/?page=9
parsing: https://kaspi.kz/shop/c/notebooks/?page=10
parsing: https://kaspi.kz/shop/c/notebooks/?page=11
parsing: https://kaspi.kz/shop/c/notebooks/?page=12
parsing: https://kaspi.kz/shop/c/notebooks/?page=13
parsing: https://kaspi.kz/shop/c/notebooks/?page=14
parsing: https://kaspi.kz/shop/c/notebooks/?page=15
parsing: https://kaspi.kz/shop/c/notebooks/?page=16
parsing: https://kaspi.kz/shop/c/notebooks/?page=17
parsing: https://kaspi.kz/shop/c/notebooks/?page=18
parsing: https://kaspi.kz/shop/c/notebooks/?page=19
parsing: https://kaspi.kz/sh

In [10]:
df = pd.read_csv('Kaspi.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# Clean-up cell: splits composite text columns into separate fields, converts
# them to numbers, and filters rows. `df` is modified in place and reassigned,
# so this cell cannot be re-run without reloading `df` first.
#
# NOTE(review): the CSV written by files_writer() in this notebook only has the
# columns 'title', 'price' and 'rating'. The 'address', 'href' and 'content'
# columns used below are not created by any visible cell, so this cell would
# raise KeyError on that file. It appears to expect a different dataset
# (rental listings) -- confirm which CSV is meant to be loaded.

# 'title' is comma-separated: first part -> 'rooms', second part -> 'square'.
new = df["title"].str.split(",", expand = True)
df["rooms"]= new[0]
df["square"]= new[1] 
#df["floor"]=new[2]
df.drop(columns =["title"], inplace = True)

# 'address' is comma-separated: first part -> 'district', second part -> 'street'.
new = df["address"].str.split(",", expand = True)
df["district"]= new[0]
df["street"]= new[1]
df.drop(columns =["address"], inplace = True)

# Keep only the first whitespace-separated word of the district.
new = df["district"].str.split(expand = True)
df["district"] = new[0]

# Keep only the first whitespace-separated token of 'square' and make it numeric.
new = df["square"].str.split(expand = True)
df["square"] = new[0]
df['square']=pd.to_numeric(df['square'])

df.drop(columns =["href"], inplace = True)

# Concatenate the first two whitespace-separated tokens of 'price' and convert
# to a number (presumably the thousands groups of the amount -- TODO confirm).
new = df["price"].str.split(expand = True)
new = new[0]+new[1]
df["price"] = new
df['price']=pd.to_numeric(df['price'])
# Values of 10000 or less are multiplied by 1000
# (presumably such prices were entered in thousands -- TODO confirm).
df.loc[df.price <= 10000,'price'] = df.price * 1000

# Keep the part of 'rooms' that precedes the first '-'.
new = df['rooms'].str.split('-', expand = True)
df['rooms'] = new[0]

df.drop(columns =["street"], inplace = True)

#new = df['floor'].str.split('/', expand = True)
#df['floor'] = new[0]
#new = new.dropna()
#new = new[1].str.split(' ',expand = True)
#df['total floors'] = new[0]

# Drop every row that still has a missing value in any column.
df = df.dropna()

# Keep only rows whose district is one of the names listed here.
districts = ['Алмалинский','Бостандыкский','Ауэзовский','Медеуский','Жетысуский','Наурызбайский','Алатауский','Турксибский']
df = df[df['district'].isin(districts)]

# Extract a 4-character year from 'content': the four characters that end one
# character before the "г.п." marker. When the marker is absent, find()
# returns -1 and the slice becomes [-6:-2], i.e. unrelated characters.
year = list(df['content'])
yeardf=[]
for years in year:
    years = years[years.find("г.п.")-5:years.find("г.п.")-1]
    yeardf.append(years)
df['year'] = yeardf

df.drop(columns = ["content"], inplace = True)

# str.find() returns 0 when the extracted value starts with the blank literal;
# those rows are treated as having no year and are removed.
df['yearisnull'] = df['year'].str.find('    ')
df = df.loc[df.yearisnull != 0]

df.drop(columns =["yearisnull"], inplace = True)

df['rooms']=pd.to_numeric(df['rooms'])
#df = df.loc[df.rooms!=11]

In [None]:
df.head()

In [None]:
df.shape

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Number of rows per district, ordered from the smallest count to the largest.
df1 = (
    df.groupby(['district'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['count'])
)
df1

In [None]:
df1.set_index('district', inplace=True)

In [None]:
# Horizontal bar chart of the per-district row counts held in df1.
# No `label=` argument is passed: a legend label must be a string, and the
# original code handed over the whole df1['count'] Series. The legend entry
# now comes from the column name.
df1.plot(
    kind='barh',
    stacked=False,
    figsize=(20, 10),
    color='g',
)

plt.title('Распределение арендного жилья в городе Алматы')
plt.xlabel('Количество квартир')
plt.ylabel('Районы')
plt.show()

In [None]:
# Scatter plot of 'price' against 'square' for all rows, with a straight
# least-squares line drawn over the points.
x = df['price']
y = df['square']
fit = np.polyfit(x, y, deg=1)  # fit[0] is the slope, fit[1] the intercept

df.plot.scatter(x='price', y='square', figsize=(12, 9), color='darkblue')
plt.plot(x, fit[0] * x + fit[1], color='red')

plt.title('Зависимость стоимости аренды от площади квартиры')
plt.xlabel('Стоимость аренды')
plt.ylabel('Площадь')

plt.show()

In [None]:
df2 = df.loc[df.price <= 200000]

In [None]:
df2.head()

In [None]:
df2.shape

In [None]:
# Same scatter plot as for the full frame, restricted to the rows kept in df2,
# again with a first-degree polynomial fitted to the points.
x, y = df2['price'], df2['square']
fit = np.polyfit(x, y, deg=1)  # coefficients, highest power first

df2.plot.scatter(x='price', y='square', figsize=(12, 9), color='darkblue')
plt.plot(x, fit[0] * x + fit[1], color='red')

plt.title('Зависимость стоимости аренды от площади квартиры')
plt.xlabel('Стоимость аренды')
plt.ylabel('Площадь')

plt.show()

In [None]:
# Per-district sum of every column of df.
# NOTE(review): sum() is applied to all remaining columns, including any
# non-numeric ones -- confirm that is intended. A later cell reads
# df4['owner'] and df4['specialist'], which no visible cell creates.
df4 = df.groupby(['district']).sum()
#df4.sort_values(by=['count'], inplace=True)
df4

In [None]:
# Grouped horizontal bar chart: for each district in df4, one bar for the
# 'owner' total and one for the 'specialist' total.
# NOTE(review): the 'owner' and 'specialist' columns are not created in any
# visible cell -- confirm they exist in the loaded data.

# One bar group per row of df4. Deriving the count from the data (instead of
# the previous hard-coded 8) keeps the bar positions and the plotted values
# the same length when some districts are missing after filtering.
n_groups = len(df4)

# create plot
fig, ax = plt.subplots()
index = np.arange(n_groups)
bar_width = 0.35

rects1 = plt.barh(index + bar_width, df4['owner'], bar_width,
                  color='b',
                  label='Хозяин')

rects2 = plt.barh(index, df4['specialist'], bar_width,
                  color='g',
                  label='Риэлтор')

plt.ylabel('Районы')
plt.xlabel('Количество квартир')
plt.title('Распределение арендного жилья в городе Алматы')
# Put each tick midway between the two bars of its group.
plt.yticks(index + bar_width/2, df4.index)
plt.legend()
plt.show()

In [None]:
df.head()

In [None]:
# Number of rows per 'rooms' value, ordered by ascending count and indexed by 'rooms'.
df5 = (
    df.groupby(['rooms'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['count'])
    .set_index('rooms')
)
df5

In [None]:
# Horizontal bar chart of the row counts per number of rooms held in df5.
# No `label=` argument is passed: a legend label must be a string, and the
# original code handed over the whole df5['count'] Series.
df5.plot(
    kind='barh',
    stacked=False,
    figsize=(20, 10),
    color='g',
)

plt.title('Распределение арендного жилья в городе Алматы')
plt.xlabel('Количество квартир')
# df5 is indexed by 'rooms', so the y axis shows room counts, not districts.
plt.ylabel('Количество комнат')
plt.show()

In [None]:
# Count rows for every (district, price) pair, then reshape so that each price
# is a row, each district a column and the cell holds the count.
df6 = (
    df.groupby(['district', 'price'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['count'])
    .pivot(index='price', columns='district', values='count')
)

In [None]:
df6