# Data Collection

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import json
import pandas as pd

In [2]:
# page which we want to parse
url = 'https://nout.kz/catalog/notebooks/filter/product_type-is-81366f69e544f64817c57cbb10961431/'
headers = {'user-agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)
html = response.text
soup = BeautifulSoup(html, 'html.parser')

In [3]:
# finding number of pages
pagination = soup.find('div',attrs={'class':'blog-page-navigation'}).find_all('a')
pages = pagination[-2].text
print('Всего страниц: ' + pages)

Всего страниц: 136


In [4]:
# getting data from every page
notebooks = []
for page in range(1, int(pages)+1):
    response = requests.get(url, headers=headers, params={'PAGEN_1': page})
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('div', attrs = {'class':'catalog-list row'})
    print(f'Parsing of page {page} from {pages}...')
    for row in table.findAll('div',attrs = {'class':'catalog-list-element element-add-basket-ajax item'}):
        if len(row.find('div',attrs={'class':'description'}).findAll('div',attrs={'class':'param'})) == 8:
            notebook = {}
            notebook['name'] = row.h2.a.text
            notebook['brand'] = re.sub(r'[\t\n ]+', ' ',row.find('div', attrs = {'class':'description'}).select_one(":nth-child(3)").text).strip().replace('Бренд: ','')
            notebook['gigabytes'] = int(re.sub(r'[\t\n ]+', ' ',row.find('div', attrs = {'class':'description'}).select_one(":nth-child(4)").text).strip().replace('Жесткий диск, ГБ: ',''))
            notebook['disc-tech'] = re.sub(r'[\t\n ]+', ' ',row.find('div', attrs = {'class':'description'}).select_one(":nth-child(5)").text).strip().replace('Технология диска: ','')
            notebook['RAM'] = int((re.sub(r'[\t\n ]+', ' ',row.find('div', attrs = {'class':'description'}).select_one(":nth-child(6)").text).strip()).replace('RAM, Гб: ',''))
            notebook['screen_size'] = float((re.sub(r'[\t\n ]+', ' ',row.find('div', attrs = {'class':'description'}).select_one(":nth-child(7)").text).strip()).replace('Диагональ экрана, ": ',''))
            notebook['CPU'] = re.sub(r'[\t\n ]+', ' ',row.find('div', attrs = {'class':'description'}).select_one(":nth-child(8)").text).strip().replace('CPU: ','')
            notebook['price'] =  (row.find('div', attrs = {'class':'product-price'}).find('div', attrs = {'class':'real'}).text)
            notebooks.append(notebook)
        else:
            pass

Parsing of page 1 from 136...
Parsing of page 2 from 136...
Parsing of page 3 from 136...
Parsing of page 4 from 136...
Parsing of page 5 from 136...
Parsing of page 6 from 136...
Parsing of page 7 from 136...
Parsing of page 8 from 136...
Parsing of page 9 from 136...
Parsing of page 10 from 136...
Parsing of page 11 from 136...
Parsing of page 12 from 136...
Parsing of page 13 from 136...
Parsing of page 14 from 136...
Parsing of page 15 from 136...
Parsing of page 16 from 136...
Parsing of page 17 from 136...
Parsing of page 18 from 136...
Parsing of page 19 from 136...
Parsing of page 20 from 136...
Parsing of page 21 from 136...
Parsing of page 22 from 136...
Parsing of page 23 from 136...
Parsing of page 24 from 136...
Parsing of page 25 from 136...
Parsing of page 26 from 136...
Parsing of page 27 from 136...
Parsing of page 28 from 136...
Parsing of page 29 from 136...
Parsing of page 30 from 136...
Parsing of page 31 from 136...
Parsing of page 32 from 136...
Parsing of page 3

# Data Cleaning

In [5]:
#Creating dataframe and storing all values to it
df = pd.DataFrame.from_dict(notebooks)

In [6]:
df.head()

Unnamed: 0,name,brand,gigabytes,disc-tech,RAM,screen_size,CPU,price
0,Ноутбук Toshiba Dynabook R731/E,Toshiba,250,HDD,4,13.0,Core i3-2310M 2.1 ГГц,38 900 ₸
1,Ноутбук Toshiba Dynabook R731/E,Toshiba,250,HDD,4,13.0,Core i5-2520M 2.5 ГГц,46 900 ₸
2,Ноутбук Toshiba Dynabook R731,Toshiba,250,HDD,4,13.0,Core i3-2310M 2.1 ГГц,67 500 ₸
3,Ноутбук Toshiba Dynabook R731/B,Toshiba,250,HDD,4,13.0,Core i5-2520M 2.5 ГГц,78 900 ₸
4,Ноутбук Lenovo ThinkPad X260 256 SSD,Lenovo,256,SSD,16,12.5,Core i5-6300U 2.4 ГГц,79 900 ₸


Change column name

In [7]:
df.rename(columns = {'name':'laptop_name','gigabytes':'hard_disc_size','brand':'brand_name','price':'price_in_₸'}, inplace = True) 

In [8]:
df.shape

(1582, 8)

In [9]:
df.dtypes

laptop_name        object
brand_name         object
hard_disc_size      int64
disc-tech          object
RAM                 int64
screen_size       float64
CPU                object
price_in_₸         object
dtype: object

Necessary to change price column from object type into int type

In [11]:
df['price_in_₸'] = df['price_in_₸'].str.replace('₸','') #removing cuurency sign

In [12]:
df['price_in_₸'] = df['price_in_₸'].str.replace(' ','') #removing extra space

In [13]:
df['price_in_₸'] = df['price_in_₸'].apply(pd.to_numeric)

Removing rows with empty values

In [14]:
df.isnull().sum()

laptop_name        0
brand_name         0
hard_disc_size     0
disc-tech          0
RAM                0
screen_size        0
CPU                0
price_in_₸        59
dtype: int64

In [15]:
df_updated = df.dropna(axis=0)

In [16]:
df_updated.shape

(1523, 8)

Overall statistics

In [17]:
df_updated.describe()

Unnamed: 0,hard_disc_size,RAM,screen_size,price_in_₸
count,1523.0,1523.0,1523.0,1523.0
mean,490.592252,11.477347,15.024951,604220.9
std,277.642106,6.35771,1.084496,394103.7
min,128.0,4.0,11.6,38900.0
25%,256.0,8.0,14.0,344397.5
50%,512.0,8.0,15.6,513975.0
75%,512.0,16.0,15.6,728850.0
max,2048.0,64.0,18.0,3849800.0


Removing unnecessary word "Ноутбук" from 'laptop_name'

In [22]:
df_updated['laptop_name'] = df_updated['laptop_name'].str.replace('Ноутбук','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_updated['laptop_name'] = df_updated['laptop_name'].str.replace('Ноутбук','')


Saving dataframe to csv.file

In [24]:
df_updated.to_csv('laptops.csv',index=False)