# Goal
Scraping information about smartphones

useful for:
- creating phone price predictor
- recommender system -> recommend similar phones based on one chosen

# Problem - using beautiful soup

Some websites are dynamic -> you have to keep clicking "load more" or scrolling to get information about more phones

- beautiful soup does not have any function for this

why?

- when load more is clicked or webpage is scrolled further -> it sends request to server and fetches additional data
- but the requests module only gets the HTML once, at the beginning, when the website is first loaded


# solution

- interact with dynamic websites using a tool

e.g. Selenium

# setup steps

1. install Selenium
2. automate chrome browser

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [None]:
# Read the locally saved page source ('smartprix.html') into one string.
# The file is decoded as UTF-8; the `with` block closes the handle afterwards.
with open('smartprix.html',encoding='utf-8') as f:
  html=f.read()

In [None]:
# Echo the raw HTML string as the cell output (large; the saved output is hidden).
html 

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Parse the raw page source into a searchable tree using the lxml parser.
soup = BeautifulSoup(html, features='lxml')

In [None]:
# Collect every <div> whose class attribute is this product-card class string;
# presumably one element per phone listing -- verify against the saved page.
containers = soup.find_all(
    'div',
    attrs={'class': 'sm-product has-tag has-features has-actions'},
)

In [None]:
# Spec column -> keywords that identify the matching <li> line.
# Order matters: a line is assigned to the FIRST column whose keyword it
# contains, mirroring the precedence of an if/elif chain.
SPEC_KEYWORDS = {
    'sim': ('Sim',),
    'processor': ('Processor',),
    'ram': ('RAM',),
    'battery': ('Battery',),
    'display': ('Display',),
    'camera': ('Camera',),
    'memory_card': ('Memory Card',),
    'os': ('iOS', 'Android'),
}

# Final column order of the DataFrame.
COLUMNS = ['model', 'price', 'ratings', *SPEC_KEYWORDS]


def _parse_specs(spec_lines):
    """Map one phone's spec lines onto the spec columns.

    Parameters
    ----------
    spec_lines : iterable of str
        Stripped text of each <li> in the phone's spec list.

    Returns
    -------
    dict
        Exactly one entry per key of ``SPEC_KEYWORDS``. Columns with no
        matching line hold ``np.nan``. Several 'Display' lines are joined with
        ', '; for any other column the first matching line is kept, so a
        repeated keyword can never add an extra entry and shift later rows.
    """
    found = {}
    for text in spec_lines:
        column = next(
            (col for col, keywords in SPEC_KEYWORDS.items()
             if any(keyword in text for keyword in keywords)),
            None,
        )
        if column is None:
            continue  # line matches no known spec keyword
        if column not in found:
            found[column] = text
        elif column == 'display':
            found[column] += ', ' + text
    # Fill the gaps so every phone contributes a value to every column
    # (a missing display now yields NaN in `display`, not in `battery`).
    return {col: found.get(col, np.nan) for col in SPEC_KEYWORDS}


# Build one complete record per phone container.
records = []
for container in containers:
    record = {
        'model': container.find('h2').text.strip(),
        'price': container.find('span', {'class': 'price'}).text.strip(),
        'ratings': container.find('div', {'class': 'score'}).find('b').text,
    }
    spec_items = container.find('ul', {'class': 'specs'}).find_all('li')
    record.update(_parse_specs(item.text.strip() for item in spec_items))
    records.append(record)

# creating a dataframe
df = pd.DataFrame(records, columns=COLUMNS)

# Keep the per-column lists available for later cells (e.g. `len(os)`).
# NOTE: `os` shadows the standard-library module name; kept for compatibility.
(name, price, ratings, sim, processor, ram, battery,
 display, camera, memory_card, os) = (
    [record[col] for record in records] for col in COLUMNS
)

In [None]:
# Sanity check: size of the `os` list built by the parsing cell
# (expected to equal the number of phones -- compare with the DataFrame length).
len(os)

1020

In [None]:
# Display the assembled DataFrame (the rendered output is truncated for long frames).
df

Unnamed: 0,model,price,ratings,sim,processor,ram,battery,display,camera,memory_card,os
0,OPPO Find N2 Flip,"₹89,999",88,"Single Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC","Dimensity 9000 Plus, Octa Core, 3.2 GHz Processor","8 GB RAM, 256 GB inbuilt",4300 mAh Battery with 44W Fast Charging,"6.8 inches, 1080 x 2520 px, 120 Hz Display wit...",50 MP + 8 MP Dual Rear & 32 MP Front Camera,Memory Card Not Supported,
1,Vivo V27,"₹32,999",85,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi","Dimensity 7200, Octa Core, 2.8 GHz Processor","8 GB RAM, 128 GB inbuilt",4600 mAh Battery with 66W Fast Charging,"6.78 inches, 1080 x 2400 px, 120 Hz Display wi...",50 MP + 8 MP + 2 MP Triple Rear & 50 MP Front ...,,Android v13
2,OnePlus 11R,"₹39,999",85,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC, IR Bl...","Snapdragon 8+ Gen1, Octa Core, 3.2 GHz Processor","8 GB RAM, 128 GB inbuilt",5000 mAh Battery with 100W Fast Charging,"6.74 inches, 1240 x 2772 px, 120 Hz Display wi...",50 MP + 8 MP + 2 MP Triple Rear & 16 MP Front ...,,Android v13
3,Motorola Moto G73,"₹18,999",84,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC","Dimensity 930, Octa Core, 2.2 GHz Processor","8 GB RAM, 128 GB inbuilt",5000 mAh Battery with 30W Fast Charging,"6.5 inches, 1080 x 2400 px, 120 Hz Display wit...",50 MP + 8 MP Dual Rear & 16 MP Front Camera,"Memory Card (Hybrid), upto 1 TB",Android v13
4,Vivo V27 Pro,"₹37,999",84,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi","Dimensity 8200, Octa Core, 3.1 GHz Processor","8 GB RAM, 128 GB inbuilt",4600 mAh Battery with 66W Fast Charging,"6.78 inches, 1080 x 2400 px, 120 Hz Display wi...",50 MP + 8 MP + 2 MP Triple Rear & 50 MP Front ...,,Android v13
...,...,...,...,...,...,...,...,...,...,...,...
1015,Xiaomi Redmi 9i (4GB RAM + 128GB),"₹9,299",67,"Dual Sim, 3G, 4G, VoLTE, Wi-Fi","Helio G25, Octa Core, 2 GHz Processor","4 GB RAM, 128 GB inbuilt",5000 mAh Battery with 10W Fast Charging,"6.53 inches, 720 x 1600 px Display with Water ...",13 MP Rear & 5 MP Front Camera,"Memory Card Supported, upto 512 GB",Android v10
1016,Realme Narzo 20 Pro,"₹14,960",78,"Dual Sim, 3G, 4G, VoLTE, Wi-Fi","Helio G95, Octa Core, 2.05 GHz Processor","6 GB RAM, 64 GB inbuilt",4500 mAh Battery with 65W Fast Charging,"6.5 inches, 1080 x 2400 px, 90 Hz Display with...",48 MP Quad Rear & 16 MP Front Camera,"Memory Card Supported, upto 256 GB",Android v10
1017,BlackZone S3,"₹1,159",14,Dual Sim,512 MHz Processor,"64 MB RAM, 64 MB inbuilt",3000 mAh Battery,"2.4 inches, 320 x 240 px Display",0.3 MP Rear Camera,"Memory Card Supported, upto 32 GB",
1018,Xiaomi Redmi 9A (3GB RAM + 32GB),"₹6,999",62,"Dual Sim, 3G, 4G, VoLTE, Wi-Fi","Helio G25, Octa Core, 2 GHz Processor","3 GB RAM, 32 GB inbuilt",5000 mAh Battery,"6.53 inches, 720 x 1600 px Display with Water ...",13 MP Rear & 5 MP Front Camera,"Memory Card Supported, upto 512 GB",Android v10


In [None]:
# Summarise dtypes and non-null counts to see which spec columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1020 entries, 0 to 1019
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   model        1020 non-null   object
 1   price        1020 non-null   object
 2   ratings      1020 non-null   object
 3   sim          1020 non-null   object
 4   processor    903 non-null    object
 5   ram          993 non-null    object
 6   battery      1008 non-null   object
 7   display      1020 non-null   object
 8   camera       1019 non-null   object
 9   memory_card  821 non-null    object
 10  os           821 non-null    object
dtypes: object(11)
memory usage: 87.8+ KB


In [None]:
# Persist the scraped table to 'smartphones.csv'.
# NOTE(review): with the default arguments the row index is written as an
# unnamed first column -- pass index=False if that column is not wanted.
df.to_csv('smartphones.csv')