In [39]:
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup

### 1. Loading the Dataset

In [40]:
# Load the listing index; its 'id' column drives the scraping loop below.
df = pd.read_csv(filepath_or_buffer='data.csv')

# Preview the first five rows.
df.head(5)

Unnamed: 0,id,name,price,rating,img_link,category
0,49849504,"Kartepe, Turkey","₹8,078 per night",4.71,https://a0.muscache.com/im/pictures/cf7f3f57-8...,A-frames
1,50891766,"Kaş, Turkey","₹4,665 per night",New,https://a0.muscache.com/im/pictures/449c8751-0...,A-frames
2,50699164,"Imereti, Georgia","₹5,991 per night",4.85,https://a0.muscache.com/im/pictures/miso/Hosti...,A-frames
3,49871422,"Sapanca, Turkey","₹11,339 per night",5.0,https://a0.muscache.com/im/pictures/72e6396e-e...,A-frames
4,51245886,"Sapanca, Turkey","₹6,673 per night",New,https://a0.muscache.com/im/pictures/73973308-e...,A-frames


### 2. Scraping the data in CSV Format

In [42]:
# Scrape one listing page per id in df['id'] and collect the extracted fields.
#
# Every field is extracted best-effort: if the expected element is missing the
# field is recorded as NaN instead of aborting the run.  `except Exception` is
# used rather than a bare `except:` so that Ctrl-C (KeyboardInterrupt) and
# SystemExit still stop this multi-hour loop.
data = []
prefix = 'https://www.airbnb.co.in/rooms/'
REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled connection blocks forever

for id_ in tqdm(df['id']):

    link = prefix + str(id_)

    try:
        res = requests.get(link, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(res.content, 'html.parser')
    except requests.RequestException:
        # Network failure: parse an empty document so every field below falls
        # back to NaN.  The row is still appended, which keeps `data` aligned
        # row-for-row with `df` (later cells copy columns across by index).
        soup = BeautifulSoup('', 'html.parser')

    # Spans shared by the rating / reviews / address lookups; find_all returns
    # an empty list when nothing matches, so this never raises.
    header_spans = soup.find_all('span', class_ = '_1jvg42j')

    try:
        name    = soup.find('h1').text.strip()
    except Exception:
        name    = np.nan

    try:
        rating  = header_spans[0].find_all('span')[-2].text.strip().split(' ')[0]
    except Exception:
        rating  = np.nan

    try:
        reviews = header_spans[0].find_all('span')[-1].find('button').text.strip().split(' ')[0]
    except Exception:
        reviews = np.nan

    try:
        address = header_spans[-1].text.strip()
    except Exception:
        address = np.nan

    try:
        # Space-separated list of every image URL in the photo container.
        img_links = ' '.join([sp.get('src') for sp in soup.find('div', class_ = '_88xxct').find_all('img')])
    except Exception:
        img_links = np.nan

    try:
        host_name = soup.find('h2').text.strip()
    except Exception:
        host_name = np.nan

    try:
        # Each <li> may hold several '·'-separated entries; flatten them into
        # one comma-separated string.
        features = []
        for sp in soup.find('ol', class_ = 'lgx66tx dir dir-ltr').find_all('li', class_ = 'l7n4lsf dir dir-ltr'):
            for i in sp.text.strip().split('·'):
                if (i != ''):
                    features.append(i.strip())
        features = ','.join(features)
    except Exception:
        features = np.nan

    try:
        # The host id is the last path segment of the host profile link.
        host_id = soup.find('div', class_ = 'c6y5den dir dir-ltr').find('a').get('href').split('/')[-1]
    except Exception:
        host_id = np.nan

    try:
        # First matching block is read as house rules, second as safety rules.
        # Both are set to NaN together if either block is missing.
        rule_blocks  = soup.find_all('div', class_ = 'cihcm8w dir dir-ltr')
        house_rules  = ','.join([sp.text for sp in rule_blocks[0].find_all('span')])
        safety_rules = ','.join([sp.text for sp in rule_blocks[1].find_all('span')])
    except Exception:
        house_rules  = np.nan
        safety_rules = np.nan

    try:
        amenities    = ','.join([sp.text for sp in soup.find('div', class_ = '_1byskwn').find_all('div', class_ = 'iikjzje i10xc1ab dir dir-ltr')])
    except Exception:
        amenities    = np.nan

    data.append([id_ ,name, rating, reviews, host_name, host_id ,address, features, amenities, safety_rules, house_rules, img_links])

# NOTE: the 'hourse_rules' spelling is kept on purpose -- later cells and the
# saved CSV refer to the column by this exact name.
data = pd.DataFrame(data, columns = ['id', 'name','rating','reviews','host_name','host_id','address',
                                     'features','amenities','safety_rules','hourse_rules','img_links'])

100%|███████████████████████████████████| 14456/14456 [5:05:11<00:00,  1.27s/it]


### 3. Preprocessing the Data | Combining

In [43]:
# Copy price and rating over from the source listing.  Assignment aligns on
# the index; `data` was built by iterating df['id'] in order, so rows match.
# Note that this replaces the scraped 'rating' column.
for column in ('price', 'rating'):
    data[column] = df[column]

### 4. Saving the CSV

In [44]:
# Persist the scraped table (without the index) so the scrape need not be re-run.
data.to_csv(path_or_buf='airbnb.csv', index=False)

### 5. Data Preprocessing

In [110]:
# Reload the saved scrape and count the missing values in each column.
data = pd.read_csv(filepath_or_buffer='airbnb.csv')
data.isna().sum()

id                 0
name               1
rating             0
reviews         2744
host_name          1
host_id          439
address           27
features          27
amenities        989
safety_rules     209
hourse_rules     209
img_links         27
price              0
dtype: int64

##### 5.1) Dealing with Reviews

In [111]:
# A missing review count is recorded as 0.
data['reviews'] = data['reviews'].fillna(value=0)

##### 5.2) Dealing with Safety and House Rules

In [112]:
# Missing rule text is replaced by the placeholder 'Not Available'.
for rules_column in ('safety_rules', 'hourse_rules'):
    data[rules_column] = data[rules_column].fillna('Not Available')

##### 5.3) Dealing with Host_ID

In [113]:
# Missing host ids get the same 'Not Available' placeholder.
data['host_id'] = data['host_id'].fillna(value='Not Available')

##### 5.4) Removing the remaining rows with null values

In [114]:
# Drop every row that still has a missing value in any column.
data = data.dropna(axis=0, how='any')

##### 5.5) Removing Duplicate Stays

In [136]:
# Keep only the first row for each listing id, preserving first-appearance
# order, and renumber the index from 0.  drop_duplicates does this in a single
# pass instead of re-filtering the whole frame once per unique id.
df_ = data.drop_duplicates(subset='id', keep='first').reset_index(drop=True)

##### 5.6) Preprocessing Host-Name

In [164]:
# Reduce the scraped heading to just the host's name.
# The heading is presumably of the form "<listing type> hosted by <host>"
# (TODO confirm against the raw pages).  Everything up to and including the
# FIRST standalone word "by" is removed.  Splitting on the bare substring 'by'
# would also cut inside names such as "Bobby" or "Abby" and leave them empty
# or truncated.  Headings without a standalone "by" are kept unchanged.
host_name = (
    df_['host_name']
    .str.replace(r'(?s)^.*?\bby\b', '', n=1, regex=True)
    .str.replace('\xa0', ' ', regex=False)      # non-breaking spaces -> plain spaces
    .str.strip()
)

df_['host_name'] = host_name

##### 5.7) Creating Country Column

In [173]:
# The country is the last comma-separated part of the address.  Addresses are
# written as "City, Region, Country", so the part must be stripped; otherwise
# every value carries a leading space (" Turkey").
country = [address.rsplit(',', 1)[-1].strip() for address in df_['address']]

df_['country'] = country

##### 5.8) Finding the Features

In [227]:
temp = []                        # Separating the features: every individual feature string
temp_ = []                       # Feature categories derived from the distinct feature strings
for i in df_['features']:
    temp += i.split(',')

                                 # Finding Feature Category
# Iterate in sorted order so temp_ comes out the same on every run
# (plain set iteration order varies between interpreter sessions).
for i in sorted(set(temp)):
    if (i == ''):
        continue
    elif i in ('Toilet with sink', 'Shared toilet with sink', 'Private toilet with sink'):  # bathroom
        temp_.append('bathroom')
    elif (i == 'Studio'):                    # studio
        # Record the category in temp_ (this previously went to temp, the
        # input list, so 'studio' never showed up among the categories).
        temp_.append('studio')
    else:
        # Drop the leading token (the count in e.g. "2 bedrooms"); a
        # single-word feature is kept whole.
        remainder = ' '.join(i.split(' ')[1:])
        temp_.append(remainder if remainder != '' else i)

In [299]:
# Build one list per feature, holding that feature's value for every row of
# df_ (same order as df_['features']).

bedrooms, guests, beds, bathrooms, toilets, studios = [], [], [], [], [], []


def _leading_count(text):
    """Return the number that opens `text` (the token before the first space), truncated to int."""
    return int(float(text.split(' ')[0].strip()))


for feature_string in df_['features']:

    # Values used when a feature is not mentioned for this listing.
    counts = dict(guest=0, bathroom=0, bedroom=0, bed=0, toilet=0, studio=0)

    for item in feature_string.split(','):

        if 'guest' in item:
            counts['guest'] = _leading_count(item)

        elif 'bath' in item:
            counts['bathroom'] = _leading_count(item)

        elif 'bedroom' in item:
            counts['bedroom'] = _leading_count(item)

        elif 'bed' in item:
            # 'bedroom' entries were already consumed by the branch above.
            counts['bed'] = _leading_count(item)

        elif 'Toilet' in item or 'toilet' in item:
            counts['toilet'] = 1

        elif 'Studio' in item:
            counts['studio'] = 1

        else:
            # Surface any feature text that none of the rules recognise.
            print(item)

    bedrooms.append(counts['bedroom'])
    beds.append(counts['bed'])
    guests.append(counts['guest'])
    bathrooms.append(counts['bathroom'])
    toilets.append(counts['toilet'])
    studios.append(counts['studio'])

In [301]:
# Attach the parsed feature values as new columns, in this order.
# The 'toiles' spelling is kept as-is because it is the column name that ends
# up in the saved CSV.
new_columns = {
    'bathrooms': bathrooms,
    'beds':      beds,
    'guests':    guests,
    'toiles':    toilets,
    'bedrooms':  bedrooms,
    'studios':   studios,
}
for column_name, column_values in new_columns.items():
    df_[column_name] = column_values

df_.head()

Unnamed: 0,id,name,rating,reviews,host_name,host_id,address,features,amenities,safety_rules,hourse_rules,img_links,price,country,bathrooms,beds,guests,toiles,bedrooms,studios
0,49849504,Perla bungalov,4.71,64,Mehmetcan,357334205.0,"Kartepe, Kocaeli, Turkey","2 guests,2 bedrooms,1 bed,1 bathroom","Mountain view,Valley view,Lake access,Kitchen,...","󹀁,Airbnb's COVID-19 safety practices apply,󱠃,N...","Check-in: Flexible,Check out: 12:00 pm,Pets ar...",https://a0.muscache.com/im/pictures/a5da5cb7-c...,"₹8,078 per night",Turkey,1,1,2,0,2,0
1,50891766,Authentic Beach Architect Sheltered Villa with...,New,0,Fatih,386223873.0,"Kaş, Antalya, Turkey","4 guests,2 bedrooms,2 beds,2 bathrooms","Kitchen,Wifi,Dedicated workspace,Free parking ...","󹀁,Airbnb's COVID-19 safety practices apply,󱠆,C...","Check-in: 4:00 pm - 11:00 pm,Check out: 10:00 ...",https://a0.muscache.com/im/pictures/61b70855-2...,"₹4,665 per night",Turkey,2,2,4,0,2,0
2,50699164,cottages sataplia,4.85,68,Giorgi,409690853.0,"Imereti, Georgia","4 guests,1 bedroom,3 beds,1 bathroom","Mountain view,Kitchen,Wifi,Dedicated workspace...","󹀁,Airbnb's COVID-19 safety practices apply,󱠃,N...","Check-in: After 1:00 pm,Check out: 12:00 pm,Se...",https://a0.muscache.com/im/pictures/miso/Hosti...,"₹5,991 per night",Georgia,1,3,4,0,1,0
3,49871422,Sapanca Breathable Bungalow,5.0,13,Melih,401873242.0,"Sapanca, Sakarya, Turkey","4 guests,1 bedroom,2 beds,1 bathroom","Mountain view,Valley view,Kitchen,Wifi,Free pa...","󹀁,Airbnb's COVID-19 safety practices apply,󱠃,N...","Check-in: After 2:00 pm,Check out: 12:00 pm,No...",https://a0.muscache.com/im/pictures/72e6396e-e...,"₹11,339 per night",Turkey,1,2,4,0,1,0
4,51245886,Bungalov Ev 2,New,0,Arp Sapanca,414884116.0,"Sapanca, Sakarya, Turkey","2 guests,1 bedroom,1 bed,1 bathroom","Kitchen,Wifi,Free parking on premises,TV,Air c...","󹀁,Airbnb's COVID-19 safety practices apply,󱠆,C...","Check-in: After 2:00 pm,Check out: 12:00 pm,No...",https://a0.muscache.com/im/pictures/73973308-e...,"₹6,673 per night",Turkey,1,1,2,0,1,0


##### 5.9) Finding CheckIn Checkout Time

In [328]:
# Pull the check-in and check-out times out of each listing's house rules.
# Rules are comma-separated entries such as "Check-in: After 2:00 pm" and
# "Check out: 12:00 pm"; a listing without such an entry gets 'NA'.
checkin  = []
checkout = []


for i in df_['hourse_rules']:

    in_time  = 'NA'
    out_time = 'NA'

    for feat in (i.split(',')):

        # Take everything after the FIRST colon only, so the colon inside the
        # time itself ("12:00 pm") is kept, and strip the space that follows
        # the label.  An entry with no colon yields ''.
        if 'Check-in' in feat:
            in_time = feat.partition(':')[2].strip()

        elif 'Check out' in feat:
            out_time = feat.partition(':')[2].strip()


    checkin.append(in_time)
    checkout.append(out_time)

df_['checkin']  = checkin
df_['checkout'] = checkout

##### 5.10) Dealing with Price

In [342]:
def _price_to_int(text):
    """Turn a price label such as '₹8,078 per night' into the integer 8078."""
    amount = text.split(' ')[0]              # first space-separated token, e.g. '₹8,078'
    digits = amount[1:].replace(',', '')     # drop the currency symbol and thousands separators
    return int(digits)


price = list(map(_price_to_int, df_['price']))

df_['price'] = price

In [344]:
# Save the cleaned table.  Unlike the earlier 'airbnb.csv' save, the row index
# IS written here (as an unnamed first column); made explicit so the file
# layout stays exactly as before.
df_.to_csv('airbnb_v2.csv', index=True)