# Importing Necessary Libraries

In [2]:
from bs4 import BeautifulSoup
import requests

## Scrape the HTML from https://www.citypopulation.de/en/india/delhi/

In [4]:
url = "https://www.citypopulation.de/en/india/delhi/"

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error status instead of parsing an error page later on.
page = requests.get(url, timeout=30)
page.raise_for_status()
page

<Response [200]>

In [11]:
# Parse the downloaded markup into a searchable tree.
# NOTE(review): 'html' names a markup type rather than a specific parser, so the
# parser actually used may depend on what is installed — confirm this is intended.
soup = BeautifulSoup(markup=page.text, features='html')

## Columns

In [15]:
# Show the text of the table's header row (the <tr> whose id is "tsh").
soup.find("tr", attrs={"id": "tsh"}).get_text()

'Name\nStatusDistrictPopulationCensus1991-03-01PopulationCensus2001-03-01PopulationCensus2011-03-01\xa0'

In [22]:
# Column labels for the table assembled from the scraped values, in order.
columns = [
    "City",
    "Status",
    "District",
    "Population",
    "Area",
]
columns

['City', 'Status', 'District', 'Population', 'Area']

## City Names

In [57]:
# Name cells of the table. The first 10 matches are dropped — presumably they
# belong to rows that are not part of the city list; TODO confirm against the page.
# The tags are kept under their own name so that CityNames only ever holds
# strings and this cell can be re-run without breaking.
city_cells = soup.find_all("td", class_="rname")[10:]
CityNames = [cell.text for cell in city_cells]
CityNames[:10]

['Aali',
 'Ali Pur',
 'Asola',
 'Aya Nagar',
 'Babar Pur',
 'Bakhtawar Pur',
 'Bakkar Wala',
 'Bankauli',
 'Bankner',
 'Bapraula']

## Status

In [54]:
# Status cells of the table. The first 10 matches are dropped — presumably they
# belong to rows that are not part of the city list; TODO confirm against the page.
# The tags are kept under their own name so that Status only ever holds
# strings and this cell can be re-run without breaking.
status_cells = soup.find_all("td", class_="rstatus")[10:]
Status = [cell.text for cell in status_cells]
Status[:10]

['Census Town',
 'Census Town',
 'Census Town',
 'Census Town',
 'Census Town',
 'Census Town',
 'Census Town',
 'Census Town',
 'Census Town',
 'Census Town']

## District

In [156]:
# District cells of the table. No leading offset is applied here, unlike the
# other columns; the length check further down reports the same count as for
# the other text columns.
# The tags are kept under their own name so that District only ever holds
# strings and this cell can be re-run without breaking.
district_cells = soup.find_all("td", class_="radm")
District = [cell.text for cell in district_cells]
District[:10]

['South',
 'North West',
 'South',
 'South',
 'North East',
 'North West',
 'West',
 'North West',
 'North West',
 'West']

## Population

In [60]:
# Population cells (class attribute "rpop prio1"). The first 10 matches are
# dropped — presumably they belong to rows that are not part of the city list;
# TODO confirm against the page.
# The tags are kept under their own name so that Population only ever holds
# strings and this cell can be re-run without breaking.
population_cells = soup.find_all("td", class_="rpop prio1")[10:]
Population = [cell.text for cell in population_cells]
Population[:10]

['27,169',
 '20,332',
 '13,275',
 '33,123',
 '37,058',
 '12,716',
 '18,122',
 '5,339',
 '14,788',
 '52,744']

## Areas

In [142]:
# The "rname" cells carry each row's area in a data-area attribute. The tags
# are kept under their own name so that Areas only ever holds the extracted
# values and this cell can be re-run without breaking.
area_cells = soup.find_all("td", class_="rname")[10:]

# Index 25 is left out — presumably that cell has no usable data-area value
# (TODO confirm). The other columns drop the same index in the filtering step.
Area1 = [cell["data-area"] for cell in area_cells[:25]]
Area2 = [cell["data-area"] for cell in area_cells[26:]]
Areas = Area1 + Area2

## Filtering the dataset

In [152]:
# Compare the sizes of the five column lists; they must agree before they can
# be combined into one table.
tuple(len(values) for values in (Population, CityNames, District, Status, Areas))

(113, 113, 113, 113, 112)

In [153]:
# Drop the entry at index 25 so this list lines up with Areas, which was built
# without that row. The length guard makes the cell safe to re-run: once the
# lengths agree, nothing further is removed.
if len(Population) > len(Areas):
    Population = Population[:25] + Population[26:]
len(Population)

112

In [154]:
# Drop the entry at index 25 so this list lines up with Areas, which was built
# without that row. The length guard makes the cell safe to re-run: once the
# lengths agree, nothing further is removed.
if len(CityNames) > len(Areas):
    CityNames = CityNames[:25] + CityNames[26:]
len(CityNames)

112

In [159]:
# Drop the entry at index 25 so this list lines up with Areas, which was built
# without that row. The length guard makes the cell safe to re-run: once the
# lengths agree, nothing further is removed.
if len(District) > len(Areas):
    District = District[:25] + District[26:]
len(District)

112

In [160]:
# Drop the entry at index 25 so this list lines up with Areas, which was built
# without that row. The length guard makes the cell safe to re-run: once the
# lengths agree, nothing further is removed.
if len(Status) > len(Areas):
    Status = Status[:25] + Status[26:]
len(Status)

112

# Developing the DataFrame

In [161]:
import pandas as pd

In [167]:
# Assemble the five aligned lists into one table, pairing each column label
# with the list that holds its values.
df = pd.DataFrame(
    data=dict(
        zip(
            ['City', 'Status', 'District', 'Population', 'Area'],
            [CityNames, Status, District, Population, Areas],
        )
    )
)

df.head()

Unnamed: 0,City,Status,District,Population,Area
0,Aali,Census Town,South,27169,4.0
1,Ali Pur,Census Town,North West,20332,8.6
2,Asola,Census Town,South,13275,12.0
3,Aya Nagar,Census Town,South,33123,8.18
4,Babar Pur,Census Town,North East,37058,0.8


## Information about the dataset

In [168]:
# Summarize column dtypes and non-null counts; every column is still object
# dtype at this point because the values were scraped as text.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   City        112 non-null    object
 1   Status      112 non-null    object
 2   District    112 non-null    object
 3   Population  112 non-null    object
 4   Area        112 non-null    object
dtypes: object(5)
memory usage: 4.5+ KB


## Converting Column Types

In [170]:
# Remove every thousands separator before converting to integers. Joining only
# the first two comma-separated groups fails for values below 1,000 (no comma)
# and silently truncates values of 1,000,000 or more (two or more commas).
# Casting to str first keeps the cell re-runnable once the column is numeric.
df["Population"] = (
    df["Population"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .astype("int64")
)

In [174]:
# Convert the area values from text to floating-point numbers.
df["Area"] = df["Area"].astype(float)

In [178]:
# Re-check the dtypes after the conversions: Population should now be int64
# and Area float64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        112 non-null    object 
 1   Status      112 non-null    object 
 2   District    112 non-null    object 
 3   Population  112 non-null    int64  
 4   Area        112 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.5+ KB


## Feature Engineering

In [180]:
# Preview the first rows before adding a derived column.
df.head()

Unnamed: 0,City,Status,District,Population,Area
0,Aali,Census Town,South,27169,4.0
1,Ali Pur,Census Town,North West,20332,8.6
2,Asola,Census Town,South,13275,12.0
3,Aya Nagar,Census Town,South,33123,8.18
4,Babar Pur,Census Town,North East,37058,0.8


In [183]:
# Derived column: Population divided by Area, row by row.
df["Density"] = df["Population"].div(df["Area"])

In [184]:
# Preview the first rows again to confirm the new Density column.
df.head()

Unnamed: 0,City,Status,District,Population,Area,Density
0,Aali,Census Town,South,27169,4.0,6792.25
1,Ali Pur,Census Town,North West,20332,8.6,2364.186047
2,Asola,Census Town,South,13275,12.0,1106.25
3,Aya Nagar,Census Town,South,33123,8.18,4049.266504
4,Babar Pur,Census Town,North East,37058,0.8,46322.5


## Save Dataset

In [186]:
# Write the finished table to a CSV file, leaving out the row index.
df.to_csv(path_or_buf="Delhi Population Dataset.csv", index=False)