### Web Scraping using Beautiful Soup

Objective : DonorsChoose.org is an organization which collects funds from people to help public schools in America. Below we have scraped the https://www.niche.com/ website to get more information about schools, which will help in enhancing our DonorsChoose datasets to get more insights.

#### Import libraries

In [0]:
# import libraries
import urllib.request
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import csv
from datetime import datetime
import re

#### Get Data from the URL using BeautifulSoup

In [0]:
def getData(url):

  #we use the html parser to parse the url content and store it in a variable.
  textContent = []
  validate = lambda x : x.text if x is not None else np.nan

  for i in range(331):      # Number of pages plus one 
    url = url.format(i+1)
    with urllib.request.urlopen(url) as response:
      page = response.read()

    # parse the html using beautiful soup and store in variable `soup`
    soup = BeautifulSoup(page, 'html.parser')

    #Get the Data from the html
    mydivs = soup.findAll("li", {"class": "search-results__list__item"})
    for li in mydivs:
      if(li != None):
        unwanted = li.findAll(True, {"class":["search-result--featured","illustrated-cta","ad-spot", "ad-spot--mobile", "ad-spot--inactive", "search-related-lists"]})
        if (len(unwanted) > 0):
          continue
        name = li.find("h2",{"class":"search-result__title"})
        schoolName = validate(name)
        rank = li.find("div",{"class":"search-result-badge"})
        schoolRank = validate(rank)
        #print(schoolRank)
        tagline = li.find("ul", {"class":"search-result-tagline"})
        if(tagline !=None):
          ratingSpan = tagline.find("span", {"class":"review__stars__icon"})
          rating = (int)(ratingSpan['class'][1][-2:])/10 if (ratingSpan !=None) else np.nan 
          #print(rating)
          reviewCountSpan = tagline.find("span", {"class":"review__stars__number__reviews"})
          reviewCount = validate(reviewCountSpan)
        district = tagline.find(string=re.compile("[A-Z]$"))
        schoolDistrict = district if (district !=None) else np.nan
        #print(schoolDistrict)
        level = tagline.find(string=re.compile("[A-Z0-9][-]{1}[0-9]"))
        schoolGradeLevel = level if (level !=None) else np.nan
        #print(schoolGradeLevel)
        schoolGrades = li.findAll("li",{"class":"search-result-fact-list__item"})
        grade = li.find("ul",{"class":"search-result-fact-list"}).find("div",{"class":"niche__grade"})
        schoolGrade = validate(grade)
        studentCount = schoolGrades[1].find("span",{"class":"search-result-fact__value"})
        schoolStrength = validate(studentCount)
        stRatio = schoolGrades[2].find("span",{"class":"search-result-fact__value"})
        studentTeacherRatio = validate(stRatio)
        #print(studentTeacherRatio)

        textContent.append((schoolName, schoolRank, schoolDistrict, schoolGradeLevel, rating, reviewCount, schoolGrade, schoolStrength, studentTeacherRatio))
    
  return textContent


In [0]:
url = "https://www.niche.com/k12/search/best-schools/s/texas/?type=traditional&type=charter&type=magnet&gradeLevel=pk&gradeLevel=elementary&gradeLevel=middle&gradeLevel=high&page={}"
textContent = getData(url)

In [13]:
print('\n'.join(map(str, textContent)))

('Liberal Arts & Science Academy', '#11 Best Public High Schools in America', 'Austin Independent School District, TX', '9-12', 4.0, '395', 'A+', '1,016', '16:1')
('School of Science & Engineering', '#23 Best Public High Schools in America', 'Dallas Independent School District, TX', '9-12', 4.0, '176', 'A+', '418', '17:1')
('School for the Talented & Gifted', '#27 Best Public High Schools in America', 'Dallas Independent School District, TX', '9-12', 4.0, '145', 'A+', '256', '15:1')
('Carnegie Vanguard High School', '#37 Best Public High Schools in America', 'Houston Independent School District, TX', '9-12', 4.0, '269', 'A+', '621', '19:1')
('Westlake High School', '#41 Best Public High Schools in America', 'Eanes Independent School District, TX', '9-12', 4.0, '429', 'A+', '2,584', '15:1')
('Carroll Senior High School', '#102 Best Public High Schools in America', 'Carroll Independent School District, TX', '11-12', 4.0, '412', 'A+', '1,299', '15:1')
('Seven Lakes High School', '#103 Bes

#### Create csv and write the data into it.

In [0]:
# open a csv file with append, so old data will not be erased
with open('schoolRating.csv', 'w') as csv_file:
  writer = csv.writer(csv_file)
  writer.writerow(['School Name', 'Rank in America','District', 'School Grade Level','Rating','User Reviews','Overall Niche Grade','School Strength','Student Teacher Ratio', 'Extracted Date'])
 # The for loop
  for schoolName, schoolRank,schoolDistrict, schoolGradeLevel ,rating,reviewCount,schoolGrade,schoolStrength,studentTeacherRatio in textContent:
    writer.writerow([schoolName, schoolRank, schoolDistrict, schoolGradeLevel,rating,reviewCount,schoolGrade,schoolStrength,studentTeacherRatio, datetime.now()])

#### Read the created dataset

In [15]:
#Read csv
df_itemlist = pd.read_csv('schoolRating.csv')
df_itemlist.shape

(8275, 10)

In [16]:
df_itemlist.head()

Unnamed: 0,School Name,Rank in America,District,School Grade Level,Rating,User Reviews,Overall Niche Grade,School Strength,Student Teacher Ratio,Extracted Date
0,Liberal Arts & Science Academy,#11 Best Public High Schools in America,"Austin Independent School District, TX",9-12,4.0,395,A+,1016,16:1,2019-03-14 17:22:15.177727
1,School of Science & Engineering,#23 Best Public High Schools in America,"Dallas Independent School District, TX",9-12,4.0,176,A+,418,17:1,2019-03-14 17:22:15.177860
2,School for the Talented & Gifted,#27 Best Public High Schools in America,"Dallas Independent School District, TX",9-12,4.0,145,A+,256,15:1,2019-03-14 17:22:15.177880
3,Carnegie Vanguard High School,#37 Best Public High Schools in America,"Houston Independent School District, TX",9-12,4.0,269,A+,621,19:1,2019-03-14 17:22:15.177924
4,Westlake High School,#41 Best Public High Schools in America,"Eanes Independent School District, TX",9-12,4.0,429,A+,2584,15:1,2019-03-14 17:22:15.177937
