# Web Scraping

## Installing Selenium For Web Scraping

In [None]:
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install newspaper3k

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting selenium
  Downloading selenium-4.2.0-py3-none-any.whl (983 kB)
[K     |████████████████████████████████| 983 kB 14.2 MB/s 
[?25hCollecting trio~=0.17
  Downloading trio-0.20.0-py3-none-any.whl (359 kB)
[K     |████████████████████████████████| 359 kB 46.2 MB/s 
[?25hCollecting trio-websocket~=0.9
  Downloading trio_websocket-0.9.2-py3-none-any.whl (16 kB)
Collecting urllib3[secure,socks]~=1.26
  Downloading urllib3-1.26.9-py2.py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 87.4 MB/s 
Collecting async-generator>=1.9
  Downloading async_generator-1.10-py3-none-any.whl (18 kB)
Collecting outcome
  Downloading outcome-1.1.0-py2.py3-none-any.whl (9.7 kB)
Collecting sniffio
  Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)
Collecting wsproto>=0.14
  Downloading wsproto-1.1.0-py3-none-any.whl (24 kB)
Collecting pyOpenSSL>=0.14
  Downloading pyOpenSS

## Importing necessary libraries for web scraping

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import requests
from bs4 import BeautifulSoup
import numpy as np

from newspaper import Article
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import csv

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Web Crawler Initialization

In [None]:
# Configure Chrome for a display-less, containerised runtime.
options = webdriver.ChromeOptions()
options.add_argument('--headless')               # run without opening a browser window
options.add_argument('--no-sandbox')             # Chrome's sandbox is typically unavailable when running as root in a container
options.add_argument('--disable-dev-shm-usage')  # avoid relying on a small /dev/shm

# chromedriver was copied into /usr/bin by the install cell, so Selenium can
# resolve it from PATH. Passing the path positionally is deprecated in
# Selenium 4 and is not accepted by newer 4.x releases, where the first
# positional parameter is `options`.
driver = webdriver.Chrome(options=options)

## URLs having load more button - Handling to scrape data

In [None]:
def getUrlData(url, max_clicks=500, button_id="loadMoreaddBtn"):
  """Open `url` in the shared `driver`, expand the listing, and return the page HTML.

  The page exposes a "load more" button; it is clicked through JavaScript up to
  `max_clicks` times so that additional entries are appended to the DOM.

  Args:
    url: Address of the listing page to load.
    max_clicks: Upper bound on the number of "load more" clicks (default 500,
      the value previously hard-coded).
    button_id: DOM id of the "load more" button.

  Returns:
    The rendered page source (str) after the clicks have been issued.
  """
  driver.get(url)

  for _ in range(max_clicks):
    # find_elements returns an empty list instead of raising when the button
    # is gone, so the loop ends cleanly once the listing is exhausted.
    buttons = driver.find_elements(By.ID, button_id)
    if not buttons:
      break
    # Click via JS so the click works even if the element is off-screen.
    driver.execute_script("arguments[0].click();", buttons[0])

  return driver.page_source

## Getting data from Match Reports and Announcements URLs

In [None]:
# Fetch the fully expanded match-reports listing and parse the rendered HTML.
r1 = getUrlData("https://www.iplt20.com/news/match-reports")
soupRep = BeautifulSoup(r1, 'html.parser')

  import sys


In [None]:
# Fetch the fully expanded announcements listing and parse the rendered HTML.
r2 = getUrlData("https://www.iplt20.com/news/announcements")
soupAnn = BeautifulSoup(r2, 'html.parser')

  import sys


## Scrapes list of URLs from the respective sites

In [None]:
# Article URLs collected from the match-reports listing.
report_url_list = []

# Container that holds the individual news cards on the listing page.
div = soupRep.find('div', {'id':'div-match-report'})
for rep in div.findAll('div', {'class':'ap-common-news'}):
  link = rep.find('a')
  # A card without an anchor has no article URL; skip it instead of letting
  # `None.get` abort the whole extraction.
  if link is None:
    continue
  path = link.get('href')
  if path:
    report_url_list.append(path)

In [None]:
# Article URLs collected from the announcements listing.
announce_url_list = []

# Container that holds the individual news cards on the listing page.
div = soupAnn.find('div', {'id':'div-match-report'})
for rep in div.findAll('div', {'class':'ap-common-news'}):
  link = rep.find('a')
  # A card without an anchor has no article URL; skip it instead of letting
  # `None.get` abort the whole extraction.
  if link is None:
    continue
  path = link.get('href')
  if path:
    announce_url_list.append(path)

## Text Pre-Processing

In [None]:

def preprocess(text):
  """Normalise raw text into a list of index terms.

  Steps: trim and lower-case, split into `\\w+` tokens, drop English stop
  words, lemmatize the remaining tokens, and drop any lemma that is itself a
  stop word.

  Stop words are removed *before* lemmatizing because the WordNet lemmatizer
  rewrites some of them into forms that are no longer in the stop list
  (e.g. "was" becomes "wa"), which previously let them leak into the output.

  Args:
    text: Raw text (str).

  Returns:
    List of lower-case, lemmatized tokens with stop words removed.
  """
  tokens = nltk.RegexpTokenizer(r'\w+').tokenize(text.strip().lower())

  stop_words = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()

  terms = []
  for token in tokens:
    if token in stop_words:
      continue
    lemma = lemmatizer.lemmatize(token)
    # Keep the original post-lemmatization filter as well, so everything the
    # previous implementation removed is still removed.
    if lemma not in stop_words:
      terms.append(lemma)

  return terms

## Storing the data scraped in CSV file

In [None]:
fields = ['url', 'title', 'text', 'class']

# Each listing is written with its own class label; handling both in one loop
# keeps the download/parse/write logic in a single place.
sources = [
    (report_url_list, 'report'),
    (announce_url_list, 'announcement'),
]

# newline='' is what the csv module requires of the file object so that it can
# control line endings itself; the explicit encoding keeps the file readable
# regardless of the runtime's locale.
with open('/content/drive/MyDrive/IR Package/data/news.csv', 'w', newline='', encoding='utf-8') as csvFile:
  writer = csv.DictWriter(csvFile, fieldnames = fields)
  writer.writeheader()

  # URLs already written, shared across both listings so an article that
  # appears in both is stored once (under the first label encountered).
  seen_urls = set()

  for url_list, label in sources:
    for url in url_list:
      if url in seen_urls:
        continue
      seen_urls.add(url)

      # Download and extract the article body/title with newspaper.
      a = Article(url)
      a.download()
      a.parse()
      text = ' '.join(preprocess(a.text))
      title = a.title
      writer.writerow({
          'url': url,
          'title': title,
          'text': text,
          'class': label
      })

## Scraping data for IPL match details

In [None]:
# Flattened scorecard text, one entry per match code (entry 0 is match 0001).
match = []
i = 1
# Site navigation text that trails the scorecard; everything from it onward is dropped.
menu_marker = "Match List IPL Records and Statistics Menu"
while True:
  url = 'http://www.howstat.com/cricket/Statistics/IPL/MatchScorecard.asp?MatchCode=' + f'{i:04}'

  print(url)

  driver.get(url)
  render = driver.page_source

  # Match codes are walked sequentially; the first missing one ends the crawl.
  page_text = str(BeautifulSoup(render, "lxml").text)
  if 'No Record Found' in page_text:
    break

  # The scorecard is taken from the third <table> of the page.
  scorecard_table = BeautifulSoup(render, 'html.parser').findAll('table')[2]

  # Strip markup and collapse all whitespace runs to single spaces.
  flat_text = " ".join(BeautifulSoup(str(scorecard_table), "lxml").text.split())

  # partition() yields the text before the marker, or the whole string when
  # the marker is absent.
  match.append(flat_text.partition(menu_marker)[0])
  i += 1

## Storing Match Details in CSV file

In [None]:
fields = ['url', 'title', 'text', 'class']

# Append scorecards to the corpus file. newline='' lets the csv module manage
# line endings itself; the encoding is explicit so it does not depend on the
# runtime's locale.
with open('/content/drive/MyDrive/IR Package/data/news.csv', 'a', newline='', encoding='utf-8') as csvFile:
  writer = csv.DictWriter(csvFile, fieldnames = fields)
  # Match codes are 1-based and were scraped in order, so position + 1 is the code.
  for match_id, scorecard in enumerate(match, start=1):
    writer.writerow({
        'url': 'http://www.howstat.com/cricket/Statistics/IPL/MatchScorecard.asp?MatchCode=' + f'{match_id:04}',
        'title': 'Howstat IPL match ID ' + str(match_id),
        'text': ' '.join(preprocess(scorecard)),
        'class': 'scorecard'
    })

## Scraping data for Player profiles

In [None]:
url = 'http://www.howstat.com/cricket/Statistics/IPL/PlayerList.asp'
driver.get(url)
render = driver.page_source
soup = BeautifulSoup(render, 'html.parser')
# Third table of the player list page; nothing below in this cell reads it.
table = soup.findAll('table')[2]

# One CSV-ready record (url/title/text/class) per player profile.
pp = []

# Navigation text that trails the page content. The same marker is used for
# both the presence check and the cut, matching the scorecard cell; previously
# the cut used a shorter marker and left a dangling "Match " at the end.
# NOTE(review): if player pages use a different menu label, adjust this marker.
menu_marker = "Match List IPL Records and Statistics Menu"

for l in soup.findAll('a',{'class': "LinkTable"}):
  href = l.get('href')
  # Anchors without an href cannot be followed.
  if not href:
    continue
  u = 'http://www.howstat.com/cricket/Statistics/IPL/' + href

  driver.get(u)
  profile_render = driver.page_source

  # A single missing profile should not discard every remaining player, so
  # skip it rather than leaving the loop.
  if 'No Record Found' in str(BeautifulSoup(profile_render, "lxml").text):
    continue

  # Use a separate name so the list page's `soup` is not overwritten mid-loop.
  profile_soup = BeautifulSoup(profile_render, 'html.parser')
  html = str(profile_soup.findAll('table')[2])

  # Strip markup and collapse whitespace runs to single spaces.
  html = BeautifulSoup(html, "lxml").text
  html = " ".join(html.split())

  # Text before the marker, or the whole string when the marker is absent.
  html = html.partition(menu_marker)[0]

  pp.append({
      'url': u,
      'title': str(l.get_text()).strip(),
      'text': ' '.join(preprocess(html)),
      'class': 'profile'
  })

## Storing Player Data in CSV

In [None]:
# Declared here (as in the other CSV cells) so this cell does not depend on
# `fields` leaking from an earlier one.
fields = ['url', 'title', 'text', 'class']

# Append the player profile records to the corpus file. newline='' lets the
# csv module manage line endings itself.
with open('/content/drive/MyDrive/IR Package/data/news.csv', 'a', newline='', encoding='utf-8') as csvFile:
  writer = csv.DictWriter(csvFile, fieldnames = fields)
  writer.writerows(pp)

# IR Model

## Importing libraries for IR Model

In [None]:
import collections
import math
import pandas as pd

## Getting data from CSV

In [None]:
# Load the scraped corpus; the scraping cells write the columns url, title, text and class.
df = pd.read_csv('/content/drive/MyDrive/IR Package/data/news.csv')

## Boolean Model

In [None]:
# token -> list of row indices of `df` whose text contains the token (filled by the next cell).
invertedIndex = {}

In [None]:
# Populate the inverted index: every distinct token of a document gets that
# document's row index appended to its postings list (at most once).
for i in range(df.shape[0]):
  tokens = df['text'][i].split(' ')
  # dict.fromkeys drops repeated tokens while keeping first-seen order, so
  # index keys are still created in the order tokens first appear.
  for token in dict.fromkeys(tokens):
    postings = invertedIndex.setdefault(token, [])
    if i not in postings:
      postings.append(i)

In [None]:
len(invertedIndex)

11623

In [None]:
query = str(input('Enter your query: '))
query = query.lower()

# Run the query through the same pipeline as the indexed text so that its
# tokens (lemmatized, stop words removed) can actually match index keys.
word_tokens = preprocess(query)

# Boolean AND over the postings of every query term found in the index.
# `None` marks "no term applied yet"; it must stay distinct from an empty set,
# otherwise an empty intersection would be restarted by the next term.
d = None
for token in word_tokens:
  postings = invertedIndex.get(token)
  # Terms that are not in the index are ignored rather than raising KeyError.
  if not postings:
    continue
  d = set(postings) if d is None else d & set(postings)

d = sorted(d) if d else []

for i in d:
  print(df['title'][i])
  print(df['url'][i])
  print('\n')

In [None]:
len(invertedIndex['dhoni'])

411

## Vector Space Model

### Generate Term-Document Matrix

In [None]:
# Per-document raw term counts, filled by the next cell.
termDocumentMatrix = []
# Document frequency of a term = length of its postings list.
documentfrequency = {term: len(postings) for term, postings in invertedIndex.items()}

In [None]:
# All-zero count template over the whole vocabulary; it is only read inside
# the loop, so it is built once.
zero_counts = dict.fromkeys(list(invertedIndex.keys()), 0)

for doc_idx in range(df.shape[0]):
  # Raw term counts of this document.
  doc_counts = dict(collections.Counter(df['text'][doc_idx].split(' ')))

  # Full-vocabulary count dict for the document. Keys are visited in the
  # iteration order of the merged key set, as before.
  all_terms = set(zero_counts).union(doc_counts)
  termDocumentMatrix.append(
      {term: zero_counts.get(term, 0) + doc_counts.get(term, 0) for term in all_terms}
  )

### Generate document vector with tf-idf formula

In [None]:
# Number of documents in the corpus (rows of df); used in the idf term.
N = df.shape[0]
# One {term: tf-idf weight} dict per document, filled by the next cell.
documentVectors = []

In [None]:
# Turn raw counts into tf-idf weights:
#   weight = (tf / max tf in the document) * log10((N + 1) / (0.5 + df))
for doc_idx in range(N):
  counts = termDocumentMatrix[doc_idx]
  # Highest raw count in this document, used to normalise tf.
  peak = max(list(counts.values()))

  documentVectors.append({
      term: (tf / peak) * math.log10((N + 1) / (0.5 + documentfrequency[term]))
      for term, tf in counts.items()
  })

### Cosine similarity function for ranking

In [None]:
def cosineSimilarity(doc1, doc2):
  num = 0
  doc1 = {k:v for k, v in doc1.items() if v > 0}
  doc2 = {k:v for k, v in doc2.items() if v > 0}

  common_terms = list(set(doc1.keys()) & set(doc2.keys()))
  for term in common_terms:
    num += (doc1[term] * doc2[term])

  d1 = math.sqrt(sum(map(lambda i : i * i, doc1.values())))
  d2 = math.sqrt(sum(map(lambda i : i * i, doc2.values())))

  cos_sim = num / (d1 * d2)

  return round(cos_sim, 4)

### Document Vectors to Clusters for K Means Clustering

In [None]:
# One row per document, one column per term, built from the tf-idf dicts.
dv = pd.DataFrame(documentVectors)

### Importing K-Means Clustering Library

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

### Elbow method to find the optimal clusters

In [None]:
def elbow_method(Y_sklearn, max_clusters=50, verbose=False):
    """Plot within-cluster sum of squares (inertia) against the number of clusters.

    One KMeans model is fitted for every cluster count from 1 up to (but not
    including) `max_clusters`, and the resulting inertias are plotted so the
    "elbow" can be read off the curve.

    Args:
        Y_sklearn: Samples to cluster (array-like or DataFrame, one row per sample).
        max_clusters: Exclusive upper bound on the cluster count; the default
            of 50 reproduces the previously hard-coded range(1, 50).
        verbose: Forwarded to KMeans. Defaults to False because per-iteration
            logging across dozens of fits floods the notebook output.
    """
    # KMeans cannot use more clusters than there are samples.
    upper = min(max_clusters, len(Y_sklearn) + 1)
    number_clusters = range(1, upper)

    # Fit one model at a time and keep only its inertia, instead of holding
    # every fitted model in memory.
    score = []
    for n_clusters in number_clusters:
        km = KMeans(n_clusters=n_clusters, max_iter = 300, n_init = 10, random_state = 0, verbose = verbose)
        score.append(km.fit(Y_sklearn).inertia_)

    plt.plot(number_clusters, score)
    plt.xlabel('Number of Clusters')
    plt.ylabel('wcss')
    plt.title('Elbow Method')
    plt.show()

elbow_method(dv)

Initialization complete
Iteration 0, inertia 18581.32910072862.
Iteration 1, inertia 16219.796133458167.
Converged at iteration 1: strict convergence.
Initialization complete
Iteration 0, inertia 20790.40804307319.
Iteration 1, inertia 16219.796133458167.
Converged at iteration 1: strict convergence.
Initialization complete
Iteration 0, inertia 17314.832545652655.
Iteration 1, inertia 16219.796133458167.
Converged at iteration 1: strict convergence.
Initialization complete
Iteration 0, inertia 29494.36846257533.
Iteration 1, inertia 16219.796133458167.
Converged at iteration 1: strict convergence.
Initialization complete
Iteration 0, inertia 20263.77874588626.
Iteration 1, inertia 16219.796133458167.
Converged at iteration 1: strict convergence.
Initialization complete
Iteration 0, inertia 18417.243558119335.
Iteration 1, inertia 16219.796133458167.
Converged at iteration 1: strict convergence.
Initialization complete
Iteration 0, inertia 110232.03850654222.
Iteration 1, inertia 16219.

KeyboardInterrupt: ignored

### Selection of K and Modelling with Document Vectors

In [None]:
# Number of clusters used for the final model.
k = 10
# A single initialisation (n_init = 1) with a fixed seed; verbose = True prints the inertia of each iteration.
model = KMeans(n_clusters = k, max_iter = 300, n_init = 1, random_state = 0, verbose = True)
# Fit on the tf-idf document vectors.
model.fit(dv)

Initialization complete
Iteration 0, inertia 17001.64071030914
Iteration 1, inertia 14915.799682965853
Iteration 2, inertia 14821.402963133727
Iteration 3, inertia 14738.590167569037
Iteration 4, inertia 14714.856865398788
Iteration 5, inertia 14686.354832037783
Iteration 6, inertia 14633.647979958105
Iteration 7, inertia 14613.92694855216
Iteration 8, inertia 14482.528126630075
Iteration 9, inertia 14239.56440662048
Iteration 10, inertia 14213.571708764994
Iteration 11, inertia 14212.7440106169
Iteration 12, inertia 14212.474667882503
Iteration 13, inertia 14212.463057583627
Converged at iteration 13: strict convergence.


KMeans(n_clusters=10, n_init=1, random_state=0, verbose=True)

In [None]:
collections.Counter(model.labels_)

Counter({0: 619,
         1: 51,
         2: 30,
         3: 31,
         4: 3,
         5: 1388,
         6: 1,
         7: 40,
         8: 19,
         9: 671})

### Providing Cluster ID for Each Document Vector

In [None]:
# Attach each document's assigned cluster label as an extra column of dv.
# NOTE(review): this adds a non-feature column to the frame that model.fit(dv)
# was given, so re-running the fit cell after this one would treat clusterID
# as a feature.
dv['clusterID'] = model.labels_

### Relevant Documents based on query

In [None]:
## Preprocessing of query

query = str(input('Query: '))
word_tokens = preprocess(query)

## Feature columns of the document vectors: everything in `dv` except the
## cluster label, which is metadata and not a term weight

feature_columns = [c for c in dv.columns if c != 'clusterID']

## Calculation of query frequency
## (terms that never occur in the corpus are ignored: they have no document
## frequency and cannot match any document)

f = collections.Counter(t for t in word_tokens if t in documentfrequency)

## Calculation of tf-idf of query (only non-zero weights are stored)

query_vector = {}
if f:
  maxf = max(f.values())
  for term, tf in f.items():
    document_frequency = documentfrequency[term]
    query_vector[term] = (tf / maxf) * math.log10((N + 1) / (0.5 + document_frequency))

scores = {}

if query_vector:
  ## Finding cluster for the given query
  ## (the frame is aligned to the document-vector columns, with 0 for absent
  ## terms, so the model sees features in the order it was fitted on)

  query_frame = pd.DataFrame([query_vector]).reindex(columns=feature_columns, fill_value=0)
  p = model.predict(query_frame)

  ## Find all documents in cluster where query is present
  ## (clusterID is dropped so it is not mistaken for a term weight, which
  ## would inflate the document norm in the cosine similarity)

  rec = {}
  for cid in p:
    qdf = dv.query("clusterID == @cid").drop(columns='clusterID')
    rec = {**rec, **qdf.to_dict('index')}

  ## Similarity scores of the retrieved documents with query

  for doc_id in rec:
    score = cosineSimilarity(rec[doc_id], query_vector)
    if score > 0:
      scores[doc_id] = score

print('\n')

## Ranking of documents based on highest similarity

scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse = True))
if not scores:
  print('No matching documents found.')
for i in list(scores.keys())[:30]:
  print(df['title'][i])
  print(df['url'][i])
  print('\n')

Query: Ankit Sharma


Gujarat Lions bring in Irfan Pathan and Ankit Soni as replacements
https://www.iplt20.com/news/98245


Ankit Rajpoot traded to RR; Trent Boult traded to MI
https://www.iplt20.com/news/197297


Rising Pune Supergiant acquire Shardul Thakur
https://www.iplt20.com/news/38734


Teams announce squads for IPL 2013
https://www.iplt20.com/news/40563


Howstat IPL match ID 441
http://www.howstat.com/cricket/Statistics/IPL/MatchScorecard.asp?MatchCode=0441


Howstat IPL match ID 448
http://www.howstat.com/cricket/Statistics/IPL/MatchScorecard.asp?MatchCode=0448


Howstat IPL match ID 511
http://www.howstat.com/cricket/Statistics/IPL/MatchScorecard.asp?MatchCode=0511


Howstat IPL match ID 496
http://www.howstat.com/cricket/Statistics/IPL/MatchScorecard.asp?MatchCode=0496


Howstat IPL match ID 287
http://www.howstat.com/cricket/Statistics/IPL/MatchScorecard.asp?MatchCode=0287


Howstat IPL match ID 534
http://www.howstat.com/cricket/Statistics/IPL/MatchScorecard.asp?MatchCo