### Final version: Scraping Sudoku UK


In [12]:
from bs4 import BeautifulSoup
import urllib3
import requests 
import datetime
import numpy as np
import pandas as pd
from urllib.request import urlopen 
from urllib.error import HTTPError 
from urllib.error import URLError
from tqdm import tqdm

## FUNCTIONS

## Total_sudoku()
Find the total number of Sudoku puzzles to scrape.

In [3]:
def total_sudoku(timeout=30):
    """Read the puzzle count shown in the title of the Daily Sudoku page.

    The page is downloaded and parsed with ``html5lib``; the text of the
    first ``<span class="newtitle">`` is cut at its first comma and every
    ``#`` is removed from what remains.

    Parameters
    ----------
    timeout : float, optional
        Passed to ``requests.get`` so that an unresponsive server cannot
        block the notebook forever.

    Returns
    -------
    str
        The title fragment before the first comma, without ``#``. It is
        returned as text; the caller converts it to a number.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    r = requests.get("http://www.sudoku.org.uk/Daily.asp", timeout=timeout)
    # Fail here with a clear HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html5lib')
    title = soup.find('span', attrs={'class': 'newtitle'}).get_text()
    return title.split(",")[0].replace("#", "")


## List_dates():

To extract all the data we need to open each page, which corresponds to a different day on which a solution was posted, check whether the page is working, and save its link in a list.

In [4]:
def list_dates(total):
    """Collect the solution-page URLs that can be opened successfully.

    For every day offset ``i`` in ``range(2, total)`` the date ``i`` days
    before today is formatted as ``day/month/year`` (no zero padding) and
    appended to the solution-page address. Each address is opened once;
    addresses that raise ``HTTPError`` or ``URLError`` are silently skipped.

    Parameters
    ----------
    total : int
        Exclusive upper bound of the day offsets to probe.

    Returns
    -------
    list of str
        The URLs that opened without error, in probing order (smallest
        offset, i.e. most recent day, first).
    """
    base_url = "http://www.sudoku.org.uk/DailySudoku.asp?solution=please&day="
    # Read the clock once so every offset is relative to the same day, even
    # if this long-running loop is still going after midnight.
    today = datetime.date.today()
    urls = []
    for i in tqdm(range(2, total)):
        d = today - datetime.timedelta(days=i)
        url = base_url + "{}/{}/{}".format(d.day, d.month, d.year)
        try:
            # The context manager closes the response right after the probe,
            # instead of leaving one open connection per URL.
            with urlopen(url):
                pass
        except URLError:
            # HTTPError is a subclass of URLError, so both cases land here;
            # an unreachable page is simply left out of the result.
            continue
        urls.append(url)
    return urls


## Get_html()
Using the list of days, we open each URL and extract all of its HTML code.

In [5]:
def get_html(url, timeout=30):
    """Download ``url`` and return its body parsed with ``html.parser``.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Passed to ``requests.get`` so that a single stalled request cannot
        hang a loop that fetches many pages.

    Returns
    -------
    BeautifulSoup
        The parsed document.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')

## Consolidate()
Extract all the Sudokus and their solutions, the level of difficulty, the number of people who solved each Sudoku, and the average time in minutes.

In [6]:
def consolidate(urls):
    """Scrape every page in ``urls`` and gather its puzzle data in parallel lists.

    For each page, all ``<td>`` elements with class ``InnerTDone2`` or
    ``InnerTDone`` are read in document order:

    * the solution string is the concatenation of every cell's text;
    * the puzzle string keeps the text of cells whose class list is exactly
      ``['InnerTDone2']`` and uses ``'.'`` for every other cell.

    The remaining fields come from the third child of the first ``<td>`` of
    the first ``<table>``: the level is the first word after the first
    ``", "`` in the text of its child 1, and people / average time / unit are
    whitespace-separated tokens 0, 6 and 7 of the string form of its child 3.

    Parameters
    ----------
    urls : iterable of str
        Solution-page addresses to fetch with ``get_html``.

    Returns
    -------
    tuple
        ``(urls, level, people, av_time, unit, sudoku, solution)`` where
        ``urls`` is the argument itself and the others are lists of strings,
        one entry per URL.
    """
    solution, sudoku, level, people, av_time, unit = [], [], [], [], [], []
    cell_classes = ['InnerTDone2', 'InnerTDone']

    for url in tqdm(urls):
        soup = get_html(url)
        cells = soup.find_all('td', attrs={'class': cell_classes})

        # Puzzle view: only the 'InnerTDone2' cells show their value.
        puzzle_chars = [
            cell.text if cell.attrs['class'] == ['InnerTDone2'] else '.'
            for cell in cells
        ]
        # Solution view: every cell shows its value.
        solved_chars = [cell.text for cell in cells]

        sudoku.append(''.join(puzzle_chars))
        solution.append(''.join(solved_chars))

        # Positional lookup into the page header; depends on the site's layout.
        header = list(list(soup.table.td)[2])
        level.append(str(header[1].get_text()).split(", ")[1].split()[0])
        stats = str(header[3]).split()
        people.append(stats[0])
        av_time.append(stats[6])
        unit.append(stats[7])

    return urls, level, people, av_time, unit, sudoku, solution

# Extract Data

In [7]:
## Build the list of reachable solution-page URLs: the puzzle count read from
## the site is converted to a number and used as the upper bound of the day
## offsets that list_dates() probes.
urls = list_dates(pd.to_numeric(total_sudoku()))

100%|██████████| 5270/5270 [19:00<00:00,  4.62it/s]


#### The website reports 5270 puzzles (which means the first Sudoku was posted in February 2006), but before March 7, 2006 the site has no records of the number of players or the average time. For this reason we are going to cut those URLs.

In [8]:
## Find the position of the first URL whose query string contains
## '=7/3/2006'; this position is the cut-off used to slice the URL list.
## Raises IndexError if no URL matches.
index = urls.index([x for x in urls if '=7/3/2006' in x][0])

In [9]:
## Keep only the URLs located before the cut-off position (the URL at
## `index` itself is excluded by the slice).
new_urls = urls[:index]

In [10]:
## Scrape every selected page. Note that `urls` is rebound here to the first
## element returned by consolidate(), which is the list passed in (`new_urls`).
urls, level, people,av_time, unit, sudoku, solution = consolidate(new_urls)

100%|██████████| 5029/5029 [19:06<00:00,  4.39it/s]  


# Create Dataframe

In [11]:
## Assemble one row per scraped page from the parallel lists returned by
## consolidate(); zip() pairs the lists element by element.
df = pd.DataFrame(list(zip(urls, level, people,av_time, unit, sudoku, solution)), 
               columns =['URL', 'Level','People','Average-Time', 'Unit-Time', 'Sudoku', 'Solution']) 

In [12]:
## Preview the first rows of the scraped data.
df.head()

Unnamed: 0,URL,Level,People,Average-Time,Unit-Time,Sudoku,Solution
0,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Diabolical,254,25,minutes,8...74...1.........7.5.9.4....65.4..73.....296...,8591742361648325973725698412916574837354186296...
1,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Moderate,281,12,minutes,.5.....7.9......6...21.75.8.4..69.3...8...4......,8539461729175823644621375981458692376283754197...
2,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Diabolical,265,22,minutes,...45.2..3.........8....4..7...94.3.9..5.2.76....,6974532183248617955819274637186945329435128762...
3,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Tough,279,19,minutes,9......31.3...8.....5.7.9..3...4...8.916..54.4...,9274568316341982758153729643527496187916835424...
4,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Moderate,301,14,minutes,9.......2784.1.5.....4.....2.53.1.7...7...8......,9615387427846125395234796182953814761479568236...


In [13]:
## Check the number of rows and columns.
df.shape

(5029, 7)

## Create Index


In [14]:
## Add an explicit 'Id' column holding a copy of the DataFrame's index.
df['Id'] = df.index

In [15]:
## Preview the first rows again to confirm the new 'Id' column.
df.head()

Unnamed: 0,URL,Level,People,Average-Time,Unit-Time,Sudoku,Solution,Id
0,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Diabolical,254,25,minutes,8...74...1.........7.5.9.4....65.4..73.....296...,8591742361648325973725698412916574837354186296...,0
1,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Moderate,281,12,minutes,.5.....7.9......6...21.75.8.4..69.3...8...4......,8539461729175823644621375981458692376283754197...,1
2,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Diabolical,265,22,minutes,...45.2..3.........8....4..7...94.3.9..5.2.76....,6974532183248617955819274637186945329435128762...,2
3,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Tough,279,19,minutes,9......31.3...8.....5.7.9..3...4...8.916..54.4...,9274568316341982758153729643527496187916835424...,3
4,http://www.sudoku.org.uk/DailySudoku.asp?solut...,Moderate,301,14,minutes,9.......2784.1.5.....4.....2.53.1.7...7...8......,9615387427846125395234796182953814761479568236...,4


In [16]:
## Reorder the columns so that 'Id' comes first and 'URL' last.
df = df[['Id', 'Level', 'Sudoku','Solution', 'People', 'Average-Time','Unit-Time','URL']]

# Function to visualize Sudoku

In [17]:
def split(element):
    """Return the items of ``element`` as a list, one entry per item.

    For a string this yields one single-character string per character.
    """
    return list(element)

def transform_matrix(element):
    """Arrange the items of ``element`` in a 2-D array with nine columns.

    ``element`` is first broken into individual items with ``split``; the
    number of rows is inferred from the item count, so that count must be a
    multiple of nine or NumPy raises ``ValueError``.
    """
    cells = split(element)
    return np.asarray(cells).reshape(-1, 9)

In [18]:
## Display the solution string of the row labelled 4 as a nine-column grid.
transform_matrix(df.Solution[4])

array([['9', '6', '1', '5', '3', '8', '7', '4', '2'],
       ['7', '8', '4', '6', '1', '2', '5', '3', '9'],
       ['5', '2', '3', '4', '7', '9', '6', '1', '8'],
       ['2', '9', '5', '3', '8', '1', '4', '7', '6'],
       ['1', '4', '7', '9', '5', '6', '8', '2', '3'],
       ['6', '3', '8', '2', '4', '7', '9', '5', '1'],
       ['4', '5', '6', '1', '9', '3', '2', '8', '7'],
       ['3', '7', '2', '8', '6', '4', '1', '9', '5'],
       ['8', '1', '9', '7', '2', '5', '3', '6', '4']], dtype='<U1')

In [19]:
## Display the puzzle string of the row labelled 4 as a nine-column grid
## ('.' marks the cells that are not given).
transform_matrix(df.Sudoku[4])

array([['9', '.', '.', '.', '.', '.', '.', '.', '2'],
       ['7', '8', '4', '.', '1', '.', '5', '.', '.'],
       ['.', '.', '.', '4', '.', '.', '.', '.', '.'],
       ['2', '.', '5', '3', '.', '1', '.', '7', '.'],
       ['.', '.', '7', '.', '.', '.', '8', '.', '.'],
       ['.', '3', '.', '2', '.', '7', '9', '.', '1'],
       ['.', '.', '.', '.', '.', '3', '.', '.', '.'],
       ['.', '.', '2', '.', '6', '.', '1', '9', '5'],
       ['8', '.', '.', '.', '.', '.', '.', '.', '4']], dtype='<U1')

# Export Data Frame

In [21]:
## Save the full scraped table to disk.
## NOTE(review): with pandas' default settings the row index is presumably
## written as an extra unnamed leading column - confirm whether index=False
## is wanted here.
df.to_csv('data/data.csv')

In [9]:
## Reload the exported table from disk; this rebinds `df`.
df = pd.read_csv('data/data.csv')

## Create the Dataframe to use with the structure of the Database

In [10]:
## Keep only the columns used by the database structure, in this order.
df = df[['Id','Sudoku','Solution','Level','People','Average-Time']]

In [11]:
## Save the reduced table to disk.
## NOTE(review): with pandas' default settings the row index is presumably
## written as an extra unnamed leading column next to 'Id' - confirm whether
## index=False is wanted here.
df.to_csv('data/dataset.csv')