# Prediction using PySpark

In [1]:
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext

# Stop a SparkContext left in the namespace by an earlier run of this cell;
# on a fresh kernel the name `sc` does not exist yet, which is reported below.
try:
    sc.stop()
except NameError:
    print("sc not defined")

# Build the configuration step by step: all local cores, fixed application name.
config = SparkConf()
config.setMaster("local[*]")
config.setAppName("ClassifyUrl")
sc = SparkContext(conf=config)

sc not defined


In [2]:
from pyspark.sql import SparkSession

# A SparkContext was already started above from `config` (master "local[*]",
# app name "ClassifyUrl"). getOrCreate() attaches the session to that running
# context, so a different master/appName given here would not take effect.
# Building from the same SparkConf keeps the session and context consistent.
spark = SparkSession.builder \
                    .config(conf=config) \
                    .getOrCreate()

In [3]:
# Input files: the labelled URL table and the raw HTML dump keyed by Webpage_id.
train_csv = "bigdata/train.csv"
html_csv = "bigdata/train/html_data.csv"

In [4]:
%%time
# Load the training table; the first line is a header and column types are inferred.
train = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv(train_csv)
)

# Inspect the inferred column types
train.printSchema()

root
 |-- Webpage_id: integer (nullable = true)
 |-- Domain: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Tag: string (nullable = true)

Wall time: 7.2 s


In [5]:
# Row count first, then the first five rows as raw Row objects.
total_records = train.count()
print('Total Records in Training Dataset :', total_records)
train.take(5)

Total Records in Training Dataset : 53447


[Row(Webpage_id=1, Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/marketing/tecfidera-gilenya-and-aubagio-s-3-way-battle-for-ms-share-about-to-get-more-interesting', Tag='news'),
 Row(Webpage_id=2, Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/pharma/novo-equipped-to-weather-storm-u-s-diabetes-market-ceo-says', Tag='news'),
 Row(Webpage_id=3, Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/pharma/another-exec-departs-troubled-endo-and-time-it-s-for-another-drugmaker', Tag='news'),
 Row(Webpage_id=4, Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/pharma/teva-buy-biosim-specialist-celltrion-it-wouldn-t-say-no', Tag='news'),
 Row(Webpage_id=5, Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/marketing/actress-marissa-tomei-partners-allergan-restasis-to-drive-dry-eye-awareness', Tag='news')]

In [6]:
# head() returns a list of Row objects, which is hard to read;
# show() prints the same rows as an aligned text table.
train.show(n=5)

+----------+--------------------+--------------------+----+
|Webpage_id|              Domain|                 Url| Tag|
+----------+--------------------+--------------------+----+
|         1|www.fiercepharma.com|http://www.fierce...|news|
|         2|www.fiercepharma.com|http://www.fierce...|news|
|         3|www.fiercepharma.com|http://www.fierce...|news|
|         4|www.fiercepharma.com|http://www.fierce...|news|
|         5|www.fiercepharma.com|http://www.fierce...|news|
+----------+--------------------+--------------------+----+
only showing top 5 rows



In [7]:
# Number of columns in the training set, followed by their names.
column_names = train.columns
print('No. of cols in train dataset : ', len(column_names))
column_names

No. of cols in train dataset :  4


['Webpage_id', 'Domain', 'Url', 'Tag']

In [8]:
# How to get the summary statistics (count, mean, standard deviation, min, max) of the columns in a DataFrame?
train.describe().show()

# Observation:
# describe() also reports on string columns, but mean and stddev are null for
# them, and min & max follow the character ordering of the string values
# rather than any numeric meaning.

+-------+------------------+--------------------+--------------------+--------------+
|summary|        Webpage_id|              Domain|                 Url|           Tag|
+-------+------------------+--------------------+--------------------+--------------+
|  count|             53447|               53447|               53447|         53447|
|   mean| 39920.78603102139|                null|                null|          null|
| stddev|22945.942450142324|                null|                null|          null|
|    min|                 1|  1.eyefortravel.com|http://1.eyefortr...|clinicalTrials|
|    max|             79345|zoonosis.conferen...|https://zoologica...|        thesis|
+-------+------------------+--------------------+--------------------+--------------+



'\nObservation:\nAs we can see that, describe operation is working for String type column but the output for mean, stddev are null and min & max values are calculated based on ASCII value of categories.\n'

In [9]:
%%time
# Project a subset of columns by name and preview the first five rows.
train.select(['Webpage_id', 'Domain', 'Tag']).show(n=5)

+----------+--------------------+----+
|Webpage_id|              Domain| Tag|
+----------+--------------------+----+
|         1|www.fiercepharma.com|news|
|         2|www.fiercepharma.com|news|
|         3|www.fiercepharma.com|news|
|         4|www.fiercepharma.com|news|
|         5|www.fiercepharma.com|news|
+----------+--------------------+----+
only showing top 5 rows

Wall time: 400 ms


In [10]:
%%time
# Distinct-value counts for Domain and for Tag (the target classes), in that order.
tuple(
    train.select(column_name).distinct().count()
    for column_name in ('Domain', 'Tag')
)

Wall time: 10.9 s


(3974, 9)

In [11]:
# Count the null values in each string column of the training set.
print('Count of Missing values in Train Dataset :\n')
for column_name in ('Domain', 'Url', 'Tag'):
    null_rows = train.filter(train[column_name].isNull())
    print(column_name + ' :', null_rows.count())

Count of Missing values in Train Dataset :

Domain : 0
Url : 0
Tag : 0


In [12]:
# How to drop every row that contains a null value?
# The call returns a new DataFrame; `train` itself is left unchanged.
train.na.drop().count()

53447

In [13]:
# How to replace nulls in the Domain column with a constant such as 'www.missing.in'?
# The call returns a new DataFrame; `train` itself is left unchanged.
missing_domain = 'www.missing.in'
train.na.fill(missing_domain, subset=['Domain']).count()

53447

In [14]:
# How to draw a random sample from the base DataFrame?
train.sample(
    withReplacement=False,  # each row may be picked at most once
    fraction=0.0001,        # expected share of rows to keep
    seed=42,                # fixed seed so the sample is reproducible
).show()

+----------+--------------------+--------------------+-----------+
|Webpage_id|              Domain|                 Url|        Tag|
+----------+--------------------+--------------------+-----------+
|     18472|cancerci.biomedce...|https://cancerci....|publication|
|     26364|www.naturalnews.c...|http://www.natura...|       news|
|     28191|         twitter.com|https://twitter.c...|     others|
+----------+--------------------+--------------------+-----------+



In [15]:
# How to find the number of rows we have per Domain?
from pyspark.sql.functions import col
from pyspark.sql.functions import desc

# Shared prefix of all three variants: per-domain row counts, keeping only
# the domains with more than 225 rows.
busy_domains = train.groupBy('Domain').count().filter("`count` > 225")

# Variant 1: descending sort through Column.desc()
busy_domains.sort(col('count').desc()).show(10) # Show Count of Top 10

# Variant 2: descending sort through the desc() function
busy_domains.sort(desc('count')).show(10) # Show Count of Top 10

# Variant 3: orderBy() with ascending=False
busy_domains.orderBy('count', ascending=False).show(10) # Show Count of Top 10

+--------------------+-----+
|              Domain|count|
+--------------------+-----+
|thesis.library.ca...|  301|
|academiccommons.c...|  300|
|  www.dart-europe.eu|  300|
|       curate.nd.edu|  300|
|      dspace.mit.edu|  300|
|ecommons.cornell.edu|  300|
|     www.nice.org.uk|  230|
|www.ncbi.nlm.nih.gov|  226|
+--------------------+-----+

+--------------------+-----+
|              Domain|count|
+--------------------+-----+
|thesis.library.ca...|  301|
|academiccommons.c...|  300|
|  www.dart-europe.eu|  300|
|       curate.nd.edu|  300|
|      dspace.mit.edu|  300|
|ecommons.cornell.edu|  300|
|     www.nice.org.uk|  230|
|www.ncbi.nlm.nih.gov|  226|
+--------------------+-----+

+--------------------+-----+
|              Domain|count|
+--------------------+-----+
|thesis.library.ca...|  301|
|academiccommons.c...|  300|
|  www.dart-europe.eu|  300|
|       curate.nd.edu|  300|
|      dspace.mit.edu|  300|
|ecommons.cornell.edu|  300|
|     www.nice.org.uk|  230|
|www.ncbi.nl

In [16]:
%%time 
# Wall time: 2.13s - 2.39s

# OBJECTIVE : Get just the domain from URLs
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import tldextract

def extract_domain(url):
    """Return the domain label of a URL or host name, without subdomain or suffix.

    For example 'www.fiercepharma.com' becomes 'fiercepharma'.

    Args:
        url: URL or host name string, or None.

    Returns:
        The `domain` part reported by tldextract, or None when `url` is None.
    """
    # Null column values reach a Spark UDF as None; pass them through instead
    # of failing inside tldextract (same convention as the other extractors).
    if url is None:
        return None
    return tldextract.extract(url).domain

# Wrap the Python function as a Spark UDF that yields strings.
# (An inline alternative: udf(lambda url : tldextract.extract(url).domain, StringType()))
extract_domain_udf = udf(extract_domain, returnType=StringType())

# Overwrite the Domain column with the extracted domain label and preview it.
train = train.withColumn('Domain', extract_domain_udf(train['Domain']))
train.show(n=5)

+----------+------------+--------------------+----+
|Webpage_id|      Domain|                 Url| Tag|
+----------+------------+--------------------+----+
|         1|fiercepharma|http://www.fierce...|news|
|         2|fiercepharma|http://www.fierce...|news|
|         3|fiercepharma|http://www.fierce...|news|
|         4|fiercepharma|http://www.fierce...|news|
|         5|fiercepharma|http://www.fierce...|news|
+----------+------------+--------------------+----+
only showing top 5 rows

Wall time: 3.82 s


In [17]:
from bs4 import BeautifulSoup # For Scraping HTML page
from bs4.element import Comment
import re

In [18]:
# Objective : Extract text from title tag of HTML source of web-page
def extract_title(page):
    """Return the stripped text of the first <title> element of an HTML page.

    Args:
        page: HTML source of the page, or None.

    Returns:
        The title text, or None when `page` is None or has no <title> tag.
    """
    if page is None:
        return None
    soup = BeautifulSoup(page, 'html.parser')
    title_tag = soup.find('title')
    # Identity check: bs4 tags define their own equality, so `is None` is the
    # unambiguous way to test for "no tag found".
    if title_tag is None:
        return None
    # The UTF-8 round trip with errors='ignore' drops any characters that
    # cannot be encoded.
    return title_tag.text.encode('utf-8', errors='ignore').decode('utf-8').strip()

In [19]:
#  OBJECTIVE: Functions to parse HTML content and extract text that matters.
def extract_body(page):
    """Return the <body> element of an HTML page.

    Args:
        page: HTML source (str or bytes), or None.

    Returns:
        None when `page` is None; the parsed <body> tag when one exists;
        otherwise `page` itself, unchanged.

    NOTE(review): the two non-None branches return different types (a bs4 tag
    versus the raw input). The original author left open whether the tag
    should be stringified before further processing; that is kept as is here.
    """
    if page is None:
        return None
    # from_encoding is only meaningful for byte input; bs4 ignores it (with a
    # warning) when the markup is already a str, so pass it for bytes only.
    parser_kwargs = {'from_encoding': 'utf-8'} if isinstance(page, bytes) else {}
    soup = BeautifulSoup(page, 'html.parser', **parser_kwargs)
    body_tag = soup.find('body')
    if body_tag is None:
        return page
    return body_tag

def is_visible_content(element):
    """Tell whether a parsed text node counts as visible page content.

    A node is rejected when its parent is one of the non-rendered containers
    listed below, or when the node itself is an HTML comment.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    if element.parent.name in hidden_parents:
        return False
    return not isinstance(element, Comment)

def remove_extra_spaces(str):
    """Collapse every run of whitespace to one space and trim both ends."""
    words = str.split()
    return u" ".join(words)

def extract_text(page):
    """Extract the visible text of an HTML page as one whitespace-normalised string.

    Args:
        page: HTML source of the page, or None.

    Returns:
        None when `page` is None; otherwise the visible text nodes joined by
        single spaces, with ',' and '|' removed.
    """
    if page is None:
        return None
    soup = BeautifulSoup(page, 'html.parser')
    # find_all(string=True) collects every text node, nested ones included;
    # it is the current spelling of the deprecated findAll(text=True).
    text_nodes = soup.find_all(string=True)
    visible_texts = filter(is_visible_content, text_nodes)
    text = u" ".join(remove_extra_spaces(t.strip()) for t in visible_texts)
    # ',' and '|' are dropped entirely -- presumably so the text cannot clash
    # with delimiters when written out; confirm against the consumer.
    text = text.replace(',', '').replace('|', '')
    # Empty text nodes leave runs of spaces behind after the join; collapse them.
    text = re.sub(r'\s\s+', ' ', text).strip()
    # The UTF-8 round trip with errors='ignore' drops any characters that
    # cannot be encoded.
    return text.encode('utf-8', errors='ignore').decode('utf-8').strip()

In [20]:
# NOTE(review): `config` is the SparkConf built in the first cell, and this call
# passes "spark" as the key and "spark_parser_mode" as the fallback value. The
# recorded output is just that fallback string, i.e. no such key is set.
# Intent unclear -- confirm what setting was meant to be looked up here.
config.get("spark", "spark_parser_mode")

'spark_parser_mode'

In [25]:
%%time

# OBJECTIVE : Read html_data.csv
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import lit # lit for literals

# UDF that turns raw HTML into its visible text (applied in a later cell).
extract_text_udf = udf(extract_text, StringType())

from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

# Explicit schema: an integer page id and the raw HTML string. These are the
# same types schema inference reports for this file, so supplying them up
# front saves the extra inference pass over the large multi-line input.
schema = StructType([
    StructField("Webpage_id", IntegerType()),
    StructField("Html", StringType())
])

html_df = spark.read.csv(html_csv,
                         header=True,
                         multiLine=True,   # HTML values span several lines
                         ignoreLeadingWhiteSpace=True,
                         ignoreTrailingWhiteSpace=True,
                         encoding="UTF-8",
                         sep=',',
                         quote='"',
                         escape='"',       # quotes inside a value are doubled
                         maxColumns=2,
                         schema=schema)

# Analyze Data types in dataset
html_df.printSchema()

root
 |-- Webpage_id: integer (nullable = true)
 |-- Html: string (nullable = true)

Wall time: 1min 44s


In [26]:
%%time

# Add a 'Title' column that holds an empty-string literal in every row.
html_df = html_df.withColumn(colName='Title', col=lit(''))
html_df.show(n=5)

+----------+--------------------+-----+
|Webpage_id|                Html|Title|
+----------+--------------------+-----+
|         1|<!DOCTYPE html>
<...|     |
|         2|<!DOCTYPE html>
<...|     |
|         3|<!DOCTYPE html>
<...|     |
|         4|<!DOCTYPE html>
<...|     |
|         5|<!DOCTYPE html>
<...|     |
+----------+--------------------+-----+
only showing top 5 rows

Wall time: 2min 43s


In [27]:
%%time
# Wall time: 1h 10min 6s

# OBJECTIVE : From html_data.csv loaded in df, extract title and text from html-page, and add them to train.csv as new columns

# 'Title' has so far only been an empty placeholder. Fill it from the raw HTML
# now, before the Html column is replaced by its extracted text below.
extract_title_udf = udf(extract_title, StringType())
html_df = html_df.withColumn('Title', extract_title_udf(html_df.Html))

# Transforming an existing column: raw HTML -> visible text, then rename it
html_df = html_df.withColumn('Html',extract_text_udf(html_df.Html))
html_df = html_df.withColumnRenamed('Html','Html2Text')
html_df.write.csv('bigdata/train/sparkoutput', header=True)

AttributeError: 'DataFrame' object has no attribute 'shape'

In [30]:
# Number of rows in html_df; the trailing note records the value the author observed.
html_df.count() # 79345

79345

In [32]:
# Fetch the first ten rows and display the tenth one.
first_ten_rows = html_df.take(10)
first_ten_rows[9]

Row(Webpage_id=10, Html2Text='Skip to main content Twitter LinkedIn Search Top Menu DDF 2017 FierceBiotech Jobs Resources Events Subscribe Main navigation Pharma M&A Regulatory Financials Corporate Legal Manufacturing M&A Outsourcing Regulatory Supply Chain Partnering Drug Safety Marketing Regulatory DTC Advertising Digital and Social Media Data and Analytics Launches Pharma Asia M&A R&D Regulatory Sales and Marketing Financials Manufacturing Animal Health R&D M&A Regulatory Veterinarian Financials Vaccines Drug Delivery R&D Regulatory Partnering Vaccines Deals Infectious Diseases R&D Regulatory Main navigation - Mobile Pharma M&A Regulatory Financials Corporate Legal Manufacturing M&A Outsourcing Regulatory Supply Chain Partnering Drug Safety Marketing Regulatory DTC Advertising Digital and Social Media Data and Analytics Launches Pharma Asia M&A R&D Regulatory Sales and Marketing Financials Manufacturing Animal Health R&D M&A Regulatory Veterinarian Financials Vaccines Drug Delivery 

In [None]:
# Stop the SparkContext that was created in the first cell of the notebook.
sc.stop()