# Prediction using PySpark

In [None]:
import findspark
# init() is called before the pyspark import on purpose: findspark exists to
# make a local Spark installation's Python libraries importable.
# NOTE(review): confirm it is actually required in this environment.
findspark.init()

from pyspark import SparkConf, SparkContext

# When this cell is re-run, shut down the SparkContext that an earlier run
# left in `sc`. On a fresh kernel `sc` does not exist yet, so the NameError
# is caught and reported instead.
try:
    sc.stop()
except NameError:
    print("sc not defined")

# Configure a local-mode context ("local[*]") for this application.
config = SparkConf()
config.setMaster("local[*]")
config.setAppName("ClassifyUrl")
sc = SparkContext(conf=config)

In [None]:
from pyspark.sql import SparkSession

# Build (or fetch) the SparkSession used for all DataFrame work below.
# NOTE(review): a SparkContext with master "local[*]" is created earlier in
# this notebook; getOrCreate() presumably reuses it, in which case the
# master("local") setting here has no effect - confirm.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Classify Urls")
    .getOrCreate()
)

In [None]:
# Input locations, relative to the notebook's working directory.
train_csv = "bigdata/train.csv"           # labelled pages (Webpage_id, Domain, Url, Tag are used below)
html_csv = "bigdata/train/html_data.csv"  # raw HTML per page (an Html column is used below)

In [None]:
%%time
# Load the training CSV: the first row supplies the column names and the
# column types are inferred from the data.
train = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(train_csv)
)

# Print the inferred column names and data types.
train.printSchema()

In [None]:
# count() is an action: it triggers a Spark job to count the rows.
print('Total Records in Training Dataset :', train.count())
# Last expression of the cell, so the notebook displays its value:
# the first 5 rows as a list of Row objects.
train.head(5)

In [None]:
# head() returns raw Row objects, which are hard to read in notebook output.
# show() prints the first rows as a formatted text table instead, so prefer it.
train.show(5)

In [None]:
# How many columns does the train DataFrame have, and what are their names?
print('No. of cols in train dataset : ', len(train.columns))
# Last expression of the cell: the notebook displays the list of column names.
train.columns

In [None]:
# How to get the summary statistics (count, mean, standard deviation, min, max)
# of the columns in a DataFrame? Called without arguments, describe() is
# applied to every column of train and show() prints the resulting table.
train.describe().show()

'''
Observation:
The describe operation also works on String-type columns, but for those columns mean and stddev are null, and the min & max values are determined by the character-code (ASCII) ordering of the category strings.
'''

In [None]:
%%time
# How to select column(s) from the DataFrame?
# select() returns a new DataFrame with only the named columns; show(5)
# prints its first five rows.
train.select('Webpage_id','Domain','Tag').show(5)

In [None]:
%%time
# How many distinct Domains and Tags (target classes) does train contain?
# The cell displays a (distinct Domain count, distinct Tag count) tuple.
tuple(train.select(name).distinct().count() for name in ('Domain', 'Tag'))

In [None]:
# Check for null values in the Domain, Url and Tag columns of train:
# each line reports how many rows have a null in that column.
print('Count of Missing values in Train Dataset :\n')
print('Domain :', train.where(train['Domain'].isNull()).count())
print('Url :', train.where(train['Url'].isNull()).count())
print('Tag :', train.where(train['Tag'].isNull()).count())

In [None]:
# How to drop all rows that contain a null value?
# dropna() returns a new DataFrame; the result is not assigned, so train
# itself is left unchanged.
train.dropna().count() # Number of rows remaining in the returned, null-free DataFrame

In [None]:
# How to fill the null values in the Domain column of a DataFrame with some
# constant value, say 'www.missing.in'?
missing_domain = 'www.missing.in'
# fillna() returns a new DataFrame; the result is not assigned, so train
# still contains its nulls after this cell.
train.fillna(missing_domain, 'Domain').count() # Number of rows in the returned (filled) DataFrame

In [None]:
# How to create a sample DataFrame from the base DataFrame?
# Sampling is done without replacement; `fraction` is the share of rows to
# pick and the fixed seed makes the sample reproducible.
train.sample(withReplacement=False, fraction=0.0001, seed=42).show()

In [None]:
# How to find the number of rows we have per Domain?
# Three equivalent spellings of the same query follow: count rows per Domain,
# keep the domains with more than 225 rows, sort by count (descending) and
# show the first 10.
from pyspark.sql.functions import col, desc

# Variant 1: descending order through a Column expression.
(
    train.groupBy('Domain')
    .count()
    .filter("`count` > 225")
    .sort(col('count').desc())
    .show(10)
)

# Variant 2: descending order through the desc() helper.
(
    train.groupBy('Domain')
    .count()
    .filter("`count` > 225")
    .sort(desc('count'))
    .show(10)
)

# Variant 3: orderBy() with ascending=False.
(
    train.groupBy('Domain')
    .count()
    .filter("`count` > 225")
    .orderBy('count', ascending=False)
    .show(10)
)

In [None]:
%%time 
# Wall time: 2.13s - 2.39s

# OBJECTIVE : Get just the domain from URLs
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import tldextract

def extract_domain(url):
    """Return the ``domain`` part that tldextract finds in ``url``.

    Used as a Spark UDF, so it must tolerate null cells: Spark hands a null
    column value to Python as ``None``, and passing that on to tldextract
    would fail the whole job.

    Args:
        url: URL or host name string, or ``None``.

    Returns:
        The ``domain`` attribute of ``tldextract.extract(url)``, or ``None``
        when ``url`` is ``None``.
    """
    if url is None:
        return None
    return tldextract.extract(url).domain

# Wrap the Python function as a Spark UDF that yields strings.
# Inline alternative:
#   udf(lambda url: tldextract.extract(url).domain, StringType())
extract_domain_udf = udf(extract_domain, StringType())

# Overwrite the 'Domain' column in place with the extracted value, then
# preview the result.
train = train.withColumn('Domain', extract_domain_udf(train['Domain']))
train.show(5)

In [None]:
from bs4 import BeautifulSoup # For Scraping HTML page
from bs4.element import Comment
import re

In [None]:
# Objective : Extract text from title tag of HTML source of web-page
def extract_title(page):
    """Return the stripped text of the first ``<title>`` tag in ``page``.

    Args:
        page: HTML source of a web page, or ``None``.

    Returns:
        The title text with surrounding whitespace removed, or ``None`` when
        ``page`` is ``None`` or contains no ``<title>`` tag.
    """
    if page is None:
        return None
    soup = BeautifulSoup(page, 'html.parser')
    title_tag = soup.find('title')
    if title_tag is None:
        return None
    return title_tag.text.strip()

In [None]:
#  OBJECTIVE: Functions to parse HTML content and extract text that matters.
def extract_body(page):
    """Return the ``<body>`` element of ``page``, or ``page`` itself if absent.

    Args:
        page: HTML source of a web page, or ``None``.

    Returns:
        ``None`` when ``page`` is ``None``; the parsed ``<body>`` tag when
        one is found; otherwise the unmodified ``page``.

    NOTE(review): the two non-null branches return different types (a parsed
    tag vs. the raw input). Callers that need markup text can apply ``str()``
    to the tag; confirm which form downstream code expects before unifying.
    """
    if page is None:
        return None
    # from_encoding is only meaningful for undecoded input; for an already
    # decoded str Beautiful Soup ignores it and emits a warning, so it is
    # passed only when page is not a str.
    from_encoding = None if isinstance(page, str) else "utf-8"
    soup = BeautifulSoup(page, 'html.parser', from_encoding=from_encoding)
    body_tag = soup.find('body')
    if body_tag is None:
        return page
    return body_tag

def is_visible_content(element):
    """Tell whether a parsed text node should count as visible page text.

    A node is rejected when its parent is one of the non-rendered containers
    listed below, or when the node itself is an HTML comment.

    Args:
        element: a text node exposing ``.parent.name``.

    Returns:
        ``True`` for visible text, ``False`` otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent check runs first, exactly as in the two-step form.
    return not (
        element.parent.name in hidden_parents
        or isinstance(element, Comment)
    )

def remove_extra_spaces(str):
    """Collapse each run of whitespace in ``str`` into one space.

    Leading and trailing whitespace is removed as well. The parameter name
    shadows the builtin ``str``; it is kept as-is for callers.
    """
    words = str.split()
    return " ".join(words)

def extract_text(page):
    """Extract the visible text of an HTML page as one normalised string.

    Steps: parse the page, collect every text node (including nested ones),
    keep those accepted by ``is_visible_content``, normalise whitespace in
    each piece, join the pieces with single spaces, remove ``,`` and ``|``
    characters, collapse repeated whitespace, and finally drop any character
    that cannot be encoded as UTF-8.

    Args:
        page: HTML source of a web page, or ``None``.

    Returns:
        The cleaned text, or ``None`` when ``page`` is ``None``.
    """
    if page is None:
        return None
    soup = BeautifulSoup(page, 'html.parser')
    # string=True matches every text node; find_all/string are the current
    # spellings of the older findAll/text.
    texts = soup.find_all(string=True)
    visible_texts = filter(is_visible_content, texts)
    # remove_extra_spaces already trims both ends, so no separate strip().
    text = " ".join(remove_extra_spaces(t) for t in visible_texts)
    # NOTE(review): ',' and '|' are presumably removed because the result is
    # later written out as delimited text - confirm.
    text = text.replace(',', '').replace('|', '')
    # Empty pieces and the removals above can leave runs of spaces behind.
    text = re.sub(r'\s\s+', ' ', text).strip()
    # The encode/decode round trip discards characters UTF-8 cannot represent.
    return text.encode('utf-8', errors='ignore').decode('utf-8').strip()

In [None]:
# NOTE(review): `config` is the SparkConf built at the top of the notebook.
# SparkConf.get(key, defaultValue) treats its second argument as a fallback,
# so this looks up the key "spark" and yields "spark_parser_mode" when that
# key is unset. It reads like a configparser-style (section, option) lookup -
# confirm what was intended.
config.get("spark", "spark_parser_mode")

In [None]:
%%time

# OBJECTIVE : Read html_data.csv
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import lit # lit for literals

# Spark UDF wrapper around extract_text; it produces a string column.
extract_text_udf = udf(extract_text, StringType())

from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

# Explicit two-column schema for the HTML file. It is defined for reference
# only: the read below infers the schema instead of using it.
schema = StructType([
    StructField("Webpage_id", IntegerType()),
    StructField("Html", StringType()),
])

# Read the HTML dump. multiLine lets one quoted field span several lines, and
# quote == escape == '"' means a doubled quote inside a field is a literal
# quote (presumably how the HTML was exported - confirm).
# Options tried and left disabled: mode='spark_parser_mode', schema=schema.
html_df = spark.read.csv(
    html_csv,
    header=True,
    inferSchema=True,
    sep=',',
    quote='"',
    escape='"',
    multiLine=True,
    maxColumns=2,
    encoding="UTF-8",
    ignoreLeadingWhiteSpace=True,
    ignoreTrailingWhiteSpace=True,
)

# Alternative reader form with a reduced option set, kept for reference:
#   html_df = spark.read.format('csv') \
#                       .option('header',True) \
#                       .option('ignoreLeadingWhiteSpace',True) \
#                       .option('ignoreTrailingWhiteSpace',True) \
#                       .option('inferSchema',True) \
#                       .option('maxColumns',2) \
#                       .load(html_csv)

# Analyze Data types in dataset
html_df.printSchema()

In [None]:
%%time

# Adding a constant column: every row gets an empty-string 'Title'.
# NOTE(review): nothing in the visible notebook fills this column afterwards
# (extract_title is never applied) - confirm whether it is still a placeholder.
html_df = html_df.withColumn('Title',lit(''))
html_df.show(5)

In [None]:
%%time
# Wall time: 1h 10min 6s

# OBJECTIVE: replace the raw HTML of every page in html_df with its visible
# text and save the result as CSV.

# Run the text-extraction UDF over 'Html' in place (the column keeps its
# position), then rename it to reflect its new content.
html_df = (
    html_df
    .withColumn('Html', extract_text_udf(html_df['Html']))
    .withColumnRenamed('Html', 'Html2Text')
)

# The transformations above are lazy; the UDF actually runs during this write.
html_df.write.csv('bigdata/train/sparkoutput', header=True)