# Prediction using PySpark

In [1]:
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext

# Stop a SparkContext left over from a previous run of this cell. On a fresh
# kernel the name `sc` is unbound, which is reported instead of raised.
try:
    sc.stop()
except NameError:
    print("sc not defined")

# Local-mode configuration: "local[*]" master, application name "ClassifyUrl".
config = SparkConf()
config.setMaster("local[*]")
config.setAppName("ClassifyUrl")

sc = SparkContext(conf=config)

sc not defined


In [2]:
from pyspark.sql import SparkSession

# Create the SparkSession from the same `config` that was used to start the
# SparkContext in the first cell. getOrCreate() reuses an already-running
# context, so asking for a different master or app name here would not take
# effect; sharing `config` keeps the declared settings and the real ones equal.
spark = (
    SparkSession.builder
    .config(conf=config)
    .getOrCreate()
)

In [3]:
# Relative locations of the input CSV files.
train_data, html_data = (
    'bigdata/train.csv',            # training rows (Webpage_id, Domain, Url, Tag)
    'bigdata/train/html_data.csv',  # HTML data file; not read in the cells shown here
)

In [23]:
%%time
# Load the training CSV from the path held in `train_data` rather than
# repeating the literal, so the path is defined in exactly one place.
# header=True takes column names from the first row; inferSchema=True lets
# Spark derive column types from the data.
train = spark.read.csv(train_data,
                       header=True,
                       inferSchema=True)

# Analyze data types in dataset
train.printSchema()

root
 |-- Webpage_id: integer (nullable = true)
 |-- Domain: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Tag: string (nullable = true)

Wall time: 473 ms


In [24]:
# Report the dataset size, then echo the first five rows as Row objects.
total_records = train.count()
print('Total Records in Training Dataset :', total_records)
train.head(5)

Total Records in Training Dataset : 53447


[Row(Webpage_id=1, Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/marketing/tecfidera-gilenya-and-aubagio-s-3-way-battle-for-ms-share-about-to-get-more-interesting', Tag='news'),
 Row(Webpage_id=2, Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/pharma/novo-equipped-to-weather-storm-u-s-diabetes-market-ceo-says', Tag='news'),
 Row(Webpage_id=3, Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/pharma/another-exec-departs-troubled-endo-and-time-it-s-for-another-drugmaker', Tag='news'),
 Row(Webpage_id=4, Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/pharma/teva-buy-biosim-specialist-celltrion-it-wouldn-t-say-no', Tag='news'),
 Row(Webpage_id=5, Domain='www.fiercepharma.com', Url='http://www.fiercepharma.com/marketing/actress-marissa-tomei-partners-allergan-restasis-to-drive-dry-eye-awareness', Tag='news')]

In [25]:
# Preview the first five rows as a formatted table. show() prints a readable
# grid, whereas head() returns a raw list of Row objects, so prefer show().
train.show(n=5)

+----------+--------------------+--------------------+----+
|Webpage_id|              Domain|                 Url| Tag|
+----------+--------------------+--------------------+----+
|         1|www.fiercepharma.com|http://www.fierce...|news|
|         2|www.fiercepharma.com|http://www.fierce...|news|
|         3|www.fiercepharma.com|http://www.fierce...|news|
|         4|www.fiercepharma.com|http://www.fierce...|news|
|         5|www.fiercepharma.com|http://www.fierce...|news|
+----------+--------------------+--------------------+----+
only showing top 5 rows



In [7]:
# How many columns does the training DataFrame have, and what are they called?
column_names = train.columns
print('No. of cols in train dataset : ', len(column_names))
column_names

No. of cols in train dataset :  4


['Webpage_id', 'Domain', 'Url', 'Tag']

In [9]:
# How to get the summary statistics (count, mean, standard deviation, min, max)
# of the columns in a DataFrame?
train.describe().show()

# Observation:
# describe() also works on string-typed columns, but for those the mean and
# stddev are null, and min/max are chosen by string (character-code) ordering
# of the values rather than by any numeric meaning.
#
# These notes are comments instead of a bare triple-quoted string: a string
# literal as the last expression of a cell is echoed back as the cell's result.

+-------+------------------+--------------------+--------------------+--------------+
|summary|        Webpage_id|              Domain|                 Url|           Tag|
+-------+------------------+--------------------+--------------------+--------------+
|  count|             53447|               53447|               53447|         53447|
|   mean| 39920.78603102139|                null|                null|          null|
| stddev|22945.942450142324|                null|                null|          null|
|    min|                 1|  1.eyefortravel.com|http://1.eyefortr...|clinicalTrials|
|    max|              9999|zoonosis.conferen...|https://zoologica...|        thesis|
+-------+------------------+--------------------+--------------------+--------------+



'\nObservation:\nAs we can see that, describe operation is working for String type column but the output for mean, stddev are null and min & max values are calculated based on ASCII value of categories.\n'

In [19]:
%%time
# How to project a subset of columns from the DataFrame?
selected_columns = ['Webpage_id', 'Domain', 'Tag']
train.select(*selected_columns).show(5)

+----------+--------------------+----+
|Webpage_id|              Domain| Tag|
+----------+--------------------+----+
|         1|www.fiercepharma.com|news|
|         2|www.fiercepharma.com|news|
|         3|www.fiercepharma.com|news|
|         4|www.fiercepharma.com|news|
|         5|www.fiercepharma.com|news|
+----------+--------------------+----+
only showing top 5 rows

Wall time: 170 ms


In [13]:
%%time
# How many distinct Domains and distinct Tags (target classes) are in train?
n_domains = train.select('Domain').distinct().count()
n_tags = train.select('Tag').distinct().count()
n_domains, n_tags

Wall time: 9.7 s


(3974, 9)

In [37]:
# Check for null values in the Domain, Url and Tag columns of train.
print('Count of Missing values in Train Dataset :\n')
for column_name in ('Domain', 'Url', 'Tag'):
    # One Spark job per column: keep rows where the column is null, count them.
    null_rows = train.filter(train[column_name].isNull())
    print(column_name + ' :', null_rows.count())

Count of Missing values in Train Dataset :

Domain : 0
Url : 0
Tag : 0


In [38]:
# How to drop every row that contains a null value?
rows_without_nulls = train.dropna()
rows_without_nulls.count()  # rows remaining in the returned DataFrame

53447

In [39]:
# How to replace nulls in the Domain column with a constant placeholder,
# say 'www.missing.in'?
missing_domain = 'www.missing.in'
domain_filled = train.fillna(missing_domain, subset='Domain')
domain_filled.count()  # row count of the returned DataFrame

53447

In [46]:
# How to create a sample DataFrame from the base DataFrame?
sampled_rows = train.sample(
    withReplacement=False,  # sample without replacement
    fraction=0.0001,        # expected share of rows to keep (0.01%), not an exact count
    seed=42,                # seed to reproduce the result
)
sampled_rows.show()

+----------+--------------------+--------------------+-----------+
|Webpage_id|              Domain|                 Url|        Tag|
+----------+--------------------+--------------------+-----------+
|     18472|cancerci.biomedce...|https://cancerci....|publication|
|     26364|www.naturalnews.c...|http://www.natura...|       news|
|     28191|         twitter.com|https://twitter.c...|     others|
+----------+--------------------+--------------------+-----------+



In [63]:
# How to find the number of rows we have per Domain?
from pyspark.sql.functions import col, desc

# Rows per Domain, keeping only domains with more than 225 rows. Defined once
# and reused below: the three variants differ only in how the descending sort
# on `count` is spelled, so the aggregation itself need not be repeated.
# `count` is back-quoted in the filter because it is the name of the column
# produced by .count().
domain_counts = (
    train.groupBy('Domain')
    .count()
    .filter("`count` > 225")
)

# Variant 1: Column object with .desc()
domain_counts.sort(col('count').desc()).show(10)  # Show Count of Top 10

# Or like below: the desc() helper function
domain_counts.sort(desc('count')).show(10)  # Show Count of Top 10

# Or like below: orderBy() with ascending=False (no import needed)
domain_counts.orderBy('count', ascending=False).show(10)  # Show Count of Top 10

+--------------------+-----+
|              Domain|count|
+--------------------+-----+
|thesis.library.ca...|  301|
|academiccommons.c...|  300|
|  www.dart-europe.eu|  300|
|       curate.nd.edu|  300|
|      dspace.mit.edu|  300|
|ecommons.cornell.edu|  300|
|     www.nice.org.uk|  230|
|www.ncbi.nlm.nih.gov|  226|
+--------------------+-----+

+--------------------+-----+
|              Domain|count|
+--------------------+-----+
|thesis.library.ca...|  301|
|academiccommons.c...|  300|
|  www.dart-europe.eu|  300|
|       curate.nd.edu|  300|
|      dspace.mit.edu|  300|
|ecommons.cornell.edu|  300|
|     www.nice.org.uk|  230|
|www.ncbi.nlm.nih.gov|  226|
+--------------------+-----+

+--------------------+-----+
|              Domain|count|
+--------------------+-----+
|thesis.library.ca...|  301|
|academiccommons.c...|  300|
|  www.dart-europe.eu|  300|
|       curate.nd.edu|  300|
|      dspace.mit.edu|  300|
|ecommons.cornell.edu|  300|
|     www.nice.org.uk|  230|
|www.ncbi.nl