In [1]:
import findspark

<hr />
The following command adds pyspark to sys.path at runtime if it is not already on the system path by default. It also prints the path of the Spark installation.
<hr />

In [2]:
# Report the Spark installation directory, then make pyspark importable.
spark_home = findspark.find()
print(spark_home)
findspark.init()

/opt/spark


<hr />
Create a Spark Session
<hr />

In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) a session named "Pipeline" against the local[2] master.
spark = (
    SparkSession.builder
    .appName("Pipeline")
    .master('local[2]')
    .getOrCreate()
)

23/01/20 18:53:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


<hr />
Read the dataset into a dataframe.
<hr />

In [4]:
# Load the CSV, treating the first row as the header and letting Spark infer
# the column types.
df = spark.read.csv(
    "udemy_dataset.csv",
    inferSchema=True,
    header=True,
)

                                                                                

<hr />
Display the dataset.
<hr />

In [5]:
# One field per line (vertical) with untruncated values, so long titles and
# URLs stay readable.
df.show(vertical=True, truncate=False)

-RECORD 0------------------------------------------------------------------------------------------------
 _c0                 | 0                                                                                 
 course_id           | 1070968                                                                           
 course_title        | Ultimate Investment Banking Course                                                
 url                 | https://www.udemy.com/ultimate-investment-banking-course/                         
 is_paid             | True                                                                              
 price               | 200                                                                               
 num_subscribers     | 2147                                                                              
 num_reviews         | 23                                                                                
 num_lectures        | 51                     

23/01/20 18:54:13 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , course_id, course_title, url, is_paid, price, num_subscribers, num_reviews, num_lectures, level, content_duration, published_timestamp, subject, clean_course_title
 Schema: _c0, course_id, course_title, url, is_paid, price, num_subscribers, num_reviews, num_lectures, level, content_duration, published_timestamp, subject, clean_course_title
Expected: _c0 but found: 
CSV file: file:///home/amits/environments/ModuleWisenotebooks/Module2SparkPipelines/udemy_dataset.csv


<hr />
Select the required input columns used for prediction.
<hr />

In [6]:
# Keep only the text feature (course_title) and the target label (subject).
df = df.select(['course_title', 'subject'])
df.show(truncate=False)

+------------------------------------------------------------+----------------+
|course_title                                                |subject         |
+------------------------------------------------------------+----------------+
|Ultimate Investment Banking Course                          |Business Finance|
|Complete GST Course & Certification - Grow Your CA Practice |Business Finance|
|Financial Modeling for Business Analysts and Consultants    |Business Finance|
|Beginner to Pro - Financial Analysis in Excel 2017          |Business Finance|
|How To Maximize Your Profits Trading Options                |Business Finance|
|Trading Penny Stocks: A Guide for All Levels In 2017        |Business Finance|
|Investing And Trading For Beginners: Mastering Price Charts |Business Finance|
|Trading Stock Chart Patterns For Immediate, Explosive Gains |Business Finance|
|Options Trading 3 : Advanced Stock Profit and Success Method|Business Finance|
|The Only Investment Strategy You Need F

<hr />
Determine the count of records in the dataset.
<hr />

In [7]:
# Row count before any cleaning; compare with the count after nulls are dropped.
df.count()

3689

<hr />
Drop the rows whose `subject` value is Null.
<hr />

In [8]:
# Count the missing labels inside Spark instead of collecting the whole
# DataFrame to the driver with toPandas(), and print the result so it is not
# silently discarded (only a cell's last expression is displayed).
null_subject_count = df.filter(df['subject'].isNull()).count()
print(f"Rows with null subject: {null_subject_count}")

# Drop rows that have no label; `subset` is passed as a real list of column
# names rather than a parenthesised string.
df = df.dropna(subset=['subject'])
df.count()

3683

<hr />
Split the dataset into Training and Testing.
<hr />

In [9]:
# 70/30 train/test split with a fixed seed.
trainDF, testDF = df.randomSplit([0.7, 0.3], seed=42)

<hr />
Import the pyspark modules required for pre-processing the data. <br>
1. Tokenizer : To create tokens from the sentence <br>
2. StopWordsRemover : To remove the stop words in the sentence <br>
3. CountVectorizer : Extracts a vocabulary from the dataset and generates a vectorized model with the count of occurrences <br>
4. IDF : Compute the Inverse Document Frequency (IDF) given a dataset. <br>
5. StringIndexer : A label indexer that maps a string column of labels to an ML column of label indices. <br>
<hr />

In [10]:
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,IDF
from pyspark.ml.feature import StringIndexer

<hr />
Initialize the Estimators and Transformers.
<hr />

In [11]:
# Text-feature stages, each reading the previous stage's output column:
# course_title -> mytokens -> filtered_tokens -> rawFeatures -> vectorizedFeatures
tokenizer = Tokenizer(
    inputCol='course_title',
    outputCol='mytokens',
)
stopwords_remover = StopWordsRemover(
    inputCol='mytokens',
    outputCol='filtered_tokens',
)
vectorizer = CountVectorizer(
    inputCol='filtered_tokens',
    outputCol='rawFeatures',
)
idf = IDF(
    inputCol='rawFeatures',
    outputCol='vectorizedFeatures',
)

In [12]:
# Map the string 'subject' column to the numeric 'label' column.
labelEncoder = StringIndexer(
    inputCol='subject',
    outputCol='label',
)

<hr />
Import the pyspark modules required for training the model.
<hr />

In [13]:
from pyspark.ml.classification import LogisticRegression

In [14]:
# Classifier that reads the IDF output as features and the indexed subject as
# the label.
lr = LogisticRegression(
    labelCol='label',
    featuresCol='vectorizedFeatures',
)

<hr />
Create a Pipeline.
<hr />

In [15]:
from pyspark.ml import Pipeline 

In [16]:
# Chain the feature stages, the label indexer and the classifier in order.
pipeline = Pipeline(
    stages=[
        tokenizer,
        stopwords_remover,
        vectorizer,
        idf,
        labelEncoder,
        lr,
    ]
)

<hr />
Call the fit function for executing the pipeline and generating the trained model.
<hr />

In [17]:
# Fit every stage on the training split; the result is the fitted pipeline
# (all stages), not just the logistic-regression model.
lr_model = pipeline.fit(trainDF)

23/01/20 18:54:48 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/01/20 18:54:48 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


<hr />
Display the Stages of the pipeline.
<hr />

In [18]:
# Inspect the fitted stages held by the trained pipeline.
lr_model.stages

[Tokenizer_9c30f90f0a83,
 StopWordsRemover_46b26e865fb2,
 CountVectorizerModel: uid=CountVectorizer_27489aa75d10, vocabularySize=3670,
 IDFModel: uid=IDF_25c62948c5c3, numDocs=2631, numFeatures=3670,
 StringIndexerModel: uid=StringIndexer_c5fd7ba0a0f8, handleInvalid=error,
 LogisticRegressionModel: uid=LogisticRegression_cd211b1815de, numClasses=8, numFeatures=3670]

<hr />
Use the pipeline to generate predictions for the test data.
<hr />

In [19]:
# Score titles only, as for unlabeled data: the input has no 'subject' column,
# so the output carries no 'label' column either.
titles_only = testDF.select('course_title')
predictions = lr_model.transform(titles_only)

23/01/20 18:54:50 WARN StringIndexerModel: Input column subject does not exist during transformation. Skip StringIndexerModel for this column.


<hr />
Display the predictions.
<hr />

In [20]:
# Show the first predictions, one field per line.
predictions.show(vertical=True)

-RECORD 0----------------------------------
 course_title       | #12 Hand Coordina... 
 mytokens           | [#12, hand, coord... 
 filtered_tokens    | [#12, hand, coord... 
 rawFeatures        | (3670,[394,491,60... 
 vectorizedFeatures | (3670,[394,491,60... 
 rawPrediction      | [7.02282674468303... 
 probability        | [0.82104939581835... 
 prediction         | 0.0                  
-RECORD 1----------------------------------
 course_title       | #7 Piano Hand Coo... 
 mytokens           | [#7, piano, hand,... 
 filtered_tokens    | [#7, piano, hand,... 
 rawFeatures        | (3670,[9,13,60,23... 
 vectorizedFeatures | (3670,[9,13,60,23... 
 rawPrediction      | [-1.9893273622912... 
 probability        | [1.19573187492450... 
 prediction         | 2.0                  
-RECORD 2----------------------------------
 course_title       | 'Greensleeves' Cr... 
 mytokens           | ['greensleeves', ... 
 filtered_tokens    | ['greensleeves', ... 
 rawFeatures        | (3670,[6,9

In [21]:
# Re-score the full test split, this time keeping 'subject', so the output
# includes the indexed 'label' next to 'prediction' for comparison.
# Note: this rebinds `predictions`, replacing the titles-only result.
predictions = lr_model.transform(testDF)
predictions.show(vertical=True)

-RECORD 0----------------------------------
 course_title       | #12 Hand Coordina... 
 subject            | Musical Instruments  
 mytokens           | [#12, hand, coord... 
 filtered_tokens    | [#12, hand, coord... 
 rawFeatures        | (3670,[394,491,60... 
 vectorizedFeatures | (3670,[394,491,60... 
 label              | 2.0                  
 rawPrediction      | [7.02282674468303... 
 probability        | [0.82104939581835... 
 prediction         | 0.0                  
-RECORD 1----------------------------------
 course_title       | #7 Piano Hand Coo... 
 subject            | Musical Instruments  
 mytokens           | [#7, piano, hand,... 
 filtered_tokens    | [#7, piano, hand,... 
 rawFeatures        | (3670,[9,13,60,23... 
 vectorizedFeatures | (3670,[9,13,60,23... 
 label              | 2.0                  
 rawPrediction      | [-1.9893273622912... 
 probability        | [1.19573187492450... 
 prediction         | 2.0                  
-RECORD 2-----------------------

In [22]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()