In [1]:
import pandas as pd
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [2]:
df = spark.read.csv('./Desktop/123/df.csv',
                    sep='\t', inferSchema=True, header=True)

In [5]:
df = df.select('label','url')

In [6]:
df.columns

['label', 'url']

In [8]:
df.printSchema

<bound method DataFrame.printSchema of DataFrame[label: int, url: string]>

In [9]:
df.groupby('label').count().show()

+-----+-------+
|label|  count|
+-----+-------+
| null|      2|
|    1| 552302|
|    0|1879621|
+-----+-------+



In [10]:
df = df.dropna()
df.groupby('label').count().show()

+-----+-------+
|label|  count|
+-----+-------+
|    1| 552302|
|    0|1879621|
+-----+-------+



In [11]:
from pyspark.sql.functions import regexp_extract

In [12]:
df = df.select('label',regexp_extract('url',"//(.*?)/",1).alias('url'))
df.show(5,False)

+-----+--------------+
|label|url           |
+-----+--------------+
|1    |www.kaola.com |
|1    |www.kaola.com |
|1    |www.acgjie.com|
|0    |www.ifanr.com |
|1    |bbs.55bbs.com |
+-----+--------------+
only showing top 5 rows



In [13]:
df = df.select('url',df.label.cast("float").alias('label'))
df.show(5,False)

+--------------+-----+
|url           |label|
+--------------+-----+
|www.kaola.com |1.0  |
|www.kaola.com |1.0  |
|www.acgjie.com|1.0  |
|www.ifanr.com |0.0  |
|bbs.55bbs.com |1.0  |
+--------------+-----+
only showing top 5 rows



In [14]:
from pyspark.ml.feature import RegexTokenizer

In [15]:
tokenizer =RegexTokenizer(inputCol='url',outputCol='split',pattern='\\W')
splitdf = tokenizer.transform(df)
splitdf.show(5,False)

+--------------+-----+------------------+
|url           |label|split             |
+--------------+-----+------------------+
|www.kaola.com |1.0  |[www, kaola, com] |
|www.kaola.com |1.0  |[www, kaola, com] |
|www.acgjie.com|1.0  |[www, acgjie, com]|
|www.ifanr.com |0.0  |[www, ifanr, com] |
|bbs.55bbs.com |1.0  |[bbs, 55bbs, com] |
+--------------+-----+------------------+
only showing top 5 rows



In [None]:
#from pyspark.ml.feature import StopWordsRemover

#remover = StopWordsRemover(inputCol='split', outputCol='filtered',stopWords=['com','cn'])
#filterdf = remover.transform(splitdf)
#filterdf.show(5,False)

In [16]:
splitdf.filter(' url not like "%.%" ').show(10)

+---+-----+-----+
|url|label|split|
+---+-----+-----+
|   |  0.0|   []|
|   |  0.0|   []|
|   |  0.0|   []|
|   |  0.0|   []|
|   |  1.0|   []|
|   |  0.0|   []|
|   |  1.0|   []|
|   |  1.0|   []|
|   |  1.0|   []|
|   |  0.0|   []|
+---+-----+-----+
only showing top 10 rows



In [17]:
splitdf = splitdf.filter(' url != "" ')

In [18]:
splitdf.show(5)

+--------------+-----+------------------+
|           url|label|             split|
+--------------+-----+------------------+
| www.kaola.com|  1.0| [www, kaola, com]|
| www.kaola.com|  1.0| [www, kaola, com]|
|www.acgjie.com|  1.0|[www, acgjie, com]|
| www.ifanr.com|  0.0| [www, ifanr, com]|
| bbs.55bbs.com|  1.0| [bbs, 55bbs, com]|
+--------------+-----+------------------+
only showing top 5 rows



In [None]:
splitdf.count()

In [19]:
from pyspark.ml.feature import CountVectorizer

count = CountVectorizer(inputCol='split', outputCol="rawFeatures")
model = count.fit(splitdf)
featuredf = model.transform(splitdf)
featuredf.show(5,False)

+--------------+-----+------------------+----------------------------------+
|url           |label|split             |rawFeatures                       |
+--------------+-----+------------------+----------------------------------+
|www.kaola.com |1.0  |[www, kaola, com] |(262144,[0,3,24],[1.0,1.0,1.0])   |
|www.kaola.com |1.0  |[www, kaola, com] |(262144,[0,3,24],[1.0,1.0,1.0])   |
|www.acgjie.com|1.0  |[www, acgjie, com]|(262144,[0,3,13263],[1.0,1.0,1.0])|
|www.ifanr.com |0.0  |[www, ifanr, com] |(262144,[0,3,525],[1.0,1.0,1.0])  |
|bbs.55bbs.com |1.0  |[bbs, 55bbs, com] |(262144,[0,58,87],[1.0,1.0,1.0])  |
+--------------+-----+------------------+----------------------------------+
only showing top 5 rows



In [None]:
#featuredf.na.drop(how="all").show(5)    

In [22]:
from pyspark.ml.feature import IDF

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featuredf)
finaldf = idfModel.transform(featuredf)
# We want only the label and features columns for our machine learning models
fdf = finaldf.select("label", "features")

In [23]:
fdf.show(2)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(262144,[0,3,24],...|
|  1.0|(262144,[0,3,24],...|
+-----+--------------------+
only showing top 2 rows



In [24]:
seed = 0  # set seed for reproducibility
trainDF, testDF = fdf.randomSplit([0.8, 0.2], seed)

In [25]:
trainDF.count(),testDF.count()

(1934062, 482583)

In [26]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import numpy as np
lr = LogisticRegression(maxIter=10)

paramGrid_lr = ParamGridBuilder() \
    .addGrid(lr.regParam, np.linspace(0.3, 0.01, 10)) \
    .addGrid(lr.elasticNetParam, np.linspace(0.3, 0.8, 6)) \
    .build()
crossval_lr = CrossValidator(estimator=lr,
                             estimatorParamMaps=paramGrid_lr,
                             evaluator=BinaryClassificationEvaluator(),
                             numFolds=5)
cvModel_lr = crossval_lr.fit(trainDF)
best_model_lr = cvModel_lr.bestModel.summary
best_model_lr.predictions.columns

['label', 'features', 'rawPrediction', 'probability', 'prediction']

# Area under the curve for the training data

In [27]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
my_eval_lr = BinaryClassificationEvaluator(
    rawPredictionCol='prediction', labelCol='label', metricName='areaUnderROC')
my_eval_lr.evaluate(best_model_lr.predictions)

0.6901801693452801

# f1 score

In [28]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
my_mc_lr = MulticlassClassificationEvaluator(
    predictionCol='prediction', labelCol='label', metricName='f1')
my_mc_lr.evaluate(best_model_lr.predictions)

0.8239634852679478

# Accuracy

In [29]:
my_mc_lr = MulticlassClassificationEvaluator(
    predictionCol='prediction', labelCol='label', metricName='accuracy')
my_mc_lr.evaluate(best_model_lr.predictions)

0.8446575135647151

# Recall & Accuracy

In [30]:
train_fit_lr = best_model_lr.predictions.select('label', 'prediction')
train_fit_lr.groupBy('label', 'prediction').count().show()

+-----+----------+-------+
|label|prediction|  count|
+-----+----------+-------+
|  1.0|       1.0| 178855|
|  0.0|       1.0|  39932|
|  1.0|       0.0| 260510|
|  0.0|       0.0|1454765|
+-----+----------+-------+



In [31]:
(178855+1454765)/(178855+39932+260510+1454765)

0.8446575135647151

# Predict using the test data and evaluate the predictions

In [33]:
predictions_lr = cvModel_lr.transform(testDF)
predictions_lr.columns

['label', 'features', 'rawPrediction', 'probability', 'prediction']

In [34]:
predictions_lr.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(262144,[0],[0.22...|[1.73724937097366...|[0.85033734734920...|       0.0|
|  0.0|(262144,[0],[0.22...|[1.73724937097366...|[0.85033734734920...|       0.0|
|  0.0|(262144,[0],[0.22...|[1.73724937097366...|[0.85033734734920...|       0.0|
|  0.0|(262144,[0],[0.22...|[1.73724937097366...|[0.85033734734920...|       0.0|
|  0.0|(262144,[0],[0.22...|[1.73724937097366...|[0.85033734734920...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



# Show sample predictions:

In [35]:
predictions_lr.select('label', 'prediction').show(5)

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
+-----+----------+
only showing top 5 rows



# Recall & Accuracy

In [36]:
predictions_lr.groupBy('label', 'prediction').count().show()

+-----+----------+------+
|label|prediction| count|
+-----+----------+------+
|  1.0|       0.0| 65010|
|  0.0|       0.0|362879|
|  1.0|       1.0| 44651|
|  0.0|       1.0| 10043|
+-----+----------+------+



# Accuracy with the test data

In [37]:
(362879+44651)/(65010+362879+44651+10043)

0.8444764941989253

In [38]:
my_mc_lr = MulticlassClassificationEvaluator(
    predictionCol='prediction', labelCol='label', metricName='accuracy')
my_mc_lr.evaluate(predictions_lr)

0.8444764941989253

In [None]:
#df.write.csv("./Desktop/123.txt")