In [74]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session in local mode with 4 worker threads and
# Hive support enabled.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("testmodel")
    .enableHiveSupport()
    .getOrCreate()
)

# SparkContext backing the session above.
sc = spark.sparkContext
# https://blog.csdn.net/wsp_1138886114/article/details/80634890?utm_source=blogxgwz0#step-3-%E5%88%9B%E5%BB%BApipline%E5%AE%9E%E4%BE%8B%E5%AF%B9%E8%B1%A1%E8%AE%AD%E7%BB%83%E6%A8%A1%E5%9E%8B%E4%B8%8E%E9%A2%84%E6%B5%8B

In [97]:
import numpy as np
import pandas as pd

# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Let pandas render very wide / very long frames without eliding cells.
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 10000)

# Load the test set that will be scored by the saved models.
test = pd.read_csv('data/test.csv')

In [98]:
# Preview the first rows of the freshly loaded test set.
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [99]:
# Count missing values per column of the test set and present the counts
# as a single-column frame named 'test'.
k = test.isnull().sum().to_frame('test')
k

Unnamed: 0,test
id,0
comment_text,0


In [100]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# print(string.punctuation)
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Lazily populated cache so the NLTK stop-word corpus is read and the
# lemmatizer is constructed once per kernel, not once per comment / per token.
_NLP_CACHE = {}


def _nlp_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _NLP_CACHE:
        _NLP_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _NLP_CACHE['stop_words'], _NLP_CACHE['lemmatizer']


def mytoken_lemm(text):
    """Tokenize, normalise and lemmatize a single comment.

    Processing order (unchanged from the original implementation):
    tokenize with ``word_tokenize`` -> lower-case -> delete punctuation
    characters -> keep purely alphabetic tokens -> drop English stop words
    -> lemmatize with ``WordNetLemmatizer``.

    Note: punctuation is deleted *before* stop-word filtering, so a token such
    as "n't" becomes "nt" and is kept. This is left as-is on purpose;
    presumably the saved models were trained with the same preprocessing
    (TODO confirm before changing it).

    Parameters
    ----------
    text : str or None or float NaN
        Raw comment text. ``None`` and NaN (how pandas represents a missing
        CSV cell) are treated as an empty comment.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order; empty for missing input.

    Raises
    ------
    TypeError
        If ``text`` is neither a string nor a missing value, e.g. when the
        function is applied a second time to an already tokenized column.
    """
    # Missing values would otherwise crash inside word_tokenize.
    if text is None or (isinstance(text, float) and text != text):
        return []
    if not isinstance(text, str):
        raise TypeError(
            "mytoken_lemm expects a str, got {}".format(type(text).__name__)
        )

    stop_words, lemmatizer = _nlp_resources()

    lemms = []
    for token in word_tokenize(text):
        word = token.lower().translate(_PUNCT_TABLE)
        # isalpha() also rejects the empty strings left by pure-punctuation tokens.
        if word.isalpha() and word not in stop_words:
            lemms.append(lemmatizer.lemmatize(word))
    return lemms

In [101]:
# Replace each raw comment string with its list of lemmatized tokens.
# NOTE: this overwrites the column in place, so the cell is not idempotent:
# re-running it would feed token lists (not strings) back into mytoken_lemm.
# Reload `test` before running it again.
test['comment_text'] = test['comment_text'].map(mytoken_lemm)

In [80]:
# Preview the test set after comment_text has been replaced by token lists.
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,"[yo, bitch, ja, rule, succesful, ever, whats, ..."
1,0000247867823ef7,"[rfc, title, fine, imo]"
2,00013b17ad220c46,"[source, zawe, ashton, lapland]"
3,00017563c3f7919a,"[look, back, source, information, updated, cor..."
4,00017695ad8997eb,"[nt, anonymously, edit, article]"


In [81]:
# Convert the pandas test frame into a Spark DataFrame for the saved ML models.
test_df = spark.createDataFrame(test)

In [82]:
# Check the column types Spark inferred from the pandas frame.
test_df.dtypes

[('id', 'string'), ('comment_text', 'array<string>')]

In [83]:
from pyspark.ml.pipeline import PipelineModel
# Load the fitted feature pipeline saved under models/tfidf-model and apply it
# to the tokenized test comments; the result feeds every classifier below.
load_pipeline_model = PipelineModel.load('models/tfidf-model') 
testData = load_pipeline_model.transform(test_df) 

In [84]:
# Print the schema to see which columns the feature pipeline added.
testData.printSchema()

root
 |-- id: string (nullable = true)
 |-- comment_text: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawCommentText: vector (nullable = true)
 |-- features: vector (nullable = true)



## Logistic regression (lr)

In [85]:
from pyspark.ml.classification import LogisticRegressionModel

# Score the featurised test set with the saved logistic-regression model.
load_lr_model = LogisticRegressionModel.load('models/lr-model')
Predictions = load_lr_model.transform(testData)

# Show up to 50 rows predicted as class 1, sorted descending on the
# probability vector, with each cell truncated to 40 characters.
# NOTE(review): in the saved output the descending sort orders rows by the
# first vector component (probability of class 0), so the least confident
# positives are listed first -- confirm this is the intended ordering.
(
    Predictions
    .where(Predictions['prediction'] == 1)
    .select("probability", "comment_text", "prediction")
    .sort("probability", ascending=False)
    .show(50, truncate=40)
)

+----------------------------------------+----------------------------------------+----------+
|                             probability|                            comment_text|prediction|
+----------------------------------------+----------------------------------------+----------+
|[0.49946342860910126,0.5005365713908988]|[wikipedia, sux, wikipedia, sux, wiki...|       1.0|
|   [0.498363102227831,0.501636897772169]|[rosie, fat, piece, fucking, filthy, ...|       1.0|
|[0.49768002572002984,0.5023199742799701]|[terrible, entry, adult, wikipedia, a...|       1.0|
|[0.49141204960358054,0.5085879503964195]|[damned, let, white, right, wing, wea...|       1.0|
| [0.4886520608111807,0.5113479391888193]|[kelapstick, penis, kelapstick, penis...|       1.0|
| [0.4865335206377926,0.5134664793622075]|[nguyen, cock, suckernguyen, cock, su...|       1.0|
|[0.47068202775750334,0.5293179722424967]|[soulja, boy, hoe, watch, crank, watc...|       1.0|
|[0.47068202775750334,0.5293179722424967]|[soulja,

## Decision tree (dt)

In [86]:
from pyspark.ml.classification import DecisionTreeClassificationModel

# Score the featurised test set with the saved decision-tree model.
load_dt_model = DecisionTreeClassificationModel.load('models/dt-model')
Predictions = load_dt_model.transform(testData)

# Show up to 50 rows predicted as class 1, sorted descending on the
# probability vector, with each cell truncated to 40 characters.
(
    Predictions
    .where(Predictions['prediction'] == 1)
    .select("probability", "comment_text", "prediction")
    .sort("probability", ascending=False)
    .show(50, truncate=40)
)

+----------------------------------------+----------------------------------------+----------+
|                             probability|                            comment_text|prediction|
+----------------------------------------+----------------------------------------+----------+
| [0.3333333333333333,0.6666666666666666]|[creation, history, fuck, new, page, ...|       1.0|
| [0.3333333333333333,0.6666666666666666]|[tushar, dam, invited, teahouse, styl...|       1.0|
| [0.3333333333333333,0.6666666666666666]|[list, living, centenarian, hello, st...|       1.0|
| [0.3333333333333333,0.6666666666666666]|[hello, utcursh, hi, thanks, ur, mess...|       1.0|
| [0.3333333333333333,0.6666666666666666]|[obviously, got, concerned, past, pos...|       1.0|
| [0.3333333333333333,0.6666666666666666]|[robot, filter, censored, error, repo...|       1.0|
| [0.3333333333333333,0.6666666666666666]|[help, living, biography, good, day, ...|       1.0|
| [0.3333333333333333,0.6666666666666666]|[dear, w

## Naive Bayes (nb)

In [87]:
from pyspark.ml.classification import NaiveBayesModel

# Score the featurised test set with the saved naive-Bayes model.
# `Predictions` is reassigned here, so the cells that follow operate on the
# naive-Bayes output when the notebook is run top to bottom.
load_nb_model = NaiveBayesModel.load('models/nb-model')
Predictions = load_nb_model.transform(testData)

# Show up to 50 rows predicted as class 1, sorted descending on the
# probability vector, with each cell truncated to 40 characters.
# NOTE(review): in the saved output the descending sort orders rows by the
# first vector component (probability of class 0), so the least confident
# positives are listed first -- confirm this is the intended ordering.
(
    Predictions
    .where(Predictions['prediction'] == 1)
    .select("probability", "comment_text", "prediction")
    .sort("probability", ascending=False)
    .show(50, truncate=40)
)

+----------------------------------------+----------------------------------------+----------+
|                             probability|                            comment_text|prediction|
+----------------------------------------+----------------------------------------+----------+
|   [0.499990057660058,0.500009942339942]|[article, europop, word, start, capit...|       1.0|
|[0.49997925978922575,0.5000207402107743]|[please, stop, recreating, silly, non...|       1.0|
| [0.4999438850371558,0.5000561149628442]|                           [wow, forget]|       1.0|
|[0.49975648894497193,0.5002435110550281]|                         [hi, new, site]|       1.0|
| [0.4996578676404087,0.5003421323595914]|                           [obama, joke]|       1.0|
| [0.4994630811108552,0.5005369188891448]|     [hello, fucker, hows, going, today]|       1.0|
| [0.4994304901275791,0.5005695098724209]|[stop, deleting, edits, heshe, transe...|       1.0|
|[0.49932139226049477,0.5006786077395052]|        

In [88]:
# Fetch the first ten probability vectors, rendered as strings, to the driver.
Predictions.select(Predictions['probability'].cast('string')).limit(10).collect()


[Row(probability='[4.515752350195426E-33,1.0]'),
 Row(probability='[0.9999995852939331,4.1470606702043055E-7]'),
 Row(probability='[0.9998817994274025,1.1820057259746486E-4]'),
 Row(probability='[0.9999999998491178,1.5088218002331627E-10]'),
 Row(probability='[0.9805452594517935,0.019454740548206517]'),
 Row(probability='[0.9999999984897419,1.510258120007907E-9]'),
 Row(probability='[0.9999999999775955,2.2404467895044735E-11]'),
 Row(probability='[0.5428896812795294,0.4571103187204707]'),
 Row(probability='[0.9999999999998426,1.5738403745516422E-13]'),
 Row(probability='[0.9999999999999998,1.2091414641713773E-16]')]

In [89]:
# A cast only builds a lazy Column expression; nothing is computed here.
Predictions.probability.cast('string')

Column<b'CAST(probability AS STRING)'>

In [90]:
# Keep a handle on the probability Column expression and display its repr.
pred = Predictions['probability']
pred

Column<b'probability'>

In [91]:
# Collect the whole prediction DataFrame (all columns) to the driver as a
# pandas DataFrame.
df = Predictions.toPandas()

In [92]:
# Preview the first ten collected prediction rows.
df.head(10)

Unnamed: 0,id,comment_text,rawCommentText,features,rawPrediction,probability,prediction
0,00001cee341fdb12,"[yo, bitch, ja, rule, succesful, ever, whats, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 5.182310097035273, 0.0, 0.0, 0...","[-1278.1944218687247, -1203.7166856063957]","[4.515752350195426e-33, 1.0]",1.0
1,0000247867823ef7,"[rfc, title, fine, imo]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-106.55935709326535, -121.25505251845811]","[0.9999995852939331, 4.1470606702043055e-07]",0.0
2,00013b17ad220c46,"[source, zawe, ashton, lapland]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-134.3841967966897, -143.4272061978247]","[0.9998817994274025, 0.00011820057259746486]",0.0
3,00017563c3f7919a,"[look, back, source, information, updated, cor...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-349.727295840228, -372.341817688502]","[0.9999999998491178, 1.5088218002331627e-10]",0.0
4,00017695ad8997eb,"[nt, anonymously, edit, article]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-62.06367842275167, -65.98369645653399]","[0.9805452594517935, 0.019454740548206517]",0.0
5,0001ea8717f6de06,"[thank, understanding, think, highly, would, r...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-173.68890023986253, -193.9998854986779]","[0.9999999984897419, 1.510258120007907e-09]",0.0
6,00024115d4cbde0f,"[please, add, nonsense, wikipedia, edits, cons...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-321.8753557642494, -346.39711648158374]","[0.9999999999775955, 2.2404467895044735e-11]",0.0
7,000247e83dcc1211,"[dear, god, site, horrible]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-113.43397145753094, -113.60595283219268]","[0.5428896812795294, 0.4571103187204707]",0.0
8,00025358d4737918,"[fool, believe, number, correct, number, lie, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1235.419255904756, -1264.8993433827068]","[0.9999999999998426, 1.5738403745516422e-13]",0.0
9,00026d1092fe71cc,"[double, redirects, fixing, double, redirects,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-602.3017340296163, -638.9531849434933]","[0.9999999999999998, 1.2091414641713773e-16]",0.0


In [104]:
# Type of a single scalar inside the first row's probability vector.
type(df['probability'].iloc[0].toArray()[0])

numpy.float64

In [114]:
# First component of the first row's probability vector.
df['probability'].iloc[0].toArray()[0]

4.515752350195426e-33

In [112]:
# Stack every row's probability vector into one 2-D NumPy array.
np.array(df['probability'].tolist())

array([[0.        , 1.        ],
       [0.99999959, 0.00000041],
       [0.9998818 , 0.0001182 ],
       ...,
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.00000468, 0.99999532]])

In [95]:
# Read the sample submission file from data/sample_submission.csv.
subm = pd.read_csv('data/sample_submission.csv')

In [96]:
# Preview the first rows of the sample submission.
subm.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5
