In [1]:

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import HashingTF, IDF, Tokenizer


# Spark configuration: local mode ("local[*]"), application name, and a
# 15g driver-memory request. Each SparkConf setter returns the conf itself,
# so the calls are chained.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("CENG790-Project")
    .set("spark.driver.memory", "15g")
)

# Create (or reuse) the session, then turn on the Arrow option for PySpark.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

22/01/15 23:44:03 WARN Utils: Your hostname, bhdemirbilek resolves to a loopback address: 127.0.1.1; using 10.1.46.97 instead (on interface eno1)
22/01/15 23:44:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/01/15 23:44:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:

# # Save the OSCAR dataset (28 GB) in JSON format (execute only once).
# from datasets import load_from_disk
# dataset = load_from_disk("lang_detected")["train"]
# # Set num_proc according to your CPU count; num_proc=20 means 20 workers run in parallel.
# dataset.to_json("dataset_json", num_proc=20)

In [3]:
# Load the full dataset from "dataset.json" into a Spark DataFrame.
# NOTE(review): the export cell above writes to "dataset_json" — confirm that
# "dataset.json" is the intended input path.
df_json = spark.read.format("json").load("dataset.json")

                                                                                

In [5]:
# Take a 10,000-row subset for fast experimentation, preview ten rows, and
# persist the subset as JSON (replacing any previous output).
small_df = df_json.limit(10000)
small_df.show(n=10)
small_df.write.format("json").mode("overwrite").save("dataset_small.json")



                                                                                

+---+----+--------------------+
| id|lang|                text|
+---+----+--------------------+
|  0|  tr|Son yıllarda görü...|
|  1|  tr|Şehrin karmaşası ...|
|  2|  tr|2010 Yılında Mard...|
|  3|  tr|29Ekim 2009 2010 ...|
|  4|  tr|Yüksek İslam Şura...|
|  5|  tr|Oncelıkle bu etkı...|
|  6|  tr|Mavi-Mi Sanat Mer...|
|  7|  tr|Türkiye Futbol Fe...|
|  8|  tr|anlami-nedir.com'...|
|  9|  tr|Kepez Belediye Ba...|
+---+----+--------------------+
only showing top 10 rows





In [None]:
# Reload the persisted subset, keeping only the language tag and the raw text.
small_df2 = (
    spark.read.format("json")
    .load("dataset_small.json")
    .select("lang", "text")
)
# Optional preview:
# small_df2.show(10)

In [None]:

# --- TF-IDF feature extraction -------------------------------------------
# Declare the three stages first: text -> words -> rawFeatures -> features.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
# CountVectorizer is an alternative way to obtain term-frequency vectors.
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Apply the stages in order. IDF is an estimator, so it is fitted on the
# term-frequency vectors before it can rescale them.
wordsData = tokenizer.transform(small_df2)
featurizedData = hashingTF.transform(wordsData)
idfModel = idf.fit(featurizedData)
rescaledDataWithLang = idfModel.transform(featurizedData)


In [None]:
# Echo the TF-IDF DataFrame as the cell's output for a quick inspection.
rescaledDataWithLang

In [None]:
from pyspark.ml.feature import StringIndexer

# Encode the string column "lang" as a numeric "label" column.
# The indexer is fitted and applied on the same DataFrame.
indexer = StringIndexer(inputCol="lang", outputCol="label")
lang_index_model = indexer.fit(rescaledDataWithLang)
rescaledData = lang_index_model.transform(rescaledDataWithLang)


In [None]:
# Print the number of rows, then echo the DataFrame as the cell's output.
print(rescaledData.count())
rescaledData


In [None]:
# Persist only the model inputs (feature vector and numeric label) as JSON,
# replacing any previous output at that path.
model_inputs = rescaledData.select("features", "label")
model_inputs.write.format("json").mode("overwrite").save("dataset_small_rescaled.json")

In [None]:
from pyspark.ml.linalg import VectorUDT
from pyspark.sql.types import DoubleType, StructField, StructType

# Explicit schema for the persisted data: "features" is declared as an ML
# vector (VectorUDT) and "label" as a double; both are declared non-nullable.
schema = StructType(
    [
        StructField("features", VectorUDT(), False),
        StructField("label", DoubleType(), False),
    ]
)

# Read the persisted features back with that schema and preview ten rows.
rescaledData = spark.read.format("json").schema(schema).load("dataset_small_rescaled.json")
rescaledData.show(10)

In [None]:
# Show one (label, features) row, then echo the DataFrame as the cell's output.
rescaledData.select("label", "features").show(1)
rescaledData

In [None]:
# Split the (label, features) rows 80/20 into training and test sets.
# A fixed seed keeps the split reproducible across kernel restarts; without
# one, every run draws a different partition and results cannot be compared.
(trainingData, testData) = rescaledData.select("label", "features").randomSplit(
    [0.8, 0.2], seed=42
)

In [None]:
# Echo the training split as the cell's output.
trainingData

In [None]:
# Load a LIBSVM-formatted multiclass sample file into `training`.
# NOTE(review): the model below is fitted on this sample file, not on the
# `trainingData` split built from the project dataset — confirm this is intended.
training = (
    spark.read
    .format("libsvm")
    .load("sample_multiclass_classification_data.txt")
)
training

In [None]:

from pyspark.ml.classification import LogisticRegression

# Logistic regression with elastic-net regularisation.
lr = LogisticRegression(
    maxIter=100,          # upper bound on optimiser iterations
    regParam=0.3,         # regularisation strength
    elasticNetParam=0.8,  # elastic-net mixing parameter
)

# Fit the estimator on the `training` DataFrame.
lrModel = lr.fit(training)



In [None]:
# Report the fitted parameters of the multinomial logistic regression model.
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

# Summary object exposed by the fitted model; all metrics below come from it.
trainingSummary = lrModel.summary

# Objective value per iteration.
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective_value in objectiveHistory:
    print(objective_value)

# Per-label metrics. Each entry is (heading, summary attribute name); the
# attribute is read only after its heading is printed, so the output order
# matches a hand-written sequence of print loops.
by_label_properties = (
    ("False positive rate by label:", "falsePositiveRateByLabel"),
    ("True positive rate by label:", "truePositiveRateByLabel"),
    ("Precision by label:", "precisionByLabel"),
    ("Recall by label:", "recallByLabel"),
)
for heading, prop_name in by_label_properties:
    print(heading)
    for label_idx, value in enumerate(getattr(trainingSummary, prop_name)):
        print("label %d: %s" % (label_idx, value))

# The F-measure is exposed as a method rather than a property, so it is
# handled separately.
print("F-measure by label:")
for label_idx, value in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (label_idx, value))

# Overall accuracy and weighted metrics.
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall

overall_metrics = (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall)
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % overall_metrics)