In [1]:
import findspark
# findspark.init puts the Spark installation's Python libraries on sys.path,
# so it has to run before the first `import pyspark`.
findspark.init("/usr/local/spark")
import pyspark
from pyspark import SparkContext
from pyspark.sql.session import SparkSession

# getOrCreate() reuses the context/session that already exists in this kernel,
# so re-running the cell no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [20]:
import nltk
import nltk
import re
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Stemmer used further down to reduce the cleaned Italian tokens to their stems.
stemmer = SnowballStemmer("italian")

# word_tokenize relies on the 'punkt' models and stopwords.words() on the
# 'stopwords' corpus; both must be present locally, otherwise NLTK raises
# LookupError on a fresh machine.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' - confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('italian'))

# Text file that clean() rewrites in place and that the Spark cell reads.
file_path = '/root/Desktop/News'


# Routine for cleaning documents
# ---------------------------------
def clean(path, encoding=None):
    """Normalise the text file at ``path`` in place and return the cleaned text.

    Processing steps, in order:

    1. read the whole file and lower-case it;
    2. replace the literal sequences ``\\n``, ``\\t``, ``\\r``, any remaining
       backslash and ``&nbsp`` with a space;
    3. drop whitespace-separated tokens of length 1;
    4. replace every character that is not an ASCII letter or one of the
       accented vowels ``à è é ì ò ù`` with a space;
    5. tokenize with ``word_tokenize`` and drop tokens found in the
       module-level ``stop_words`` set as well as tokens of length 1;
    6. overwrite ``path`` with the remaining tokens joined by single spaces.

    Parameters
    ----------
    path : str
        File to read and then overwrite.
    encoding : str, optional
        Encoding passed to ``open`` for both the read and the write. The
        default ``None`` keeps the platform default.

    Returns
    -------
    str
        The cleaned text that was written back to ``path``.
    """
    with open(path, 'r', encoding=encoding) as source:
        text = source.read().lower()

    # Literal (escaped) control sequences and HTML leftovers. Order matters:
    # the two-character sequences must be replaced before the bare backslash.
    for item in ('\\n', '\\t', '\\r', '\\', '&nbsp'):
        text = text.replace(item, ' ')

    text = ' '.join(word for word in text.split() if len(word) > 1)

    # Blank out everything that is not a letter. 'ì' is part of the whitelist
    # so that words such as "così" are not truncated.
    text = re.sub('[^a-zA-Zàèéìòù]', ' ', text)

    # Remove the stopwords (and any one-letter leftovers) from the token list.
    word_tokens = word_tokenize(text)
    cleaned = ' '.join(
        w for w in word_tokens if w not in stop_words and len(w) > 1
    )

    # The `with` block closes the file; no explicit close() is needed.
    with open(path, 'w', encoding=encoding) as target:
        target.write(cleaned)

    return cleaned
# -----------------------------------------

# clean() overwrites file_path with the cleaned text, which is read back below.
cleaned_text = clean(file_path)

# Every line of the file is prefixed with a single space and concatenated.
with open(file_path, 'r') as f:
    full_cleaned_text = "".join(" " + str(line) for line in f)

tokens = nltk.word_tokenize(full_cleaned_text)

# Stem each token; every stem is preceded by one space, so the resulting
# string starts with a space.
stemmed_text = "".join(" " + stemmer.stem(token) for token in tokens)

print(stemmed_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
 id objectid fa af titl scuol manc prof matemat vuot circ mil cattedr text emergt insegn matemat scuol italian aver fatt cont utilizz dat minister istruzion lanc allarm poch prof matemat quind cattedr ruol rimang vuot problem avvert soprattutt scuol med quest anno cre voragin post rimast vacant dop trasfer soprattutt nord lombard sol arriv par nemmen prossim immission ruol colm vuot vincitor concors sufficient graduator esaur class matemat scienz secondar prim grad già svuot temp concors orma quas complet tutt ital dispon nomin circ professor iscritt graduator numer insufficient ricopr quas quattromil post liber spieg oltre post assegn quind necessar ricorr supplent mancanz alcun professional scuol frutt svil anni valor docenz dov scars attenzion stat confront sistem istruzion termin invest capac vision cos ministr istruzion valer fedel problem poch laur matemat merc lavor sop

In [21]:
from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import CountVectorizer
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.ml.feature import StopWordsRemover

# One Row per line of the cleaned file: `idd` is the zero-based line index
# assigned by zipWithIndex, `cleanwords` the line split on single spaces.
# NOTE(review): clean() writes its result as one space-joined string, so
# unless the file contains line breaks from elsewhere this presumably yields
# a single document - confirm that running LDA over it is intended.
data = sc.textFile(file_path).zipWithIndex().map(
    lambda line_idx: Row(idd=line_idx[1], cleanwords=line_idx[0].split(" "))
)
docDF = spark.createDataFrame(data)

# Bag-of-words counts. The estimator gets its own name so that it no longer
# shadows the `Vector` class imported from pyspark.mllib.linalg.
count_vectorizer = CountVectorizer(inputCol="cleanwords", outputCol="vectors")
model = count_vectorizer.fit(docDF)
result = model.transform(docDF)

# mllib's LDA expects an RDD of [document id, mllib vector] pairs, hence the
# conversion of the ml vectors with Vectors.fromML.
corpus = (
    result.select("idd", "vectors")
    .rdd.map(lambda row: [row[0], Vectors.fromML(row[1])])
    .cache()
)

# Cluster the documents into three topics using LDA.
# A fixed seed makes the topics reproducible across runs.
LDA_SEED = 42
ldaModel = LDA.train(
    corpus, k=3, maxIterations=100, optimizer='online', seed=LDA_SEED
)
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

wordNumbers = 10  # number of words per topic

# describeTopics returns, per topic, a pair (term indices, term weights);
# it is computed once and shared by `weight` and `topicIndices`.
weight = ldaModel.describeTopics(maxTermsPerTopic=wordNumbers)
topicIndices = sc.parallelize(weight)

def topic_render(topic, vocabulary=None, max_terms=None):
    """Translate the term indices of one topic into the actual words.

    Parameters
    ----------
    topic : sequence
        Pair whose first element is the sequence of term indices of the
        topic; the remaining elements are not read here.
    vocabulary : sequence of str, optional
        Index -> word lookup. Defaults to the notebook-level ``vocabArray``.
    max_terms : int, optional
        Maximum number of words to return. Defaults to the notebook-level
        ``wordNumbers``.

    Returns
    -------
    list of str
        The words for the first ``max_terms`` term indices. A topic with
        fewer indices yields a shorter list instead of raising IndexError.
    """
    vocab = vocabArray if vocabulary is None else vocabulary
    limit = wordNumbers if max_terms is None else max_terms
    # Slicing (rather than indexing over a fixed range) tolerates topics that
    # carry fewer than `limit` terms, e.g. when the vocabulary is small.
    return [vocab[term_id] for term_id in topic[0][:limit]]

# Resolve the term indices of every topic to words and bring them to the driver.
topics_final = topicIndices.map(topic_render).collect()

# Print each topic as a header followed by "<word>  <weight>" lines; the
# weights come from the second element of the matching entry in `weight`.
for topic_idx, topic_terms in enumerate(topics_final):
    print("Topic" + str(topic_idx) + ":")
    topic_weights = weight[topic_idx][1]
    for term_pos, term in enumerate(topic_terms):
        print(term + '  ' + str(topic_weights[term_pos]))
    print('\n')

Topic0:
matematica  0.009560573221092942
scuola  0.00808030602233562
posti  0.007174175578550224
vuote  0.00701157776472776
quasi  0.006984416666260308
capacità  0.006773406498557413
istruzione  0.0066294229384543345
ben  0.006600503209852239
aver  0.006556311017571411
reclutano  0.0065142697691635085


Topic1:
frutto  0.00755124839871547
voragine  0.007512074134983625
lombardia  0.007231254609200753
professionalità  0.007015777869361581
fedeli  0.006789730066915294
date  0.006683804983393134
ministra  0.006668940226489626
confronti  0.0066030669225653794
fa  0.00660271421692034
pare  0.006571353930747912


Topic2:
pare  0.006934124243330362
settentrionali  0.006785717924896101
primo  0.006783615605131653
concorsi  0.0067461302177306
tutta  0.006697932883073028
svilimento  0.006627104343063499
scienze  0.0066031510387970435
anni  0.0065838917919853
resume  0.006506316001530461
link  0.0064634905197354185


