In [1]:
# Point findspark at the Spark installation under /usr/local/spark.
# NOTE(review): this presumably has to run before `import pyspark` so the
# package can be located -- keep the statement order as is.
import findspark
findspark.init("/usr/local/spark")
import pyspark
from pyspark import SparkContext
# Spark context used by the later cells as `sc`.
sc = SparkContext()
from pyspark.sql.session import SparkSession
# SQL session wrapping the context created above.
spark = SparkSession(sc)

In [None]:
import nltk
import nltk
import re
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Italian Snowball stemmer used for the final stemming pass.
stemmer = SnowballStemmer("italian")

# Fetch the NLTK resources used below: 'punkt' backs word_tokenize and
# 'stopwords' backs stopwords.words(). Without the second download a fresh
# environment raises LookupError on the next line.
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('italian'))

# Text file that is cleaned in place and then stemmed.
file_path = '/root/Desktop/News'


# Routine for cleaning documents
# ---------------------------------
def clean(path):
    """Normalise the text file at ``path`` in place and return the cleaned text.

    The file content is lower-cased, literal escape sequences and ``&nbsp``
    leftovers are blanked, every character that is not an ASCII letter or an
    Italian accented vowel is replaced by a space, stop words are removed and
    one-character tokens are dropped. The result is a single space-separated
    string, which overwrites the original file.

    Relies on the module-level ``stop_words`` collection and on
    ``word_tokenize``.

    :param path: path of the text file to clean (overwritten)
    :return: the cleaned text that was written back to ``path``
    """
    with open(path, 'r') as source:
        one_line = source.read().lower()

    # Special expressions of html format. These are the literal two-character
    # sequences (backslash + letter), not real control characters.
    to_replace = ['\\n', '\\t', '\\r', '\\', '&nbsp']
    for item in to_replace:
        one_line = one_line.replace(item, ' ')

    cleaned = ' '.join(word for word in one_line.split() if len(word) > 1)

    # Blank every remaining special character. The whitelist covers all the
    # Italian accented vowels, including 'ì' (e.g. "così", "lunedì").
    definitive = re.sub('[^a-zA-Zàèéìòù]', ' ', cleaned)

    # Remove the stop words, then drop fragments of a single character.
    word_tokens = word_tokenize(definitive)
    filtered_text = [w for w in word_tokens if w not in stop_words]
    cleaned = ' '.join(word for word in filtered_text if len(word) > 1)

    # The context manager closes the file; no explicit close() is needed.
    with open(path, 'w') as target:
        target.write(cleaned)

    return cleaned
# -----------------------------------------

# Clean the source file in place; the text is re-read from disk below.
cleaned_text = clean(file_path)

# Load the cleaned file back, putting one space in front of every line.
with open(file_path, 'r') as f:
    full_cleaned_text = "".join(" " + str(line) for line in f.readlines())

tokens = nltk.word_tokenize(full_cleaned_text)

# Stem every token; each stem is preceded by a single space, so a non-empty
# result starts with a space.
stemmed_text = "".join(" " + stemmer.stem(token) for token in tokens)

print(stemmed_text)


In [None]:
import numpy
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import re


def term_freq(k, wordlist):
    """
    Count how often each term occurs in ``wordlist``.

    :param k: key identifying the document; returned unchanged
    :param wordlist: iterable of terms
    :return: tuple ``(k, counts)`` where ``counts`` maps term -> float count
    """
    counts = {}
    for word in wordlist:
        if word in counts:
            counts[word] += 1.0
        else:
            counts[word] = 1.0
    return k, counts

def idf(n, docfreq):
    """Compute the inverse document frequency ``log10(n / docfreq)``.

    :param n: total number of documents
    :param docfreq: document frequency per term (scalar, sequence or array)
    :return: IDF value(s) with the same shape as ``docfreq``
    """
    # Force floating point: numpy.reciprocal on an integer array performs
    # integer division, which yields 0 for every count above 1 and therefore
    # -inf after the logarithm.
    docfreq = numpy.asarray(docfreq, dtype=float)
    return numpy.log10(n / docfreq)


def read_doc(line):
    """Turn one tab-separated review record into per-sentence word lists.

    Field 0 is used as the review id and field 5 as the review text. The text
    is split on "." and each sentence gets the id ``<review_id>_<index>``.
    Only sentences with more than 10 and fewer than 30 space-separated tokens
    are kept. Their words are lower-cased, stripped of Italian stop words,
    lemmatized, and words of 3 characters or fewer are dropped.

    :param line: one line of the review file
    :return: list of ``(sentence_id, words)`` tuples; empty when the line has
             fewer than 6 tab-separated fields
    """
    review = line.split("\t")
    # Skip blank or malformed records so that one bad line does not raise
    # IndexError and abort the whole job.
    if len(review) < 6:
        return []

    # NOTE(review): WordNetLemmatizer is presumably English-oriented -- confirm
    # that it is the intended lemmatizer for Italian text.
    lmtz = WordNetLemmatizer()
    # A set gives constant-time stop-word lookups.
    sw = set(stopwords.words('italian'))
    review_id = review[0]
    result = []
    for idx, sent in enumerate(review[5].split(".")):
        sent_len = len(sent.split(" "))
        if not 10 < sent_len < 30:
            continue
        # Only runs of ASCII letters are extracted, so accented characters
        # act as word separators here.
        words = [w.lower() for w in re.findall(r'[a-zA-Z]+', sent)]
        words = [lmtz.lemmatize(w) for w in words if w not in sw]
        words = [w for w in words if len(w) > 3]
        result.append((review_id + '_' + str(idx), words))
    return result


# This is used to keep the review_id and original sentences from reviews
def read_reviews(line):
    """Split one tab-separated review record into ``(sentence_id, sentence)`` tuples.

    Field 0 is used as the review id and field 5 as the review text. The text
    is split on "." and each sentence gets the id ``<review_id>_<index>``.
    Sentences are returned unmodified.

    :param line: one line of the review file
    :return: list of ``(sentence_id, sentence)`` tuples; empty when the line
             has fewer than 6 tab-separated fields
    """
    review = line.split("\t")
    # Skip blank or malformed records so that one bad line does not raise
    # IndexError and abort the whole job.
    if len(review) < 6:
        return []

    review_id = review[0]
    return [
        (review_id + '_' + str(idx), sent)
        for idx, sent in enumerate(review[5].split("."))
    ]

def extract_sentences(VT, reviews, columnheader, k=10, n=5):
    """
    Pick, for each of the first ``k`` rows of ``VT``, the ``n`` entries with
    the largest values and look up the matching sentences.

    :param VT: right singular matrix of the SVD (2-D array)
    :param reviews: object with a ``lookup(key)`` method, keyed by sentence id
    :param columnheader: sequence mapping a column index of VT to a sentence id
    :param k: number of concepts (rows of VT) to use
    :param n: number of sentences per concept
    :return: one list per concept holding the ``lookup`` results, ordered
             from highest to lowest value
    """
    # argsort is ascending: keep the last n columns, then reverse each row.
    top_indices = numpy.fliplr(VT[:k, :].argsort()[:, -n:])
    return [
        [reviews.lookup(columnheader[col]) for col in row]
        for row in top_indices
    ]

def extract_keywords(VT, rowheader, k = 10, n = 5):
    """
    Pick, for each of the first ``k`` rows of ``VT``, the ``n`` entries with
    the largest values and return the matching ``rowheader`` labels.

    :param VT: right singular matrix of the SVD (2-D array)
    :param rowheader: sequence mapping a column index of VT to a label
    :param k: number of concepts (rows of VT) to use
    :param n: number of labels per concept
    :return: one list of labels per concept, ordered from highest to lowest value
    """
    # argsort is ascending: keep the last n columns, then reverse each row.
    top_indices = numpy.fliplr(VT[:k, :].argsort()[:, -n:])
    return [[rowheader[col] for col in row] for row in top_indices]

In [None]:
import numpy



# Takes individual sentence from each review
# NOTE(review): `file` is not assigned in any of the visible cells -- confirm
# which input path is intended before running this cell on a fresh kernel.
documents = sc.textFile(file).flatMap(read_doc)
reviews = sc.textFile(file).flatMap(read_reviews)

# Term Frequency: (sentence_id, wordlist) -> (sentence_id, {term: count}).
# Indexing the pair instead of unpacking it in the lambda signature keeps the
# cell valid on Python 3 as well as Python 2.
tf = documents.map(lambda kv: term_freq(kv[0], kv[1]))

# Get the vocabulary of the documents: the sorted array of distinct terms.
# De-duplicating on the cluster avoids concatenating every key list on the
# driver, and does not rely on dict.keys() supporting "+".
vocabulary = tf.flatMap(lambda kv: list(kv[1].keys())).distinct().collect()
vocabulary = numpy.unique(vocabulary)

def termfreqmatrix(tfdict):
    """Return the counts of ``tfdict`` laid out in ``vocabulary`` order.

    Terms of the module-level ``vocabulary`` that are missing from ``tfdict``
    get 0.
    """
    row = []
    for word in vocabulary:
        row.append(tfdict.get(word, 0))
    return row

def docfreqmatrix(tfdict):
    """Return a 1.0/0.0 presence flag for every term of ``vocabulary``.

    A term is flagged 1.0 when its count in ``tfdict`` is greater than zero,
    and 0.0 otherwise. Uses the module-level ``vocabulary``.
    """
    flags = []
    for word in vocabulary:
        flags.append(float(tfdict.get(word, 0) > 0))
    return flags

#Create Doc Frequency vector: per term, the number of sentences containing it
dfvector = tf.map(lambda kv: numpy.array(docfreqmatrix(kv[1]))).reduce(lambda x, y: x + y)

#Create Term Frequency matrix
# Pairs are indexed instead of unpacked in the lambda signature so the cell is
# valid on Python 3 as well as Python 2.
tf = tf.map(lambda kv: (kv[0], termfreqmatrix(kv[1]))).sortByKey()
tfmatrix = tf.values()
columnheader = tf.keys().collect()
rowheader = vocabulary

#----------------------------------------------

# Preparing the matrices(tfidf from tf matrix and idf vector)
# After the transpose, tfmatrix has one row per term and one column per sentence.
tfmatrix = numpy.array(numpy.transpose(tfmatrix.collect()))
idfvector = idf(len(columnheader), dfvector)
# Column vector, so the product below scales every term row by its IDF.
idfvector = numpy.reshape(numpy.array(idfvector), (-1, 1))
tfidfMatrix = tfmatrix * idfvector
result = []
reviews.cache()

# Singular Value Decomposition on the tfidf matrix
# Summary Keywords - Abstraction
U, S, VT = numpy.linalg.svd(tfidfMatrix.T, full_matrices=False)
concepts = extract_keywords(VT, rowheader)
for i, concept in enumerate(concepts):
    summary_line = '[Concept '+str(i+1)+'] :\t'+str(concept)
    result.append(summary_line)
    print(summary_line)

sc.parallelize(result).coalesce(1).saveAsTextFile("output-lsa/")