In [1]:
import nltk
import os
import pickle
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from PyRouge.pyrouge import Rouge

In [2]:
# Load the pickled dataset that every later cell reads through `df`.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only open
# pickle files that come from a trusted source.
with open('pickles/test.pickle','rb') as data:
    df = pickle.load(data)

In [3]:
def summarize(triplets,sentences,processed_sentences,random_state=None):
    """Build an extractive summary by clustering sentences and keeping one per cluster.

    The triplet strings are TF-IDF vectorised and grouped with KMeans into
    roughly one cluster per three rows (never fewer than one cluster). Inside
    each cluster the processed sentences are vectorised again and the
    sentence whose cosine-similarity row sum is largest is kept; on a tie the
    later sentence wins. The kept sentences are emitted in document order.

    Parameters
    ----------
    triplets : sequence of str
        Text used for clustering. Cluster labels are looked up by sentence
        position, so this is expected to hold one entry per sentence.
    sentences : sequence of str
        Original sentences; these are what the summary is made of.
    processed_sentences : sequence of str
        Pre-processed form of each sentence, used for the in-cluster
        similarity ranking. Indexed in parallel with ``sentences``.
    random_state : int or None, optional
        Forwarded to KMeans. Pass an int to make the clustering (and hence
        the summary) repeatable between runs; None keeps the unseeded
        behaviour.

    Returns
    -------
    str
        The selected sentences, each followed by one space. An empty string
        when ``sentences`` is empty.
    """
    # Nothing to summarise: leave before the vectoriser is handed empty input.
    if len(sentences) == 0:
        return ''

    tfidf = TfidfVectorizer(encoding = 'utf-8',
                        stop_words = None,
                        lowercase = False,
                        min_df = 1,
                        max_df = 1.0,
                        norm = 'l2',
                        sublinear_tf = True)
    # toarray() yields a plain ndarray; todense() yields np.matrix, which
    # recent scikit-learn estimators refuse to accept.
    features = tfidf.fit_transform(pd.Series(triplets)).toarray()

    # One cluster per three rows, but at least one so that documents with
    # fewer than three rows do not ask KMeans for zero clusters.
    n_clusters = max(1, features.shape[0] // 3)
    model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=50,
                   n_init=10, random_state=random_state)
    model.fit(features)
    clusters = model.labels_.tolist()

    # cluster label -> processed sentences in that cluster (document order)
    cluster_members = {}
    # (cluster label, position inside the cluster) -> sentence index
    member_to_sentence = {}
    for sent_idx, _ in enumerate(sentences):
        label = clusters[sent_idx]
        members = cluster_members.setdefault(label, [])
        member_to_sentence[(label, len(members))] = sent_idx
        members.append(processed_sentences[sent_idx])

    result_index = []
    for label, members in cluster_members.items():
        best_pos = 0
        if len(members) > 1:
            # Refit the vectoriser on this cluster only, then rank members by
            # how similar they are to the rest of the cluster.
            cos_sim_matrix = cosine_similarity(tfidf.fit_transform(members))
            best_score = 0
            for pos, row in enumerate(cos_sim_matrix):
                row_total = 0
                for col in row:
                    row_total += col
                # >= keeps the later sentence when two rows tie.
                if row_total >= best_score:
                    best_score = row_total
                    best_pos = pos
        # A single-member cluster needs no ranking; skipping the refit also
        # avoids an empty-vocabulary failure on a lone token-less sentence.
        result_index.append(member_to_sentence[(label, best_pos)])

    # Restore document order before stitching the summary together.
    result_index.sort()

    return ''.join(sentences[idx] + ' ' for idx in result_index)

In [4]:
# Collect one generated summary per document, in DataFrame row order.
summary_set = []

# Per-document inputs as plain Python lists.
sentences, triplets, processed_sentences = (
    df[column].tolist() for column in ('Sentences', 'Triplets', 'Processed')
)

for i, doc_sentences in enumerate(sentences):
    doc_summary = summarize(triplets[i], doc_sentences, processed_sentences[i])
    summary_set.append(doc_summary)

In [5]:
def evaluate(cand_summary,ref_summary):
    """Average the ROUGE-L scores of candidate summaries against references.

    Each candidate ``cand_summary[i]`` is scored against ``ref_summary[i]``
    with ``Rouge().rouge_l``; the three returned values are treated as
    precision, recall and F-score and averaged over all candidates. A
    progress line is printed for every tenth pair.

    Parameters
    ----------
    cand_summary : sequence
        Generated summaries; its length decides how many pairs are scored.
    ref_summary : indexable
        Reference summaries, looked up with the same ``i`` as the candidates.

    Returns
    -------
    tuple
        ``(avg_precision, avg_recall, avg_fscore)``. When ``cand_summary`` is
        empty there is nothing to average and ``(0.0, 0.0, 0.0)`` is returned
        instead of dividing by zero.
    """
    total = len(cand_summary)
    # Guard the averages below: with no candidates they would divide by zero.
    if total == 0:
        return 0.0, 0.0, 0.0

    rouge = Rouge()
    precision = []
    recall = []
    f_score = []
    for i in range(total):
        p, r, f = rouge.rouge_l(cand_summary[i], ref_summary[i])
        precision.append(p)
        recall.append(r)
        f_score.append(f)
        if i % 10 == 0:
            print(str(i)+"/"+str(total)+" completed")

    avg_precision = sum(precision)/total
    avg_recall = sum(recall)/total
    avg_fscore = sum(f_score)/total

    return avg_precision,avg_recall,avg_fscore

In [6]:
# Score every generated summary against its reference and print the averages.
precision, recall, f_score = evaluate(summary_set, df['Summary'])
print(
    f"Average Precision is :{precision}"
    f"\nAverage Recall is :{recall}"
    f"\nAverage F Score is :{f_score}"
)

0/100 completed
10/100 completed
20/100 completed
30/100 completed
40/100 completed
50/100 completed
60/100 completed
70/100 completed
80/100 completed
90/100 completed
Average Precision is :0.9711070159332471
Average Recall is :0.7145002985323772
Average F Score is :0.8145491700741876


In [13]:
# Spot check: print the 'Content' entry for document 17 so it can be compared
# with the generated and reference summaries printed in the following cells.
print(df['Content'][17])

Strong demand triggers oil rally.
Crude oil prices surged back above the $47 a barrel mark on Thursday after an energy market watchdog raised its forecasts for global demand.

The International Energy Agency (IEA) warned demand for Opec's crude in the first quarter would outstrip supply. The IEA raised its estimate of 2005 oil demand growth by 80,000 barrels a day to 84 million barrels a day. US light crude rose $1.64 to $47.10, while Brent crude in London gained $1.32 to $44.45.

The Paris-based IEA watchdog, which advises industrialized nations on energy policy, said the upward revision was due to stronger demand from China and other Asian countries. The fresh rally in crude prices followed gains on Wednesday which were triggered by large falls in US crude supplies following a cold spell in North America in January. The US Department of Energy reported that crude stockpiles had fallen 1m barrels to 294.3m. On top of that, ongoing problems for beleaguered Russian oil giant Yukos have 

In [12]:
print(summary_set[17])        # generated summary for document 17 (output of summarize)

Crude oil prices surged back above the $47 a barrel mark on Thursday after an energy market watchdog raised its forecasts for global demand. The International Energy Agency (IEA) warned demand for Opec's crude in the first quarter would outstrip supply. The fresh rally in crude prices followed gains on Wednesday which were triggered by large falls in US crude supplies following a cold spell in North America in January. 


In [11]:
print(df['Summary'][17])       # reference summary for document 17, from the dataset's 'Summary' column

Crude oil prices surged back above the $47 a barrel mark on Thursday after an energy market watchdog raised its forecasts for global demand.The US Department of Energy reported that crude stockpiles had fallen 1m barrels to 294.3m.The International Energy Agency (IEA) warned demand for Opec's crude in the first quarter would outstrip supply.The IEA raised its estimate of 2005 oil demand growth by 80,000 barrels a day to 84 million barrels a day.


In [None]:
# Ignore the below cells (Failed DBSCAN)
# Abandoned experiment: cluster the feature matrix with DBSCAN instead of KMeans.
# NOTE(review): in the cells visible here `features` is only assigned inside
# summarize(), so it is not defined at notebook scope and this cell would
# raise NameError on a fresh kernel. Consider removing it or rebuilding
# `features` first.

# DBSCAN parameters: eps and min_samples as passed to the constructor below.
ep = 1.2
min_samples = 3

dbscan = DBSCAN(eps=ep,min_samples=min_samples)

db = dbscan.fit(features)

# Bare expression so the notebook displays the fitted cluster labels.
db.labels_