In [2]:
import re
import numpy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [5]:
import pandas as pd
data=pd.read_csv("scisumm.csv"
                 ,nrows=201)
data.head()

Unnamed: 0,text,summary
0,TnT - A Statistical Part-Of-Speech Tagger Trig...,TnT - A Statistical Part-Of-Speech Tagger\nTri...
1,Mildly Non-Projective Dependency Structures Sy...,Mildly Non-Projective Dependency Structures\nS...
2,Using Corpus Statistics And WordNet Relations ...,Using Corpus Statistics And WordNet Relations ...
3,Automatic Labeling Of Semantic Roles present a...,Automatic Labeling Of Semantic Roles\nWe prese...
4,Generative Models For Statistical Parsing With...,Generative Models For Statistical Parsing With...


In [6]:
stop_words = set(stopwords.words('english'))
def TextCleaning(text,num):
    text = text.lower()
    #cleantext = re.sub("\(.*?\)", '', text)
    cleantext = re.sub("[0-9]", '', text)
    cleantext = re.sub("(\.\.+)", ' ',cleantext)
    cleantext = re.sub("(--+)", ' ',cleantext)
    cleantext = re.sub("(~~+)", ' ',cleantext)
    cleantext = re.sub("[<>()|&©ø\[\]\'\";~*]", ' ',cleantext)
    cleantext = re.sub("(\+\++)", ' ',cleantext)
    cleantext = re.sub("(__+)", ' ',cleantext)
    cleantext = re.sub("e.g.", '',cleantext)
    cleantext = re.sub("i.e.,", '',cleantext)
    cleantext = re.sub("acc.", '',cleantext)
    #cleantext = re.sub("[^a-zA-Z]", ' ',cleantext)
    cleantext = re.sub("(\s+)",' ',cleantext)
    if(num==0):
        tokens = [w for w in cleantext.split() if not w in stop_words]
    else:
        tokens=cleantext.split()
    long_words=[]
    for i in tokens:
        if len(i)>2:                                              
            long_words.append(i)   
    return (" ".join(long_words)).strip()

In [7]:
cleaned_text = []
for t in data['text']:
    cleaned_text.append(TextCleaning(t,2)) 
#call the function
cleaned_summary = []
for t in data['summary']:
    cleaned_summary.append(TextCleaning(t,0))
data['cleaned_text']=cleaned_text
data['cleaned_summary']=cleaned_summary
data.dropna(axis=0,inplace=True)
data.iloc[[0],[2]]

Unnamed: 0,cleaned_text
0,tnt statistical part-of-speech tagger trigrams...


In [38]:
# define data set and parameters
raw_data = data['cleaned_text'][0]
ps = PorterStemmer()
nltk_stop_words = set(stopwords.words('english'))
cluster_count = 8

sentences = sent_tokenize(raw_data)
processedSentences = sentences

In [102]:
# create tfidf matrix from the processed sentences
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(processedSentences)

# cluster our tokenized sentences into 10 groups
kMeansCluster = KMeans(n_clusters=cluster_count)
kMeansCluster.fit(tfidf_matrix)
kmeansmodel = kMeansCluster.fit(tfidf_matrix)
clusters = kMeansCluster.labels_.tolist()


sentenceDictionary = {}
for idx, sentence in enumerate(sentences):
    sentenceDictionary[idx] = {}

    sentenceDictionary[idx]['cluster'] = clusters[idx]
    sentenceDictionary[idx]['stemmed'] = processedSentences[idx]

# Create new dictionary that contains 1 entry for each cluster
# each key in dictionary will point to array of sentences, all of which belong to that cluster
# we attach the index to the sentenceDictionary object so we can recall the original sentence
clusterDictionary = {}
for key, sentence in sentenceDictionary.items():
    if sentence['cluster'] not in clusterDictionary:
        clusterDictionary[sentence['cluster']] = []
    clusterDictionary[sentence['cluster']].append(sentence['stemmed'])
    sentence['idx'] = len(clusterDictionary[sentence['cluster']]) - 1


# For each cluster of sentences,
# Find the sentence with highet cosine similarity over all sentences in cluster
maxCosineScores = {}
for key, clusterSentences in clusterDictionary.items():
    maxCosineScores[key] = {}
    maxCosineScores[key]['score'] = 0
    tfidf_matrix = vectorizer.fit_transform(clusterSentences)
    cos_sim_matrix = cosine_similarity(tfidf_matrix)
    for idx, row in enumerate(cos_sim_matrix):
        sum = 0
        for col in row:
            sum += col
        if sum > maxCosineScores[key]['score']:
            maxCosineScores[key]['score'] = sum
            maxCosineScores[key]['idx'] = idx

# for every cluster's max cosine score,
# find the corresponding original sentence
resultIndices = []
i = 0
for key, value in maxCosineScores.items():
    cluster = key
    idx = value['idx']
    stemmedSentence = clusterDictionary[cluster][idx]
# key corresponds to the sentences index of the original document
# we will use this key to sort our results in order of original document
    for key, value in sentenceDictionary.items():
        if value['cluster'] == cluster and value['idx'] == idx:
            resultIndices.append(key)

resultIndices.sort()

# Iterate over sentences and construct summary output
result = ''
for idx in resultIndices:
    result += sentences[idx] + ' '

print(result)

the suffix strong predictor for word classes, words the wall street journal part the penn treebank ending able are adjectives the cases fashionable, variable the rest are nouns cable, variable the probability distribution for particular suffix generated from all words the training set that share the same suffix some predefined maximum lh. use context-independent approach for did for the contextual wts turned out good choice set all the standard deviation the unconditioned maximum likelihood probabilities the tags the training corpus, set for all using tagset tags and the average this usually yields values the range use different estimates for uppercase and lowercase words, maintain two different suffix tries depending the capitalization the word. the tests are performed partitions the corpora that use training set and test set, that the test data guaranteed unseen during training. tagging racies for the negra corpus are shown table figure shows the learning curve the tagger, the racy d

In [30]:
from rouge_score import rouge_scorer

reference = data.loc[0,'cleaned_summary']

hypotheses = str(result)
reference = str(reference)

scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(hypotheses,
                      reference)
print(hypotheses)
print("____________________")
print(reference)
print(scores)

additionally, present results the tagger the negra corpus brants al., and the penn treebank marcus al., the penn treebank results reported here for the markov model approach are least equivalent those reported for the maximum entropy approach ratnaparkhi, for comparison other taggers, the reader referred zavrel and daelemans, tnt uses second order markov models for part-ofspeech tagging. define maximum likelihood probability zero the corresponding nominators and denominators are zero. use the context-independent variant linear interpolation, the values the not depend the particular trigram. the suffix strong predictor for word classes, words the wall street journal part the penn treebank ending able are adjectives the cases fashionable, variable the rest are nouns cable, variable the probability distribution for particular suffix generated from all words the training set that share the same suffix some predefined maximum lh. training the number tokens used for training. exploit the fac

In [32]:
allsum4 = []
for text in data['cleaned_text']:
    raw_data = text
    ps = PorterStemmer()
    nltk_stop_words = set(stopwords.words('english'))
    cluster_count = 3
    min_sentence_length = 30
    sentences = sent_tokenize(raw_data)
    processedSentences = sentences
    
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(processedSentences)


    kMeansCluster = KMeans(n_clusters=cluster_count)
    kMeansCluster.fit(tfidf_matrix)
    clusters = kMeansCluster.labels_.tolist()


    sentenceDictionary = {}
    for idx, sentence in enumerate(sentences):
        sentenceDictionary[idx] = {}
        sentenceDictionary[idx]['text'] = sentence
        sentenceDictionary[idx]['cluster'] = clusters[idx]
        sentenceDictionary[idx]['stemmed'] = processedSentences[idx]
        
    clusterDictionary = {}
    for key, sentence in sentenceDictionary.items():
        if sentence['cluster'] not in clusterDictionary:
            clusterDictionary[sentence['cluster']] = []
        clusterDictionary[sentence['cluster']].append(sentence['stemmed'])
        sentence['idx'] = len(clusterDictionary[sentence['cluster']]) - 1
        
    maxCosineScores = {}
    for key, clusterSentences in clusterDictionary.items():
        maxCosineScores[key] = {}
        maxCosineScores[key]['score'] = 0
        tfidf_matrix = vectorizer.fit_transform(clusterSentences)
        cos_sim_matrix = cosine_similarity(tfidf_matrix)
        for idx, row in enumerate(cos_sim_matrix):
            sum = 0
            for col in row:
                sum += col
            if sum > maxCosineScores[key]['score']:
                maxCosineScores[key]['score'] = sum
                maxCosineScores[key]['idx'] = idx

    resultIndices = []
    i = 0
    for key, value in maxCosineScores.items():
        cluster = key
        idx = value['idx']
        stemmedSentence = clusterDictionary[cluster][idx]
        for key, value in sentenceDictionary.items():
            if value['cluster'] == cluster and value['idx'] == idx:
                resultIndices.append(key)

    resultIndices.sort()
    
    result = ''
    for idx in resultIndices:
        result += sentences[idx] + ' '
    allsum4.append(result)

In [33]:
data["result"] = allsum4

In [34]:
from rouge_score import rouge_scorer

reference = data['cleaned_summary']
hypotheses = data["result"]

hypotheses = str(hypotheses)
reference = str(reference)

scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(hypotheses,
                      reference)
print(hypotheses)
print("____________________")
print(reference)
print(scores)

0      first step, use the maximum likelihood probabi...
1      section provide experimental evaluation the no...
2      one should not conclude from these results tha...
3      given input sentence and target word frame, th...
4      sult, the categorial lexicon extracted for thi...
                             ...                        
196    explicitly representing these two steps indepe...
197    limitations co-training for natural language l...
198    while this result encouraging, seems that the ...
199    known that holds and only the function satisfi...
200    reordering approaches havven significant impro...
Name: result, Length: 201, dtype: object
____________________
0      tnt statistical part-of-speech tagger trigrams...
1      mildly non-projective dependency structures sy...
2      using corpus statistics wordnet relations sens...
3      automatic labeling semantic roles present syst...
4      generative models statistical parsing combinat...
                          