In [1]:
import os
import numpy
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
import pickle

Les fonctions sont grandement inspire (et ameliore !) du site web ci dessous
https://machinelearningmastery.com/prepare-news-articles-text-summarization/

In [2]:
# load doc into memory
def load_doc(filepath):
    with open(filepath,'rb') as f:
        data = f.read().decode("utf-8")
    return data

In [3]:
#Split the story and highlights
def split_story(doc):
    # find first highlight
    index = doc.find('@highlight')
    # split into story and highlights
    story, highlights = doc[:index], doc[index:].split('@highlight')
    # strip extra white space around each highlight
    highlights = [h.strip() for h in highlights if len(h) > 0]
    return story, highlights

In [4]:
#Punctuation cleaning,
def punctation_cleaning(article):
    toreturn = article.translate(str.maketrans(dict([(x, None) for x in string.punctuation])))
    return toreturn
punctation_cleaning('ab,cd()!')

'abcd'

In [5]:
#Number  cleaning,
def number_cleaning(article):
    toreturn = article.translate(str.maketrans(dict([(str(x), None) for x in range(10)])))
    return toreturn
number_cleaning('ab1 2cd34e5')

'ab cde'

In [6]:
#word stemming
stemmer = SnowballStemmer("english")
def wordstemmer(word):
    return stemmer.stem(word)

wordstemmer('bones')

'bone'

In [7]:
#Stopwords suppression
#stopwords = stopwords.words('english')
def clean_stopwords(article):
    data = article.split(' ')
    toreturn = [word for word in data if word not in stopwords.words('english')]
    return ' '.join(toreturn)

clean_stopwords(''' It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria.''')

" It's official: U.S. President Barack Obama wants lawmakers weigh whether use military force Syria."

In [8]:
#Token separation, no capital letters
tokenizer = RegexpTokenizer(r'\w+')
def tokenizer_article(article):
    toreturn = tokenizer.tokenize(article.lower())
    toreturn = [wordstemmer(w) for w in toreturn]
    return ' '.join(toreturn)
tokenizer_article(''' It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria.''')

'it s offici u s presid barack obama want lawmak to weigh in on whether to use militari forc in syria'

In [9]:
#List defining the stories to analyze
toexplore =  os.path.join(os.getcwd(), 'cnn', 'stories')
storylist = [x for x in os.listdir(toexplore) if x.endswith('story')]
storylist[:5]

['0001d1afc246a7964130f43ae940af6bc6c57f01.story',
 '0002095e55fcbd3a2f366d9bf92a95433dc305ef.story',
 '00027e965c8264c35cc1bc55556db388da82b07f.story',
 '0002c17436637c4fe1837c935c04de47adb18e9a.story',
 '0003ad6ef0c37534f80b55b4235108024b407f0b.story']

In [10]:
#le calcul est uniquement fait sur les 500 premieres stories, sinon mon pc n'a pas
#Suffisament de memoire

#Dataset creation and treatment
article_dict = {}
higlight_dict = {}
counter =  0
for filename in storylist[:5000]: #dataset limitation with x first stories
    counter += 1
    filepath = os.path.join(toexplore, filename)
    article = load_doc(filepath)
    story, highlights = split_story(article)
    
    story = punctation_cleaning(story)
    story = clean_stopwords(story)
    story = number_cleaning(story)
    story = tokenizer_article(story)
    article_dict[filename] = story
    
    highlights = ' '.join(highlights)
    highlights = punctation_cleaning(highlights)
    #highlights = clean_stopwords(highlights)
    highlights = number_cleaning(highlights)
    highlights = tokenizer_article(highlights)
    higlight_dict[filename] = highlights
    
    if counter%1000 == 0:
        print('{} / {} done !'.format(counter, len(storylist)))
        #print(story)
        print(highlights)
print('Completed')


1000 / 92579 done !
a backlash pour in against mandela posit eulog detractor accus him of be a communist terrorist racist there is some truth to some of the claim mandela had a close associ with communist and cofound a milit group
2000 / 92579 done !
the white hous creat the presid council on job and competit jeffrey immelt the ceo and chairman of general electr will lead the group the us unemploy rate linger between and
3000 / 92579 done !
egypt govern has been in turmoil sinc a string of disorgan strike has rippl through the economi analyst expect armi chief abdelfattah elsisi to run for presid he would have to resign his militari post to do so
4000 / 92579 done !
fire affect acr in north washo valley at least peopl have been evacu offici say there has been no contain of the fire offici say
5000 / 92579 done !
sufi shrine and western entiti have been target in libya militia group are tri to maintain law and order the govern use milit as hire gun embolden them an analyst say
Completed

In [11]:
#TF-IDF score calculation
tokenize = RegexpTokenizer(r'\w+')
tfidfarticle = TfidfVectorizer()
tfidf_article_results = tfidfarticle.fit_transform(article_dict.values())

tfidfhighlight = TfidfVectorizer()
tfidf_article_highlights = tfidfhighlight.fit_transform(article_dict.values())

In [12]:
#tfidf results
tfidf_article_results.toarray()#.shape


array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [13]:
#feature names for articles
tfidfarticle.get_feature_names()[:10]

['aa',
 'aaa',
 'aaaaaaaah',
 'aadvantag',
 'aaj',
 'aajtv',
 'aal',
 'aalesund',
 'aaliyah',
 'aamer']

In [15]:
#data set saving
dataset = {'article_dict':article_dict, 'higlight_dict':higlight_dict,
           'tfidfarticle':tfidfarticle, 'tfidfhighlight':tfidfhighlight,
           'tfidf_article_results':tfidf_article_results, 'tfidf_article_highlights':tfidf_article_highlights}
with open('dataset.pkl', 'wb') as f:
    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)

In [16]:
#Data set loading
def load_obj(name):
    with open('dataset.pkl', 'rb') as f:
        return pickle.load(f)
dataset = load_obj('dataset')

article_dict = dataset['article_dict']
higlight_dict = dataset['higlight_dict']

tfidfarticle = dataset['tfidfarticle']
tfidfhighlight = dataset['tfidfhighlight']

tfidf_article_results = dataset['tfidf_article_results']
tfidf_article_highlights = dataset['tfidf_article_highlights']

In [17]:
#display highlight
higlight_dict['0001d1afc246a7964130f43ae940af6bc6c57f01.story']

'syrian offici obama climb to the top of the tree doesnt know how to get down obama send a letter to the head of the hous and senat obama to seek congression approv on militari action against syria aim is to determin whether cw were use not by whom say un spokesman'

In [18]:
#display article
article_dict['0001d1afc246a7964130f43ae940af6bc6c57f01.story']

'it offici us presid barack obama want lawmak weigh whether use militari forc syria obama sent letter head hous senat saturday night hour announc believ militari action syrian target right step take alleg use chemic weapon the propos legisl obama ask congress approv use militari forc deter disrupt prevent degrad potenti futur use chemic weapon weapon mass destruct it step set turn intern crisi fierc domest polit battl there key question loom debat what un weapon inspector find syria what happen congress vote and syrian govern react in televis address white hous rose garden earlier saturday presid said would take case congress want to while i believ i author carri militari action without specif congression author i know countri stronger take cours action even effect said we debat issu big busi usual obama said top congression leader agre schedul debat bodi return washington septemb the senat foreign relat committe hold hear matter tuesday sen robert menendez said transcript read obama f