### Activité: Effectuez un nettoyage et une analyse exploratoire des données texte

In [2]:
import os
import string

In [3]:
import nltk
#nltk.download('all')

#### Décompression des archives cnn.tgz et cnn_stories.tgz (pas obligatoire à partir du notebook)

In [2]:
from shutil import unpack_archive
# Raw strings keep the Windows backslashes literal; the previous literal relied on
# "\D" being an unrecognized escape sequence, which newer Python versions warn about.
unpack_archive(r'C:\Formation\Data scientist\Projet_6\cnn.tgz', r'C:\Formation\Data scientist\Projet_6')

In [None]:
from shutil import unpack_archive
# Raw strings keep the Windows backslashes literal; the previous literal relied on
# "\D" being an unrecognized escape sequence, which newer Python versions warn about.
unpack_archive(r'C:\Formation\Data scientist\Projet_6\cnn_stories.tgz', r'C:\Formation\Data scientist\Projet_6')

### Fonctions

#### Chargement en mémoire du contenu d'un fichier 

In [6]:
# load doc into memory
def load_doc(filename):
    """Read a whole UTF-8 text file and return its content as a single string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full decoded text of the file.
    """
    # The context manager closes the handle even when read() raises
    # (for instance on a decoding error), which a manual close() would skip.
    with open(filename, encoding='utf-8') as file:
        return file.read()

#### Chargement de tous les fichiers d'un répertoire et enregistrement de leur contenu dans un dictionnaire {highlight: story}

In [7]:
# load all stories in a directory
def load_stories(directory):
    """Load every file of a directory into a {highlights: story} dictionary.

    Args:
        directory: Path of the folder that holds the story files.

    Returns:
        dict mapping the highlights string of each document to its story
        text, both as returned by split_story(). Documents that yield the
        same highlights string overwrite one another.
    """
    stories = dict()
    for name in os.listdir(directory):
        filename = os.path.join(directory, name)
        # Sub-directories and other non-regular entries cannot be read as
        # documents; skip them instead of failing inside load_doc().
        if not os.path.isfile(filename):
            continue
        # load the document, then split it into story and highlights
        story, highlights = split_story(load_doc(filename))
        stories[highlights] = story
    return stories

#### Split a document into news story and highlights

In [8]:
def split_story(doc):
    """Split a raw document into its story text and its highlights.

    Everything before the first '@highlight' marker is the story (returned
    untouched, including any trailing whitespace). Each segment after a
    marker is stripped, and the non-empty segments are joined with single
    spaces into one highlights string.

    Args:
        doc: Full text of a document.

    Returns:
        A (story, highlights) tuple of strings. When the document holds no
        '@highlight' marker, the whole document is the story and the
        highlights string is empty.
    """
    marker = '@highlight'
    story, found, tail = doc.partition(marker)
    if not found:
        # Without this guard, find() == -1 would chop the last character off
        # the story and report that character as the highlights.
        return doc, ''
    # Strip before filtering so whitespace-only segments do not leave empty
    # items (and therefore doubled spaces) in the joined result.
    pieces = (piece.strip() for piece in tail.split(marker))
    highlights = ' '.join(piece for piece in pieces if piece)
    return story, highlights

#### Data cleaning: remove punctuation and, when present, everything up to and including the '(CNN) -- ' marker

In [9]:
# clean the punctuation from a text with a string representation
def clean_punkt(text):
    """Drop the CNN dateline prefix, if any, and remove all punctuation.

    When the marker '(CNN) -- ' occurs in the text, everything up to and
    including its first occurrence is discarded. Every character listed in
    string.punctuation is then deleted (not replaced by a space).

    Args:
        text: Text to clean.

    Returns:
        The cleaned text.
    """
    marker = '(CNN) -- '
    index = text.find(marker)
    if index > -1:
        # Skip the complete marker that was searched for; skipping only
        # '(CNN)' left ' -- ' behind, i.e. two stray leading spaces once the
        # dashes were removed below.
        text = text[index + len(marker):]
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

#### Tokenize function for the TfidfVectorizer instance

In [10]:
def tokenize(text):
    """Return the word tokens produced by nltk.word_tokenize for `text` (no stemming is applied)."""
    return nltk.word_tokenize(text)

### Activité: Préparation des données pour entrainement d'un modèle (nettoyage + analyse)

#### Chargement des articles CNN (stories + highlights) dans un dictionnaire {highlight: story}

In [13]:
# Raw string: the previous literal only worked because "\F", "\D", "\P", "\T" and
# "\s" are unrecognized escape sequences, which newer Python versions warn about.
stories_dir = r'C:\Formation\Data scientist\Projet_6\TP\cnn\stories\sample'
stories = load_stories(stories_dir)
print('Loaded CNN Stories %d' % len(stories))

Loaded CNN Stories 3


#### 1er nettoyage au niveau de la ponctuation (utilisation de string.punctuation)

In [14]:
# Apply the punctuation cleaning to both sides of every (highlight, story) pair.
stories_punkt_cleaned = {}
for highlight, story in stories.items():
    cleaned_highlight = clean_punkt(highlight)
    stories_punkt_cleaned[cleaned_highlight] = clean_punkt(story)
stories_punkt_cleaned.items()

dict_items([('The 15 new cardinals will be installed on February 14 They come from countries such as Myanmar and Tonga No Americans made the list this time or the previous time in Francis papacy', 'CNNFor the second time during his papacy Pope Francis has announced a new group of bishops and archbishops set to become cardinals  and they come from all over the world\n\nPope Francis said Sunday that he would hold a meeting of cardinals on February 14 during which I will name 15 new Cardinals who coming from 13 countries from every continent manifest the indissoluble links between the Church of Rome and the particular Churches present in the world according to Vatican Radio\n\nNew cardinals are always important because they set the tone in the church and also elect the next pope CNN Senior Vatican Analyst John L Allen said They are sometimes referred to as the princes of the Catholic Church\n\nThe new cardinals come from countries such as Ethiopia New Zealand and Myanmar\n\nThis is a pope

In [15]:
# Trial of a CountVectorizer instance (left commented out, so nothing runs here)
#from sklearn.feature_extraction.text import CountVectorizer
#count_vect = CountVectorizer()
#test_counts = count_vect.fit_transform([text])
#test_counts.data

#### Préparation des stop words à exclure du vocabulaire pour le TfidfVectorizer

In [16]:
# Stop words to exclude from the vocabulary: NLTK's English list, as a set.
sw = set(nltk.corpus.stopwords.words('english'))

#### Calcul des fréquences et tf-idf des deux types de document (highlight+story)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# scikit-learn documents stop_words as 'english', a list or None, and recent
# releases validate the type, so hand over a (sorted, deterministic) list
# rather than the set itself.
stop_word_list = sorted(sw)

# Dedicated vectorizer for the highlights: refitting one shared instance on the
# stories would discard the highlight vocabulary and leave the columns of
# tfidf_highlight_doc without matching feature names.
tfidf_highlight = TfidfVectorizer(tokenizer=tokenize, stop_words=stop_word_list, lowercase=True)
tfidf_highlight_doc = tfidf_highlight.fit_transform(stories_punkt_cleaned.keys())
print('Shape - tfidf_highlight_doc:', tfidf_highlight_doc.shape)

# `tfidf` is still the vectorizer fitted on the stories, exactly as before.
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=stop_word_list, lowercase=True)
tfidf_stories_doc = tfidf.fit_transform(stories_punkt_cleaned.values())
print('Shape - tfidf_stories_doc:', tfidf_stories_doc.shape)

Shape - tfidf_highlight_doc: (3, 79)
Shape - tfidf_stories_doc: (3, 618)


In [22]:
# Display the tf-idf matrix obtained for the highlight documents as a dense array
tfidf_highlight_doc.toarray()

array([[ 0.        ,  0.22847492,  0.17376106,  0.        ,  0.        ,
         0.        ,  0.22847492,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.22847492,  0.        ,  0.        ,  0.        ,  0.22847492,
         0.        ,  0.22847492,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.22847492,  0.22847492,
         0.        ,  0.        ,  0.        ,  0.        ,  0.22847492,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.22847492,  0.        ,  0.        ,  0.22847492,  0.        ,
         0.        ,  0.        ,  0.        ,  0.22847492,  0.        ,
         0.        ,  0.        ,  0.17376106,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.22847492,  0.        ,
         0.        ,  0.        ,  0.        ,  0.22847492,  0.        ,
         0.        ,  0.        ,  0.        ,  0. 

#### Visualisation du vocabulaire retenu par l'algorithme TfidfVectorizer

In [19]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()
feature_names
#tfidf.vocabulary_

{'10': 0,
 '1000': 1,
 '100000': 2,
 '13': 3,
 '14': 4,
 '15': 5,
 '15month': 6,
 '15th': 7,
 '1600': 8,
 '17': 9,
 '19': 10,
 '1920s': 11,
 '1960s': 12,
 '1994': 13,
 '20': 14,
 '200': 15,
 '20000': 16,
 '2001': 17,
 '2002': 18,
 '2006': 19,
 '25000': 20,
 '29yearold': 21,
 '300': 22,
 '373': 23,
 '4600': 24,
 '4yearold': 25,
 '540': 26,
 '600': 27,
 '7400': 28,
 '900': 29,
 'accept': 30,
 'accompanied': 31,
 'according': 32,
 'across': 33,
 'act': 34,
 'action': 35,
 'activities': 36,
 'actual': 37,
 'added': 38,
 'adding': 39,
 'addition': 40,
 'administration': 41,
 'administrator': 42,
 'adult': 43,
 'afghanistan': 44,
 'africa': 45,
 'age': 46,
 'agree': 47,
 'agreed': 48,
 'agreement': 49,
 'aid': 50,
 'albania': 51,
 'algeria': 52,
 'alleges': 53,
 'allen': 54,
 'alone': 55,
 'already': 56,
 'also': 57,
 'alternative': 58,
 'always': 59,
 'american': 60,
 'americans': 61,
 'among': 62,
 'analyst': 63,
 'animal': 64,
 'animals': 65,
 'announced': 66,
 'another': 67,
 'anyone': 6

#### Amélioration: supprimer les tokens contenant des caractères numériques, inclure les highlights dans les stories et renumériser les documents avec un TfidfVectorizer