In [1]:
!pip install -q nltk numpy pandas networkx

In [2]:
import pandas as pd
import nltk
import numpy as np
import networkx as nx
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string
import re

In [3]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
url = "https://github.com/devtlv/Datasets-GEN-AI-Bootcamp/raw/refs/heads/main/Week%207/W7D5/tennis_articles.zip"
import pandas as pd

url = "https://github.com/devtlv/Datasets-GEN-AI-Bootcamp/raw/refs/heads/main/Week%207/W7D5/tennis_articles.zip"
# Explicitly specify the encoding as 'latin-1' or 'Windows-1252'
df = pd.read_csv(url, encoding='latin-1') # or try encoding='Windows-1252' if 'latin-1' doesn't work

In [6]:
print(df.head())

   article_id                                      article_title  \
0           1  I do not have friends in tennis, says Maria Sh...   
1           2  Federer defeats Medvedev to advance to 14th Sw...   
2           3  Tennis: Roger Federer ignored deadline set by ...   
3           4  Nishikori to face off against Anderson in Vien...   
4           5  Roger Federer has made this huge change to ten...   

                                        article_text  \
0  Maria Sharapova has basically no friends as te...   
1  BASEL, Switzerland (AP)  Roger Federer advanc...   
2  Roger Federer has revealed that organisers of ...   
3  Kei Nishikori will try to end his long losing ...   
4  Federer, 37, first broke through on tour over ...   

                                              source  
0  https://www.tennisworldusa.org/tennis/news/Mar...  
1  http://www.tennis.com/pro-game/2018/10/copil-s...  
2  https://scroll.in/field/899938/tennis-roger-fe...  
3  http://www.tennis.com/pro-game/

In [7]:
df.drop(columns=["article_title"], inplace=True)

In [8]:
print(f"Nombre d'articles : {len(df)}")

Nombre d'articles : 8


In [9]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Suppression des chiffres
    text = text.translate(str.maketrans("", "", string.punctuation))  # Suppression de la ponctuation
    words = text.split()
    words = [word for word in words if word not in stopwords.words("english")]  # Suppression des stop words
    return " ".join(words)


In [11]:
import nltk
nltk.download('punkt_tab') # Download the punkt_tab data

sentences = []
for article in df["article_text"]:
    sentences.extend(sent_tokenize(article))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [12]:
clean_sentences = [clean_text(sentence) for sentence in sentences]


In [13]:
print(f"Nombre de phrases : {len(clean_sentences)}")
print(clean_sentences[:5])

Nombre de phrases : 130
['maria sharapova basically friends tennis players wta tour', 'russian player problems openly speaking recent interview said dont really hide feelings much', 'think everyone knows job', 'im courts im court playing im competitor want beat every single person whether theyre locker room across net', 'im one strike conversation weather know next minutes go try win tennis match']


In [14]:
# Téléchargement et chargement des embeddings GloVe
!wget -q https://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip
glove_path = "glove.6B.100d.txt"

# Création d'un dictionnaire de mots et leurs vecteurs
embeddings_index = {}
with open(glove_path, "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = vector

print(f"Nombre de mots chargés : {len(embeddings_index)}")

# Fonction pour vectoriser une phrase en moyenne des embeddings de ses mots
def sentence_vector(sentence):
    words = sentence.split()
    word_vectors = [embeddings_index[word] for word in words if word in embeddings_index]
    return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(100)

# Vectorisation des phrases
sentence_vectors = np.array([sentence_vector(sentence) for sentence in clean_sentences])

# Vérification
print(f"Taille de la matrice des phrases : {sentence_vectors.shape}")


Nombre de mots chargés : 400000
Taille de la matrice des phrases : (130, 100)


In [15]:
from sklearn.metrics.pairwise import cosine_similarity


similarity_matrix = cosine_similarity(sentence_vectors)


graph = nx.from_numpy_array(similarity_matrix)


print(f"Nombre de nœuds : {graph.number_of_nodes()}, Nombre d'arêtes : {graph.number_of_edges()}")


Nombre de nœuds : 130, Nombre d'arêtes : 8385


In [16]:

scores = nx.pagerank(graph)

# Classement des phrases selon leur score
ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

# Sélection des 5 phrases les plus importantes
summary = " ".join([ranked_sentences[i][1] for i in range(5)])


print("Résumé généré :")
print(summary)


Résumé généré :
I was on a nice trajectorythen, Reid recalled.If I hadnt got sick, I think I could have started pushing towards the second week at the slams and then who knows. Duringa comeback attempt some five years later, Reid added Bernard Tomic and 2018 US Open Federer slayer John Millman to his list of career scalps. Major players feel that a big event in late November combined with one in January before the Australian Open will mean too much tennis and too little rest. So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I just felt like it really kind of changed where people were a little bit, definitely in the '90s, a lot more quiet, into themselves, and then it started to become better. Meanwhile, Federer is hoping he can improve his service game as he hunts his ninth Swiss Indoors title this week. I felt like the best weeks that I had to get to know players when I was playin