# Network Analysis for Information Retrieval - Part 1

Elyes KHALFALLAH & Mohammed ali EL ADLOUNI

16/03/2025

---

---


## 0. Étapes préliminaires


In [None]:
# Useful libraries
import nltk
import re

import numpy as np
import pandas as pd
from fonctions import *

# Fetch the NLTK resources this notebook relies on: the stop-word lists and
# WordNet (used by the WordNet lemmatizer further down).
# quiet=True keeps the download log out of the cell output.
for resource in ("stopwords", "wordnet"):
    nltk.download(resource, quiet=True)


In [None]:
# Load the tab-separated corpus file into a DataFrame
data = pd.read_csv("data_project.csv", delimiter="\t")

# Preview the first 5 rows
data.head(5)


In [None]:
# (number of rows, number of columns) of the raw data
data.shape


In [None]:
# Distinct values present in the 'class' column
data["class"].unique()


---

## 1. Prétraitement des données

### 1.1. Générer le texte sur lequel nous travaillerons


In [None]:
# Replace missing titles/abstracts with empty strings so that the string
# concatenation below never produces NaN.
data_text = data.fillna({"abstract": "", "title": ""})

# 'text' = leading space + title + space + abstract
data_text["text"] = " " + data_text["title"].str.cat(data_text["abstract"], sep=" ")

# Put 'text' first; the remaining columns keep their original order
cols = ["text", *(name for name in data_text.columns if name != "text")]
data_text = data_text[cols]

# Preview the first 5 rows of the working frame
data_text.head()


In [None]:
# Display the raw 'text' of the first document (row label 0)
print(data_text.loc[0, "text"])


### 1.2. Prétraitements poussés


Now, we want to build an index for the data. To do so, we'll :

1. Remove punctuation
2. Lowercase everything
3. Remove useless spaces
4. Remove stop-words
5. Normalize the data (lemmatization)
6. Remove outliers


#### Punctuation removal :


In [None]:
# Replace every character that is neither a word character nor whitespace
# with a space. Note: "_" is a word character for the regex engine, so
# underscores are kept.
data_text["text"] = data_text["text"].map(lambda doc: re.sub(r"[^\w\s]", " ", doc))

# First document after punctuation removal
print(data_text.loc[0, "text"])


#### Lowercase :


In [None]:
# Lowercase every document so that tokens are matched case-insensitively
data_text["text"] = data_text["text"].map(str.lower)

# First document after lowercasing
print(data_text.loc[0, "text"])


#### Spaces :


In [None]:
# Collapse each run of whitespace into a single space, then trim both ends
data_text["text"] = data_text["text"].map(
    lambda doc: re.sub(r"\s+", " ", doc).strip()
)

# First document after whitespace normalisation
print(data_text.loc[0, "text"])


Let's see what we have so far.


In [None]:
# Plot the most common words in the 'text' column at this stage
# (punctuation removed, lowercased, whitespace normalised).
# word_occurrences is not defined in this notebook — presumably it comes from
# `fonctions`; its return value is discarded here.
_ = word_occurrences(data_text, visualisation=True)


#### Stop-words :


In [None]:
# Drop English stop words (NLTK list) from every document
from nltk.corpus import stopwords

# A set gives constant-time membership tests inside the filter below
stop_words = set(stopwords.words("english"))
data_text["text"] = data_text["text"].map(
    lambda doc: " ".join(token for token in doc.split() if token not in stop_words)
)


In [None]:
# Plot the most common words again, now that stop words have been removed
_ = word_occurrences(data_text, visualisation=True)


#### Normalize data (lemmatization) :


In [None]:
# Lemmatization with NLTK's WordNet lemmatizer
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# First document before lemmatization, kept for comparison
print("Before :\n\t", data_text.loc[0, "text"])

# Lemmatize token by token and rebuild each document as a space-joined string
data_text["text"] = data_text["text"].map(
    lambda doc: " ".join(lemmatizer.lemmatize(token) for token in doc.split())
)

# Same document after lemmatization
print("After :\n\t", data_text.loc[0, "text"])


In [None]:
# Plot the most common words after lemmatization
_ = word_occurrences(data_text, visualisation=True)


#### Remove outliers :


In [None]:
# Boxplot of per-word occurrence counts, to spot outliers before filtering
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(list(word_occurrences(data_text).values()), palette="rainbow_r", ax=ax)
ax.set_xlabel("Word Counts")
ax.set_title("Boxplot of Word Counts")
plt.show()


In [None]:
# Occurrence count of every word in the corpus, before outlier removal
# (used with .items()/.values() below, so it behaves like a dict).
occurrences = word_occurrences(data_text)
occurrence_counts = pd.Series(occurrences)

# Keep only words whose count lies between the 82.5th and 97.5th percentiles
# of the count distribution. (An earlier 0.955 upper bound was immediately
# overwritten by these values and has been removed as dead code.)
min_occurrences = occurrence_counts.quantile(0.825)
max_occurrences = occurrence_counts.quantile(0.975)

print("Min occurrences:", min_occurrences)
print("Max occurrences:", max_occurrences)

# Words occurring strictly more often than the upper bound
high_outliers = {
    word: count for word, count in occurrences.items() if count > max_occurrences
}

# Words occurring strictly less often than the lower bound
low_outliers = {
    word: count for word, count in occurrences.items() if count < min_occurrences
}

# Drop both kinds of outliers from every document.
# NOTE: this cell is not idempotent — it reads and rewrites data_text["text"],
# so re-running it recomputes the quantiles on already-filtered text.
data_text["text"] = data_text["text"].apply(
    lambda doc: " ".join(
        word
        for word in doc.split()
        if word not in high_outliers and word not in low_outliers
    )
)


In [None]:
# Vocabulary size before vs. after outlier removal
before = len(occurrences)
after = len(word_occurrences(data_text))
difference = before - after

for label, value in (
    ("Number of words before removing outliers :", before),
    ("Number of words after removing outliers  :", after),
    ("Difference (amount of words removed)     :", difference),
):
    print(label, value)


In [None]:
# Same boxplot as before, drawn on the filtered corpus to check what the
# occurrence-count distribution looks like once outliers are removed
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(list(word_occurrences(data_text).values()), palette="rainbow_r", ax=ax)
ax.set_xlabel("Word Counts")
ax.set_title("Boxplot of Word Counts")
plt.show()


In [None]:
# Word counts of the fully cleaned corpus (also plotted via visualisation=True)
clean_word_counts = word_occurrences(data_text, visualisation=True)


In [None]:
# Persist the preprocessed frame as CSV; index=False leaves the row index
# out of the file.
data_text.to_csv("data_text.csv", index=False)


---

## 2. Mise en place du moteur de recherche


In [None]:
# Build the Documents x Terms matrices, once with plain term-frequency
# weighting and once with the TF-IDF weighting scheme.
from sklearn.feature_extraction.text import TfidfVectorizer

# TF only: IDF re-weighting disabled
tf_vectorizer = TfidfVectorizer(use_idf=False)
X_tf = tf_vectorizer.fit_transform(data_text["text"])

# TF-IDF with the vectorizer's default settings
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(data_text["text"])

# (number of documents, vocabulary size)
X_tfidf.shape


In [None]:
# Query terms; they are joined into one pseudo-document and projected into
# the same vector spaces as the corpus.
requete = ["representation", "learning", "for", "natural", "language", "processing"]
query_text = " ".join(requete)

pseudo_document_tf = tf_vectorizer.transform([query_text])
pseudo_document_tfidf = tfidf_vectorizer.transform([query_text])

# Cosine similarity between every document and the pseudo-document
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

similarities_tf = cosine_similarity(X_tf, pseudo_document_tf)
similarities_tfidf = cosine_similarity(X_tfidf, pseudo_document_tfidf)

# Indices of the 5 most similar documents (highest similarity first)
top5_tf = np.argsort(similarities_tf.flatten())[::-1][:5]
top5_tfidf = np.argsort(similarities_tfidf.flatten())[::-1][:5]

# Columns hidden from the printed result tables
hidden_columns = ["text", "abstract", "references"]

print(f"TF    : {top5_tf} | {similarities_tf[top5_tf].flatten()}")
print(data_text.iloc[top5_tf].drop(columns=hidden_columns), "\n\n\n\n")

print(f"TFIDF : {top5_tfidf} | {similarities_tfidf[top5_tfidf].flatten()}")
print(data_text.iloc[top5_tfidf].drop(columns=hidden_columns))


In [None]:
# NOTE(review): this cell is an exact duplicate of the previous cell and
# recomputes the same values; it can be deleted without affecting later cells.

# Query terms, joined into one pseudo-document below
requete = ["representation", "learning", "for", "natural", "language", "processing"]

pseudo_document_tf = tf_vectorizer.transform([" ".join(requete)])
pseudo_document_tfidf = tfidf_vectorizer.transform([" ".join(requete)])

# Compute the cosine similarities between the pseudo-document and the documents
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

similarities_tf = cosine_similarity(X_tf, pseudo_document_tf)
similarities_tfidf = cosine_similarity(X_tfidf, pseudo_document_tfidf)

# Indices of the 5 most similar documents (highest similarity first)
top5_tf = np.argsort(similarities_tf.flatten())[::-1][:5]
top5_tfidf = np.argsort(similarities_tfidf.flatten())[::-1][:5]

print(f"TF    : {top5_tf} | {similarities_tf[top5_tf].flatten()}")
print(
    data_text.iloc[top5_tf].drop(columns=["text", "abstract", "references"]), "\n\n\n\n"
)

print(f"TFIDF : {top5_tfidf} | {similarities_tfidf[top5_tfidf].flatten()}")
print(data_text.iloc[top5_tfidf].drop(columns=["text", "abstract", "references"]))


# Alternative output (titles only), currently disabled:
# print(data_text.iloc[top5_tf]["title"], '\n\n\n\n')
# print(data_text.iloc[top5_tfidf]["title"])


In [None]:
# Same retrieval as the cosine-similarity cell, but ranked by Euclidean
# distance between each document vector and the query pseudo-document.
# NOTE: the variable names are kept for compatibility, but similarities_*
# hold DISTANCES from here on (smaller value = closer document).
similarities_tf = euclidean_distances(X_tf, pseudo_document_tf)
similarities_tfidf = euclidean_distances(X_tfidf, pseudo_document_tfidf)

# Indices of the 5 most similar documents. For a distance the best matches
# are the SMALLEST values, so the argsort is taken in ascending order
# (the previous descending order returned the 5 farthest documents).
top5_tf = np.argsort(similarities_tf.flatten())[:5]
top5_tfidf = np.argsort(similarities_tfidf.flatten())[:5]

print(f"TF    : {top5_tf} | {similarities_tf[top5_tf].flatten()}")
print(
    data_text.iloc[top5_tf].drop(columns=["text", "abstract", "references"]), "\n\n\n\n"
)

print(f"TFIDF : {top5_tfidf} | {similarities_tfidf[top5_tfidf].flatten()}")
print(data_text.iloc[top5_tfidf].drop(columns=["text", "abstract", "references"]))


Talk about:

- TF requiring stop words to be removed, and TF-IDF not requiring stop words to be removed
- Euclidean distances don't function well in higher dimensions (if you compute TF and TF-IDF scores with Euclidean distances, you'll get the exact same score every time (curse of high dimensionality))
- Don't remove stop words?


In [None]:
# Persist both document-term matrices next to the notebook.
# NOTE(review): X_tf / X_tfidf come from TfidfVectorizer.fit_transform, which
# returns SciPy sparse matrices; np.save then presumably stores them as pickled
# object arrays (reload needs allow_pickle=True). scipy.sparse.save_npz would
# be the native format — TODO confirm with whatever consumes these files.
for path, matrix in (("X_tf.npy", X_tf), ("X_tfidf.npy", X_tfidf)):
    np.save(path, matrix)


In [None]:
# Shape of the term-frequency matrix
X_tf.shape
