In [None]:
import string
from collections import Counter

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK data packages this notebook relies on: 'stopwords' is read
# through stopwords.words('english') and 'wordnet' is the data the
# WordNetLemmatizer works from.
nltk.download('stopwords')
nltk.download('wordnet')

## Βήμα 1: Συλλογή Δεδομένων
Τα δεδομένα που χρησιμοποιήθηκαν για την υλοποίηση της εργασίας βρίσκονται στον εξής σύνδεσμο: https://www.kaggle.com/datasets/sameersmahajan/people-wikipedia-data

In [None]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='people_wiki.csv')

# Module-level resources shared by the preprocessing helpers: a lemmatizer
# instance and the English stop words held in a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

## Βήμα 2: Προεπεξεργασία κειμένου (Text Processing)

In [None]:
def tokenize(text):
    """Split ``text`` into tokens on runs of whitespace.

    Args:
        text: The string to split.

    Returns:
        list[str]: The whitespace-separated pieces in their original order;
        an empty list when ``text`` is empty or contains only whitespace.
    """
    tokens = text.split()
    return tokens

def remove_punctuation(words):
    """Strip surrounding punctuation from tokens and drop punctuation-only tokens.

    Leading and trailing characters found in ``string.punctuation`` are
    removed from every token; punctuation inside a token (e.g. the apostrophe
    in "don't") is kept. Tokens that end up empty are discarded.

    Args:
        words: Iterable of string tokens.

    Returns:
        list[str]: The cleaned tokens, in their original order.
    """
    cleaned_words = []
    for word in words:
        # ``word in string.punctuation`` would be a substring test against the
        # whole punctuation string, so it misses tokens such as "hello," or
        # "--". Stripping per character handles those cases.
        stripped = word.strip(string.punctuation)
        if stripped:
            cleaned_words.append(stripped)
    return cleaned_words

def filter_stop_words(words):
    """Drop stop words from a token sequence.

    Membership is tested against the module-level ``stop_words`` set.

    Args:
        words: Iterable of string tokens.

    Returns:
        list[str]: The tokens that are not stop words, in their original
        order and with their original casing.
    """
    # NLTK's English stop-word list is lowercase, so compare the lowercased
    # token; otherwise capitalised forms such as "The" would slip through.
    return [word for word in words if word.lower() not in stop_words]

def lemmatize_words(words):
    """Lemmatize every token in ``words``.

    Each token is passed to the module-level ``lemmatizer``; no
    part-of-speech tag is supplied, so the lemmatizer's default applies.

    Args:
        words: Iterable of string tokens.

    Returns:
        list[str]: One lemma per input token, in the same order.
    """
    lemmas = []
    for token in words:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def preprocess_text(text):
    """Run the complete preprocessing pipeline on a single text.

    The stages are applied in this order, each one consuming the previous
    stage's output:

    1. ``tokenize``            - split the text into tokens
    2. ``remove_punctuation``  - remove punctuation tokens
    3. ``filter_stop_words``   - remove stop words
    4. ``lemmatize_words``     - lemmatize the remaining tokens

    Args:
        text: The string to preprocess.

    Returns:
        list[str]: The lemmatized tokens.
    """
    pipeline = (tokenize, remove_punctuation, filter_stop_words, lemmatize_words)
    result = text
    for stage in pipeline:
        result = stage(result)
    return result

# Run the preprocessing pipeline on every entry of the 'text' column and keep
# the resulting token lists in a new 'processed_text' column.
token_lists = df['text'].apply(preprocess_text)
df['processed_text'] = token_lists

# Preview the first rows of the processed DataFrame
preview_columns = ['URI', 'name', 'processed_text']
print(df[preview_columns].head())

## Βήμα 3: Ευρετήριο (Indexing)

In [None]:
# Number of leading rows of the dataset to include in the index.
N_DOCS = 1000

# Sanity check on the first entry (see lab example): map each token to the
# number of times it occurs. The result is only printed; it is deliberately
# NOT stored in `corpus`, because a 'uri0' key there would add a spurious row
# to the final matrix that merely duplicates the first document.
first_doc_counts = dict(Counter(df['processed_text'].iloc[0]))
print(first_doc_counts)

# Build the index for the first N_DOCS entries: URI -> {token: occurrences}.
# Counter tallies a document in a single pass (calling list.count() once per
# token is quadratic in document length) and keeps keys in first-occurrence
# order. The two columns are paired positionally, so this does not depend on
# the DataFrame's index labels.
corpus = {
    uri: dict(Counter(tokens))
    for uri, tokens in zip(df['URI'].iloc[:N_DOCS], df['processed_text'].iloc[:N_DOCS])
}

# Turn the nested dict into a term-frequency matrix with one row per URI and
# one column per token; tokens absent from a document get a count of 0.
# NOTE: this rebinds `df` (the raw data is no longer reachable under that
# name), so re-running this cell requires re-running the cells above first.
df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T
df


In [None]:
# Persist the DataFrame, index included, to results.csv in the working directory.
df.to_csv(path_or_buf='results.csv', index=True)