In [None]:
# Toy corpus used by every later cell: three raw English sentences.
data = [
    "Welcome to Nitro AI's workshop, hosted by AIIS conference on Natural Language Processing!",
    "Natural Language Processing, or NLP, enables computers to understand human language.",
    "Text preprocessing is a crucial step in NLP pipelines.",
]

# Echo the untouched sentences as a bulleted list so later stages can be compared against them.
print("Original Data:")
print("\n".join(f"- {text}" for text in data))

Original Data:
- Welcome to Nitro AI's workshop, hosted by AIIS conference on Natural Language Processing!
- Natural Language Processing, or NLP, enables computers to understand human language.
- Text preprocessing is a crucial step in NLP pipelines.


Lowercase

In [None]:
# Lowercase every sentence; the tokenization cell consumes this list.
data_lower = list(map(str.lower, data))

print("\nLowercase Data:")
for sentence in data_lower:
    print(f"- {sentence}")


Lowercase Data:
- welcome to nitro ai's workshop, hosted by aiis conference on natural language processing!
- natural language processing, or nlp, enables computers to understand human language.
- text preprocessing is a crucial step in nlp pipelines.


Tokenization

In [None]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np

# Download the 'punkt_tab' resource before calling word_tokenize.
nltk.download('punkt_tab')

# Tokenize each lowercased sentence; punctuation marks come out as separate tokens.
data_tokens = []
for sentence in data_lower:
    data_tokens.append(word_tokenize(sentence))

print("\nTokenized Data:")
for token_list in data_tokens:
    print(f"- {token_list}")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Tokenized Data:
- ['welcome', 'to', 'nitro', 'ai', "'s", 'workshop', ',', 'hosted', 'by', 'aiis', 'conference', 'on', 'natural', 'language', 'processing', '!']
- ['natural', 'language', 'processing', ',', 'or', 'nlp', ',', 'enables', 'computers', 'to', 'understand', 'human', 'language', '.']
- ['text', 'preprocessing', 'is', 'a', 'crucial', 'step', 'in', 'nlp', 'pipelines', '.']


Remove punctuation

In [None]:
# Keep only tokens made up entirely of alphanumeric characters; any token that
# contains a punctuation character (",", "!", "'s", ...) is dropped as a whole.
data_no_punctuation = [list(filter(str.isalnum, tokens)) for tokens in data_tokens]

print("\nData without Punctuation:")
for kept_tokens in data_no_punctuation:
    print(f"- {kept_tokens}")


Data without Punctuation:
- ['welcome', 'to', 'nitro', 'ai', 'workshop', 'hosted', 'by', 'aiis', 'conference', 'on', 'natural', 'language', 'processing']
- ['natural', 'language', 'processing', 'or', 'nlp', 'enables', 'computers', 'to', 'understand', 'human', 'language']
- ['text', 'preprocessing', 'is', 'a', 'crucial', 'step', 'in', 'nlp', 'pipelines']


Remove stopwords

In [None]:
from nltk.corpus import stopwords

# Download NLTK's stopword corpus and load the English list into a set for fast membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Filter the punctuation-free tokens, discarding every English stopword.
data_no_stopwords = []
for tokens in data_no_punctuation:
    content_words = [word for word in tokens if word not in stop_words]
    data_no_stopwords.append(content_words)

print("\nData without Stopwords:")
for content_words in data_no_stopwords:
    print(f"- {content_words}")


Data without Stopwords:
- ['welcome', 'nitro', 'ai', 'workshop', 'hosted', 'aiis', 'conference', 'natural', 'language', 'processing']
- ['natural', 'language', 'processing', 'nlp', 'enables', 'computers', 'understand', 'human', 'language']
- ['text', 'preprocessing', 'crucial', 'step', 'nlp', 'pipelines']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Stemming

In [None]:
from nltk.stem import PorterStemmer

# Reduce every remaining token to its Porter stem (e.g. "language" -> "languag").
stemmer = PorterStemmer()
data_stemmed = [list(map(stemmer.stem, tokens)) for tokens in data_no_stopwords]

print("\nStemmed Data:")
for stems in data_stemmed:
    print(f"- {stems}")


Stemmed Data:
- ['welcom', 'nitro', 'ai', 'workshop', 'host', 'aii', 'confer', 'natur', 'languag', 'process']
- ['natur', 'languag', 'process', 'nlp', 'enabl', 'comput', 'understand', 'human', 'languag']
- ['text', 'preprocess', 'crucial', 'step', 'nlp', 'pipelin']


Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

# Download the WordNet data used by the lemmatizer.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Lemmatize the stopword-free tokens (not the stems). No part-of-speech tag is
# passed, and in the recorded output verb forms such as "hosted" stay unchanged
# while plural nouns such as "computers" are reduced.
data_lemmatized = []
for tokens in data_no_stopwords:
    data_lemmatized.append([lemmatizer.lemmatize(word) for word in tokens])

print("\nLemmatized Data:")
for lemmas in data_lemmatized:
    print(f"- {lemmas}")

[nltk_data] Downloading package wordnet to /root/nltk_data...



Lemmatized Data:
- ['welcome', 'nitro', 'ai', 'workshop', 'hosted', 'aiis', 'conference', 'natural', 'language', 'processing']
- ['natural', 'language', 'processing', 'nlp', 'enables', 'computer', 'understand', 'human', 'language']
- ['text', 'preprocessing', 'crucial', 'step', 'nlp', 'pipeline']


In [None]:
# Re-assemble each lemmatized token list into a single space-separated string,
# which is the input format used by the vectorizers further down.
data_cleaned = []
for lemmas in data_lemmatized:
    data_cleaned.append(" ".join(lemmas))

print("\n Cleaned Data:")
for sentence in data_cleaned:
    print(f"- {sentence}")


 Cleaned Data:
- welcome nitro ai workshop hosted aiis conference natural language processing
- natural language processing nlp enables computer understand human language
- text preprocessing crucial step nlp pipeline


In [None]:
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Vocabulary: every distinct word of the cleaned sentences, in sorted order.
vocab = sorted({word for sentence in data_cleaned for word in sentence.split()})

# Fit the encoder on the vocabulary as a single-column array of shape (len(vocab), 1);
# with sparse_output=False the result is a dense array with one row per vocabulary word.
encoder = OneHotEncoder(sparse_output=False)
encoded_vocab = encoder.fit_transform(np.array(vocab).reshape(-1, 1))

# Map each word to its one-hot row (row i belongs to vocab[i]).
vocab_to_onehot = dict(zip(vocab, encoded_vocab))

# Encode every sentence as a list of one-hot vectors, one per word.
# Membership is tested against the dict (O(1)) instead of scanning the vocab list.
sentence_encodings = [
    [vocab_to_onehot[word] for word in sentence.split() if word in vocab_to_onehot]
    for sentence in data_cleaned
]

# Show the vocabulary and the word-by-word encoding of the first sentence.
print("Vocabular: ", vocab)
print("\nPropozitie originala:", data_cleaned[0])
print("\nEncoding:")
for word, encoding in zip(data_cleaned[0].split(), sentence_encodings[0]):
    print(f"{word}: {encoding}")

Vocabular:  ['ai', 'aiis', 'computer', 'conference', 'crucial', 'enables', 'hosted', 'human', 'language', 'natural', 'nitro', 'nlp', 'pipeline', 'preprocessing', 'processing', 'step', 'text', 'understand', 'welcome', 'workshop']

Propozitie originala: welcome nitro ai workshop hosted aiis conference natural language processing

Encoding:
welcome: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
nitro: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
ai: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
workshop: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
hosted: [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
aiis: [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
conference: [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
natural: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
language: [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
processing: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]

In [None]:
# Same report as for the first sentence, this time for the second one (index 1).
second_sentence = data_cleaned[1]
print("Vocabular: ", vocab)
print("\nPropozitie originala:", second_sentence)
print("\nEncoding:")
for token, one_hot in zip(second_sentence.split(), sentence_encodings[1]):
    print(f"{token}: {one_hot}")

Vocabular:  ['ai', 'aiis', 'computer', 'conference', 'crucial', 'enables', 'hosted', 'human', 'language', 'natural', 'nitro', 'nlp', 'pipeline', 'preprocessing', 'processing', 'step', 'text', 'understand', 'welcome', 'workshop']

Propozitie originala: natural language processing nlp enables computer understand human language

Encoding:
natural: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
language: [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
processing: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
nlp: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
enables: [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
computer: [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
understand: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
human: [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
language: [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the cleaned sentences, then count how often each
# vocabulary word occurs in every sentence (one row per sentence).
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit(data_cleaned).transform(data_cleaned)

In [None]:
# Column labels of the count matrix, followed by the matrix itself as a dense array.
print("Vocab:", vectorizer.get_feature_names_out())
print("Bow Matrix:", bow_matrix.toarray(), sep="\n")

Vocab: ['ai' 'aiis' 'computer' 'conference' 'crucial' 'enables' 'hosted' 'human'
 'language' 'natural' 'nitro' 'nlp' 'pipeline' 'preprocessing'
 'processing' 'step' 'text' 'understand' 'welcome' 'workshop']
Bow Matrix:
[[1 1 0 1 0 0 1 0 1 1 1 0 0 0 1 0 0 0 1 1]
 [0 0 1 0 0 1 0 1 2 1 0 1 0 0 1 0 0 1 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0]]


In [None]:
import pandas as pd

# Wrap the dense count matrix in a DataFrame whose columns are labelled with the vocabulary words.
bow_df = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
print(bow_df)

   ai  aiis  computer  conference  crucial  ...  step  text  understand  welcome  workshop
0   1     1         0           1        0  ...     0     0           0        1         1
1   0     0         1           0        0  ...     0     0           1        0         0
2   0     0         0           0        1  ...     1     1           0        0         0

[3 rows x 20 columns]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the cleaned sentences and compute their TF-IDF weights
# (one row per sentence, one column per vocabulary word).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit(data_cleaned).transform(data_cleaned)

In [None]:
# Column labels of the TF-IDF matrix, followed by the dense weights.
tfidf_dense = tfidf_matrix.toarray()
print("Vocab:", tfidf_vectorizer.get_feature_names_out())
print(tfidf_dense)

Vocab: ['ai' 'aiis' 'computer' 'conference' 'crucial' 'enables' 'hosted' 'human'
 'language' 'natural' 'nitro' 'nlp' 'pipeline' 'preprocessing'
 'processing' 'step' 'text' 'understand' 'welcome' 'workshop']
[[0.338348   0.338348   0.         0.338348   0.         0.
  0.338348   0.         0.25732238 0.25732238 0.338348   0.
  0.         0.         0.25732238 0.         0.         0.
  0.338348   0.338348  ]
 [0.         0.         0.35248004 0.         0.         0.35248004
  0.         0.35248004 0.53614032 0.26807016 0.         0.26807016
  0.         0.         0.26807016 0.         0.         0.35248004
  0.         0.        ]
 [0.         0.         0.         0.         0.42339448 0.
  0.         0.         0.         0.         0.         0.32200242
  0.42339448 0.42339448 0.         0.42339448 0.42339448 0.
  0.         0.        ]]


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the sentences' TF-IDF rows;
# entry [i, j] compares sentence i with sentence j.
similarity_matrix = cosine_similarity(X=tfidf_matrix)
print(similarity_matrix)

[[1.         0.2759218  0.        ]
 [0.2759218  1.         0.08631924]
 [0.         0.08631924 1.        ]]


In [None]:
from textblob import TextBlob

# Score each original (uncleaned) sentence with TextBlob and report the
# polarity and subjectivity fields of the resulting sentiment object.
# The loop iterates the sentences directly: the index was never used.
for text in data:
    sentiment = TextBlob(text).sentiment
    print(f"Sentiment for '{text}': Polarity={sentiment.polarity}, Subjectivity={sentiment.subjectivity}")

Sentiment for 'Welcome to Nitro AI's workshop, hosted by AIIS conference on Natural Language Processing!': Polarity=0.4625, Subjectivity=0.65
Sentiment for 'Natural Language Processing, or NLP, enables computers to understand human language.': Polarity=0.05, Subjectivity=0.25
Sentiment for 'Text preprocessing is a crucial step in NLP pipelines.': Polarity=0.0, Subjectivity=1.0
