# Setup
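First, let's import all the necessary libraries. You need to have nltk, numpy, and scikit-learn (imported as `sklearn`) installed in your environment. The later Keras and BERT sections additionally require tensorflow, transformers, and pandas.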

In [None]:
import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Fetch the Punkt tokenizer data up front; the tokenization cells below use NLTK's tokenizers.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# 1. Text Tokenization
Tokenization is the process of breaking text into pieces, called tokens.

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize

# Sample text reused by the following sections of the notebook.
text = "Welcome to NLP! This is an introductory tutorial."

# Word-level split: punctuation marks become tokens of their own.
word_tokens = word_tokenize(text)
print("Word Tokenization:", word_tokens)

# Sentence-level split.
sentence_tokens = sent_tokenize(text)
print("Sentence Tokenization:", sentence_tokens)


Word Tokenization: ['Welcome', 'to', 'NLP', '!', 'This', 'is', 'an', 'introductory', 'tutorial', '.']
Sentence Tokenization: ['Welcome to NLP!', 'This is an introductory tutorial.']


# 2. Stop Words Removal
Stop words are common words that are usually removed in NLP processes.

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# A set gives O(1) membership checks while filtering.
stop_words = set(stopwords.words('english'))
words = word_tokenize(text)

# The stop-word list is lowercase, so compare the lowercased token; otherwise
# capitalized stop words such as "This" slip through the filter. The original
# casing of the kept tokens is preserved.
filtered_words = [word for word in words if word.lower() not in stop_words]

print("Filtered Words:", filtered_words)


Filtered Words: ['Welcome', 'NLP', '!', 'This', 'introductory', 'tutorial', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# 3. Stemming and Lemmatization
Stemming and Lemmatization are techniques to reduce words to their root form.

**Lemmatization**
Lemmatization is the process of converting a word to its base or root form called a lemma. It involves the use of vocabulary and morphological analysis of words, aiming to remove inflectional endings to return the base dictionary form of a word, which is known as the lemma.

For example:

The lemma of the word "running" is "run."
The lemma of "better" is "good."
Lemmatization is more sophisticated than stemming and uses lexical knowledge bases (like WordNet) to obtain the correct base forms of words. It takes the part of speech of a word into account, making it more accurate for words with complex morphology. Note that NLTK's `WordNetLemmatizer` treats every word as a noun unless you pass a `pos` argument (e.g. `lemmatize("running", pos="v")` returns "run").

**Stemming**
Stemming is a more rudimentary technique compared to lemmatization. It involves cutting off the ends of words in the hope of achieving the goal correctly most of the time. It works by removing the suffixes from words, thereby simplifying them to their base form.

For example:

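The stem of the word "running" is "run."
The stem of "introductory" is "introductori," which is not a real word, while "better" is left unchanged because a stemmer cannot relate it to "good."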
Stemming algorithms are usually rule-based. The most common stemming algorithms, like the Porter stemmer, simply chop off standard affixes from words. This means that stemming can sometimes produce non-words or incorrect stems, which is less of a concern in lemmatization.

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')

# One instance of each normalizer, reused for every token.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Run both normalizers over the tokens that survived stop-word removal.
stemmed_words = list(map(stemmer.stem, filtered_words))
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_words))

print("Stemmed Words:", stemmed_words)
print("Lemmatized Words:", lemmatized_words)


[nltk_data] Downloading package wordnet to /root/nltk_data...


Stemmed Words: ['welcom', 'nlp', '!', 'thi', 'introductori', 'tutori', '.']
Lemmatized Words: ['Welcome', 'NLP', '!', 'This', 'introductory', 'tutorial', '.']


# 4. Bag of Words and TF-IDF
These are methods to convert text data into numerical form.

In [None]:
# The vectorizers expect an iterable of *documents* (strings), one matrix row per
# document. Feeding them single tokens would turn every token into its own
# "document", producing a one-hot matrix with empty rows for punctuation.
# Here each sentence of `text` is one document, with stop words removed first.
documents = [
    " ".join(
        token for token in word_tokenize(sentence)
        if token.lower() not in stop_words
    )
    for sentence in sent_tokenize(text)
]

# Bag of Words: raw term counts per document.
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(documents)
# Column i of the matrices below corresponds to vocabulary entry i.
print("Vocabulary:", vectorizer.get_feature_names_out())
print("Bag of Words:", bow.toarray())

# TF-IDF: counts re-weighted so terms shared by many documents score lower.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(documents)
print("TF-IDF:", tfidf.toarray())


Bag of Words: [[0 0 0 0 1]
 [0 1 0 0 0]
 [0 0 0 0 0]
 [0 0 1 0 0]
 [1 0 0 0 0]
 [0 0 0 1 0]
 [0 0 0 0 0]]
TF-IDF: [[0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0.]]


# 5. Basic Sentiment Analysis
Let's do a very basic sentiment analysis using VADER, a ready-made lexicon- and rule-based sentiment analyzer that ships with NLTK (no training required).

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
# The analyzer needs the VADER lexicon to be available locally.
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()
# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' entries.
sentiment = sia.polarity_scores(text)
print("Sentiment:", sentiment)


Sentiment: {'neg': 0.0, 'neu': 0.68, 'pos': 0.32, 'compound': 0.5093}


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


# With Keras

Step 1: Import Necessary Libraries

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


Step 2: Prepare the Dataset
We'll create a small dataset of sentences and labels for demonstration purposes.

In [None]:
# Each entry pairs a sentence with its label:
# 1 for positive sentiment, 0 for negative sentiment.
labeled_sentences = [
    ('I love my dog', 1),
    ('I love my cat', 1),
    ('You love my dog!', 1),
    ('Do you think my dog is amazing?', 1),
    ('I hate vegetables', 0),
    ('My room is a mess', 0),
    ('I dislike noisy places', 0),
    ('Do you know how to drive?', 0),
]

# Split the pairs into the two parallel lists used by the rest of the notebook.
sentences = [sentence for sentence, _ in labeled_sentences]
labels = [label for _, label in labeled_sentences]


Step 3: Tokenize the Text

In [None]:
# Tunable preprocessing settings, named so their meaning is visible at a glance.
VOCAB_SIZE = 100      # upper bound on the number of word ids the tokenizer emits
MAX_LEN = 10          # every sequence is padded/truncated to this many ids
OOV_TOKEN = "<OOV>"   # placeholder for words not seen during fitting

# Learn the word -> integer id mapping from the training sentences.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode each sentence as ids, then bring all of them to the same length.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=MAX_LEN)


Step 4: Build the Model
We will use a simple architecture with an Embedding layer.

In [None]:
# Assemble the network layer by layer.
model = Sequential()
# Map each of the 100 possible word ids to a 16-dimensional vector.
model.add(Embedding(100, 16, input_length=10))
# Average the word vectors of a sentence into a single vector.
model.add(GlobalAveragePooling1D())
model.add(Dense(24, activation='relu'))
# Single sigmoid unit: the output is a score between 0 and 1.
model.add(Dense(1, activation='sigmoid'))


Step 5: Compile the Model


In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


Step 6: Train the Model

In [None]:
# The labels are a plain Python list; hand Keras a NumPy array instead.
model.fit(x=padded, y=np.array(labels), epochs=10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7893d7993c40>

Step 7: Test the Model
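You can now test the model with new sentences. First, they need to be tokenized and padded. Note that with only eight training sentences and ten epochs the model barely learns anything, and most words in the new sentences are out-of-vocabulary, so expect scores close to 0.5.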

In [None]:
test_sentences = [
    "I enjoy my time at the park",
    "The movie was terrible",
]

# Reuse the tokenizer fitted on the training sentences and the same padded length.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=10)

# Closer to 1 indicates positive sentiment, closer to 0 indicates negative sentiment.
predictions = model.predict(test_padded)
print(predictions)


[[0.50172436]
 [0.49563262]]


# List of NLP Models

Here's a list of some of the best NLP models as of 2023, based on information from various sources:

**GPT-4 (Generative Pre-trained Transformer 4):** Developed by OpenAI, GPT-4 is a multimodal large language model and the fourth in the GPT series. It's known for its capability to process both text and images and is available through ChatGPT Plus and its commercial API​​.

**GPT-3 (Generative Pre-trained Transformer 3):** Preceding GPT-4, GPT-3 is notable for its massive scale, containing 175 billion parameters. It has revolutionized NLP with its ability to generate human-like text and perform a variety of tasks without task-specific fine-tuning​​.

**BERT (Bidirectional Encoder Representations from Transformers):** Introduced by Google, BERT has been a groundbreaking model in NLP, particularly effective in understanding the context of words in sentences. It's used in various tasks like sentiment analysis, question answering, and text classification​​.

**ELMo (Embeddings from Language Models):** ELMo generates contextualized word embeddings using a bidirectional language model. It's been successful in a range of NLP tasks, including sentiment analysis and question answering.

**RoBERTa (Robustly Optimized BERT approach):** A variant of BERT, RoBERTa was trained on a larger text corpus and with more advanced techniques, leading to improved performance on many NLP benchmarks​​.

**T5 (Text-to-Text Transfer Transformer):** Developed by Google, T5 can be fine-tuned for various tasks such as text generation and translation. It's known for its performance on tasks including summarization and machine translation​​.

**ALBERT (A Lite BERT):** A more efficient version of BERT, ALBERT maintains similar performance while reducing the number of parameters, making it faster​​.

**XLNet:** An autoregressive NLP model, XLNet outperforms BERT on several benchmarks and is known for its ability to understand language context better​​.

**GPT-2 (Generative Pre-trained Transformer 2):** The predecessor to GPT-3, GPT-2 also achieves impressive results on various NLP tasks, despite having fewer parameters than GPT-3​​.

**ULMFiT (Universal Language Model Fine-tuning):** This model can be fine-tuned for different tasks such as text classification and question answering, leveraging transfer learning to understand natural language structures​​.

**DistilBERT (Distilled BERT):** A streamlined version of BERT, DistilBERT is smaller and faster but still maintains strong performance on NLP tasks​​.

**ELECTRA:** Known for its state-of-the-art performance, ELECTRA replaces some input tokens with synthetic tokens during training, enhancing its language representation capabilities​​.

**GPT (Generative Pre-trained Transformer):** The original GPT model uses a transformer-based architecture and, despite fewer parameters than its successors, performs well on tasks like text generation and machine translation​​.

**XLM-RoBERTa:** A cross-lingual language model by Facebook AI Research, it can handle text in multiple languages, trained on a diverse corpus​​.

**UniLM (Universal Language Model):** UniLM is versatile for tasks like text classification, question answering, and text generation, using both uni-directional and bi-directional transformers​​.

**MobileBERT:** Optimized for mobile devices, MobileBERT is a compact version of BERT that maintains performance while being efficient for mobile applications​​.

**DeBERTa (Decoding-enhanced BERT with Disentangled Attention):** DeBERTa uses disentangled attention mechanisms, excelling in text classification and sentiment analysis​​.

**CTRL (Conditional Transformer Language Model):** CTRL generates text based on specific topics or contexts, guided by user inputs​​.

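**GShard:** Designed for training very large models, GShard automatically shards computation across many accelerators and was used to scale sparsely-gated Mixture-of-Experts Transformers for multilingual translation to hundreds of billions of parameters.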

**Flair:** A flexible framework that combines different neural network architectures for a range of NLP tasks, including named entity recognition​​.

# With BERT

Step 1: Import Necessary Libraries

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf
import pandas as pd


Load the BERT Tokenizer and Model

In [None]:
# Checkpoint identifier shared by the tokenizer and the model so the two stay matched.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# NOTE: this rebinds `tokenizer` and `model`, which the Keras section above also
# uses; run the notebook top to bottom to avoid mixing the two.
model = TFBertForSequenceClassification.from_pretrained(model_name)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Prepare the Dataset
For simplicity, we'll use a small dataset of sentences and their sentiment labels (1 for positive, 0 for negative).

In [None]:
# Example data: (sentence, label) pairs, 1 for positive and 0 for negative.
labeled_examples = [
    ("I love this product!", 1),
    ("Absolutely great experience.", 1),
    ("I am unhappy with the service.", 0),
    ("This is a terrible product.", 0),
]

# One record per example, keyed the same way as the DataFrame columns.
data = [
    {"sentence": sentence, "label": label}
    for sentence, label in labeled_examples
]

df = pd.DataFrame(data)
df

Preprocess the Data
Convert the sentences to the format BERT expects: input IDs and attention masks. Token type IDs are only needed for sentence-pair inputs, so they are not returned here.

In [None]:
def convert_example_to_feature(review, max_length=512):
    """Encode one sentence with the notebook-level BERT ``tokenizer``.

    Args:
        review: The raw sentence to encode.
        max_length: Length every encoding is padded or truncated to.
            Defaults to 512, the value this notebook has always used.

    Returns:
        The tokenizer's encoding for ``review``, containing ``input_ids`` and
        ``attention_mask`` (token type ids are not requested).
    """
    return tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=max_length,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        # Request truncation explicitly; otherwise the tokenizer warns and falls
        # back to the same 'longest_first' strategy on its own.
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=False,
    )

input_features = [convert_example_to_feature(sentence) for sentence in df['sentence']]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Convert to TensorFlow Dataset
Ensure data is in a format suitable for training.

In [None]:
def convert_features_to_tf_dataset(features, labels):
    """Wrap encoded features and their labels in a ``tf.data.Dataset``.

    Args:
        features: Iterable of encodings, each providing ``'input_ids'`` and
            ``'attention_mask'``.
        labels: Iterable of labels, paired with ``features`` by position.

    Returns:
        An unbatched dataset yielding ``(inputs_dict, label)`` elements.
    """
    # Element structure: a dict of two int32 vectors plus a scalar int64 label.
    element_types = (
        {'input_ids': tf.int32, 'attention_mask': tf.int32},
        tf.int64,
    )
    element_shapes = (
        {'input_ids': tf.TensorShape([None]),
         'attention_mask': tf.TensorShape([None])},
        tf.TensorShape([]),
    )

    def iter_examples():
        for encoded, target in zip(features, labels):
            model_inputs = {
                key: encoded[key] for key in ('input_ids', 'attention_mask')
            }
            yield model_inputs, target

    return tf.data.Dataset.from_generator(iter_examples, element_types, element_shapes)

# All examples fit in a single batch of up to 8 elements.
ds = convert_features_to_tf_dataset(input_features, df['label'].values).batch(8)


In [None]:
# Small learning rate for fine-tuning the pre-trained weights.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
# from_logits=True: the loss is given raw scores rather than probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'],
)


In [None]:
# `ds` is an already-batched tf.data.Dataset. `use_multiprocessing` only concerns
# Python generator / keras.utils.Sequence inputs, and newer Keras releases reject
# the argument altogether, so it is not passed here.
model.fit(ds, epochs=3)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x78931057b1c0>

In [None]:
# Example test data
test_data = [
    {"sentence": "I had a pleasant experience with the service.", "label": 1},
    {"sentence": "The product was awful.", "label": 0},
    {"sentence": "I am delighted with my purchase.", "label": 1},
    {"sentence": "This was the worst experience of my life.", "label": 0}
]

test_df = pd.DataFrame(test_data)

# Convert test data for BERT.
# Just like the training data, the test data needs to be tokenized and
# converted into the format expected by BERT.
test_input_features = [convert_example_to_feature(sentence) for sentence in test_df['sentence']]

# Convert to a TensorFlow dataset built from the test features.
test_ds = convert_features_to_tf_dataset(test_input_features, test_df['label'].values).batch(8)

# Evaluate the model on the test dataset.
# The results are bound to test_* names on purpose: reusing `loss` would
# overwrite the loss object created in the compile cell and break that cell
# if it is run again.
test_loss, test_accuracy = model.evaluate(test_ds)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

# Predictions (optional).
# Compare the model's predicted class with the actual label for each sentence.
predictions = model.predict(test_ds)
# argmax over the class axis picks the highest-scoring class; .numpy() turns the
# result into plain integers for the comparison below.
predicted_labels = tf.argmax(predictions.logits, axis=1).numpy()

for prediction, actual in zip(predicted_labels, test_df['label']):
    print("Predicted Sentiment:", "Positive" if prediction == 1 else "Negative",
          " | Actual Sentiment:", "Positive" if actual == 1 else "Negative")




Test Loss: 0.7293455600738525
Test Accuracy: 0.0
Predicted Sentiment: Negative  | Actual Sentiment: Positive
Predicted Sentiment: Positive  | Actual Sentiment: Negative
Predicted Sentiment: Negative  | Actual Sentiment: Positive
Predicted Sentiment: Positive  | Actual Sentiment: Negative
