# Natural Language Processing with Python
A Comprehensive Cheat Sheet for NLP Tasks and Techniques

# Text Preprocessing
# ▶ Cleaning & Tokenization


In [None]:
import re, nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by clean_text below.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the Punkt tokenizer models from the
# 'punkt_tab' package rather than the pickled 'punkt' one, so word_tokenize
# raises LookupError there unless this resource is also present.
nltk.download('punkt_tab')

def clean_text(text, stop_words=None):
  """Lowercase, strip punctuation, tokenize and drop stop words.

  Args:
    text: Raw input string.
    stop_words: Optional iterable of words to discard. When omitted, NLTK's
      English stop-word list is loaded. Pass a prebuilt set when cleaning
      many documents (avoids reloading the list on every call) or a custom
      list for another language or domain.

  Returns:
    The remaining tokens as a list, in their original order.
  """
  if stop_words is None:
    stop_words = stopwords.words('english')
  if not isinstance(stop_words, (set, frozenset)):
    # Set membership keeps the filter below O(1) per token.
    stop_words = set(stop_words)

  # Punctuation is removed *before* tokenizing, so contractions collapse
  # ("don't" -> "dont") ahead of the stop-word filter.
  text = re.sub(r'[^\w\s]', '', text.lower())
  return [w for w in word_tokenize(text) if w not in stop_words]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## ▶ Stemming vs Lemmatization



In [None]:
# Stemming with NLTK
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# Printed explicitly: a notebook cell only echoes its *last* expression, so a
# bare call here would be computed and silently dropped.
print(stemmer.stem('running'))

# Lemmatization with spaCy
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp("I am running in the park")
[token.lemma_ for token in doc] # ['I', 'be', 'run', 'in', 'the','park']

['I', 'be', 'run', 'in', 'the', 'park']


# Preprocessing Tips

- Always lowercase text for consistency.

- Remove stopwords for topic modeling, but keep them for sentiment analysis.

- Use lemmatization over stemming when meaning preservation is important.

- Consider domain-specific preprocessing (e.g., hashtags for social media).


# Feature Extraction
## ▶ Bag of Words sklearn

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Tiny two-document corpus; the TF-IDF cell further down reuses it.
corpus = ["Natural Language Processing.",
          "I love Learning about NLP."]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# Print the vocabulary so the columns of the count matrix can be interpreted;
# only the final expression of a cell is echoed automatically.
print(vectorizer.get_feature_names_out())
X.toarray()

array([[0, 1, 0, 0, 1, 0, 1],
       [1, 0, 1, 1, 0, 1, 0]])


# ▶ TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Same corpus as the Bag of Words cell, now weighted by TF-IDF.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# Print the vocabulary so the columns of the matrix below can be interpreted;
# only the final expression of a cell is echoed automatically.
print(vectorizer.get_feature_names_out())
X.toarray()



array([[0.        , 0.57735027, 0.        , 0.        , 0.57735027,
        0.        , 0.57735027],
       [0.5       , 0.        , 0.5       , 0.5       , 0.        ,
        0.5       , 0.        ]])

# ▶ Word Embeddings Gensim

In [None]:
from gensim.models import Word2Vec
# Each sentence is a list of already-tokenized, lowercased words.
sentences = [["natural", "language"], ["machine", "learning"]]

# seed + workers=1 make training repeatable within a session; gensim notes that
# reproducibility across interpreter launches additionally needs PYTHONHASHSEED.
model = Word2Vec(sentences, min_count=1, vector_size=100, window=5,
                 seed=42, workers=1)
vector = model.wv['natural']
similar = model.wv.most_similar('natural', topn=5)
print(vector)
print(similar)

# When to Use Each Feature Type

- BoW/TF-IDF: Text classification, document clustering.

- Word Embeddings: Semantic tasks, text similarity, transfer learning.

- Contextual Embeddings: Advanced tasks requiring context understanding.

# Text Classification
# ▶ Basic Pipeline sklearn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline



# Toy training data: one example per class. This is only enough to show the
# API; treat the prediction below as illustrative, not meaningful.
X_train = ["I love this product", "This is terrible"]
y_train = ["positive", "negative"]

# Vectorize with TF-IDF, then classify with multinomial Naive Bayes.
# Pipeline.fit returns the fitted pipeline, so build and train in one step.
text_clf = Pipeline(
  steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
  ]
).fit(X_train, y_train)

text_clf.predict(["This was awesome"])

array(['negative'], dtype='<U8')

# ▶ Using Transformers Transformers

In [None]:
from transformers import pipeline

# Name the checkpoint and revision explicitly instead of relying on the task
# default: the library itself warns that an unpinned pipeline is not
# recommended, and the default may change between releases.
classifier = pipeline(
  "sentiment-analysis",
  model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
  revision="714eb0f",
)
classifier("I love this product")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9998788833618164}]

In [None]:
# Reuses the sentiment `classifier` built in the previous cell.
classifier("I've been waiting for this movie!")

[{'label': 'POSITIVE', 'score': 0.9878816604614258}]

* Classification Task   

  *   Sentiment Analysis
  *   Topic Classification
  *   Intent Recognition
  *   Spam Detection






---



*   Recommended Approach

  *   VADER (rule-based) or fine-tuned BERT

  *   TF-IDF + SVM or DistilBERT
  *   Fine-tuned RoBERTa
  *  TF-IDF + Naive Bayes






# Named Entity Recognition
# ▶ spaCy NER spaCy

In [None]:
import spacy

# Load the small English pipeline and run it over one sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is buying U.K. startup for $1 billion")

# One line per recognized entity: surface text followed by its label.
for ent in doc.ents:
  print(f"{ent.text} {ent.label_}")
# Expected:
#   Apple ORG
#   U.K. GPE
#   $1 billion MONEY

Apple ORG
U.K. GPE
$1 billion MONEY


# ▶ Transformers NER Transformers


Common Entity Types

*   PER/PERSON: People names
*   ORG: Organizations, companies
*   LOC/GPE: Locations, geopolitical entities
*   DATE/TIME: Temporal expressions
*   MONEY: Monetary values








In [None]:
from transformers import pipeline

# Name the checkpoint and revision explicitly instead of relying on the task
# default: the library itself warns that an unpinned pipeline is not
# recommended, and the default may change between releases.
ner = pipeline(
  "ner",
  model="dbmdz/bert-large-cased-finetuned-conll03-english",
  revision="4c53496",
)
text = "My name is Sarah and I work at Google in London"
ner_results = ner(text)

ner_results
# [{'entity': 'I-PER', 'score': 0.99, 'word': 'Sarah'}, ...]

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Device set to use cpu


[{'entity': 'I-PER',
  'score': np.float32(0.9986339),
  'index': 4,
  'word': 'Sarah',
  'start': 11,
  'end': 16},
 {'entity': 'I-ORG',
  'score': np.float32(0.9985827),
  'index': 9,
  'word': 'Google',
  'start': 31,
  'end': 37},
 {'entity': 'I-LOC',
  'score': np.float32(0.99839896),
  'index': 11,
  'word': 'London',
  'start': 41,
  'end': 47}]

# Sentiment Analysis
# ▶ TextBlob TextBlob



In [None]:
from textblob import TextBlob

text = "The movie was absolutely amazing!"
blob = TextBlob(text)

# Polarity runs from -1 (negative) to 1 (positive)
print(blob.sentiment.polarity)
# Subjectivity runs from 0 (objective) to 1 (subjective)
print(blob.sentiment.subjectivity)

0.7500000000000001
0.9


# ▶ VADER Sentiment NLTK



In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
# The analyzer reads its word scores from the 'vader_lexicon' resource,
# so fetch that before constructing it.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

text = "The movie was absolutely amazing!"
# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' keys.
scores = sia.polarity_scores(text)
print(scores)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


{'neg': 0.0, 'neu': 0.477, 'pos': 0.523, 'compound': 0.6581}


Sentiment Analysis Tips


*   Rule-based approaches work well for straightforward text
*   Consider using domain-specific models for specialized content
*   ML/DL approaches handle context, sarcasm, and negation better
*   Use BERT variants for state-of-the-art performance






# Topic Modeling
# ▶ LDA with Gensim Gensim
```python
from gensim.corpora import Dictionary
from gensim.models import LdaModel
docs = [
  "Machine learning is a subset of AI",
  "NLP is used for text analysis"
]
# Tokenize
tokenized_docs = [doc.lower().split() for doc in docs]
# Create dictionary & corpus
dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
# Train LDA model
lda_model = LdaModel(
  corpus=corpus,
  id2word=dictionary,
  num_topics=2,
  passes=10
)
# Print topics
topics = lda_model.print_topics()
for topic in topics:
  print(topic)
```

In [None]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
docs = [
  "Machine learning is a subset of AI",
  "NLP is used for text analysis"
]

# Tokenize
tokenized_docs = [doc.lower().split() for doc in docs]
# Create dictionary & bag-of-words corpus.
# Named bow_corpus so it does not overwrite the raw-text `corpus` list that the
# vectorizer cells earlier in the notebook read.
dictionary = Dictionary(tokenized_docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
# Train LDA model; random_state fixes the seed so the topics are repeatable.
lda_model = LdaModel(corpus=bow_corpus, id2word=dictionary, num_topics=2,
                     passes=10, random_state=42)
# Print topics
topics = lda_model.print_topics()
for topic in topics:
  print(topic)

# Advanced Techniques
# ▶ Text Summarization Transformers



In [None]:
from transformers import pipeline
# Name the checkpoint explicitly rather than relying on the task default,
# which may change between library releases.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
# Placeholder input: replace with a real multi-sentence passage.
long_text = """ NLP is a field of AI that focuses on ..."""
# max_length / min_length bound the length of the generated summary in tokens.
summary = summarizer(long_text, max_length=100, min_length=30)
print(summary[0]['summary_text'])

# ▶ Translation Transformers

In [None]:

from transformers import pipeline
# Name the checkpoint explicitly rather than relying on the task default,
# which may change between library releases.
translator = pipeline("translation_en_to_fr", model="google-t5/t5-base")
translation = translator("Hello, how are you?")
print(translation[0]['translation_text'])


# ▶ Question Answering Transformers

In [None]:
from transformers import pipeline
# Name the checkpoint explicitly rather than relying on the task default,
# which may change between library releases.
qa = pipeline(
  "question-answering",
  model="distilbert/distilbert-base-cased-distilled-squad",
)
# Extractive QA returns a span copied from the context, so the context has to
# actually contain the answer to the question.
context = "Python is a programming language created by Guido van Rossum."
question = "Who created Python?"
result = qa(question=question, context=context)
print(result['answer'])



```
Task              |Beginner Approach        |Advanced Approach
-----------------------------------------------------------------
Summarization     |Extractive (TextRank)    |Abstractive (T5, BART)
Translation       |Pre-trained pipeline     |Custom Seq2Seq models
Q&A               |Rule-based systems       |Fine-tuned BERT/T5
Text Generation   |Markov Chains            |GPT models
```



# NLP Project Evaluation & Tips

---


* Evaluation Metrics by Task
  * Classification: Accuracy, F1-score, Precision, Recall
  * NER: F1-score, Precision, Recall (by entity type)
  * Summarization: ROUGE-N, ROUGE-L, BLEU
  * Translation: BLEU, METEOR, TER
  * Generation: Perplexity, human evaluation





* Best Practices for NLP Projects
  * Start simple: Try basic models before complex ones
  * Clean your data thoroughly: Good preprocessing is crucial
  * Consider context: Many NLP problems need contextual understanding
  * Leverage pre-trained models: Often outperform models trained from scratch
  * Handle class imbalance: Use oversampling or adjusted weights
  * Use cross-validation: Especially for small datasets
  * Evaluate properly: Choose appropriate metrics for your task
