In [7]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [1]:
import nltk
import string
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag, ne_chunk
from nltk.chunk import tree2conlltags
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from warnings import filterwarnings
filterwarnings('ignore')

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')

paragraph = """
Apple Inc. is planning to launch a new iPhone in September 2025 in California. 
Tim Cook announced the new features including an AI-powered camera and better battery life.
"""

def clean_and_tokenize(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word not in stopwords.words('english')]

tokens = clean_and_tokenize(paragraph)
print("\n🧹 Cleaned Tokens:\n", tokens)

stemmer = PorterStemmer()
stems = [stemmer.stem(token) for token in tokens]
print("\n🌱 Stemming:\n", stems)

pos_tags = pos_tag(tokens)
print("\n🔤 POS Tags:\n", pos_tags)

ner_tree = ne_chunk(pos_tags)
ner_tags = tree2conlltags(ner_tree)
print("\n🏷️ Named Entity Recognition (IOB format):\n", ner_tags)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([" ".join(tokens)])
print("\n📐 TF-IDF Vocabulary:\n", vectorizer.vocabulary_)

# 6. Word2Vec (Optional but powerful)
model = Word2Vec([tokens], vector_size=50, window=2, min_count=1, workers=2)
print("\n📊 Word2Vec Similar Words to 'iphone':\n", model.wv.most_similar('iphone'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bhumi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bhumi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Bhumi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Bhumi\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Bhumi\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Bhumi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-


🧹 Cleaned Tokens:
 ['apple', 'inc', 'planning', 'launch', 'new', 'iphone', 'september', '2025', 'california', 'tim', 'cook', 'announced', 'new', 'features', 'including', 'aipowered', 'camera', 'better', 'battery', 'life']

🌱 Stemming:
 ['appl', 'inc', 'plan', 'launch', 'new', 'iphon', 'septemb', '2025', 'california', 'tim', 'cook', 'announc', 'new', 'featur', 'includ', 'aipow', 'camera', 'better', 'batteri', 'life']

🔤 POS Tags:
 [('apple', 'NN'), ('inc', 'NN'), ('planning', 'VBG'), ('launch', 'JJ'), ('new', 'JJ'), ('iphone', 'NN'), ('september', 'NN'), ('2025', 'CD'), ('california', 'NN'), ('tim', 'NN'), ('cook', 'NN'), ('announced', 'VBD'), ('new', 'JJ'), ('features', 'NNS'), ('including', 'VBG'), ('aipowered', 'VBD'), ('camera', 'NN'), ('better', 'RBR'), ('battery', 'NN'), ('life', 'NN')]

🏷️ Named Entity Recognition (IOB format):
 [('apple', 'NN', 'O'), ('inc', 'NN', 'O'), ('planning', 'VBG', 'O'), ('launch', 'JJ', 'O'), ('new', 'JJ', 'O'), ('iphone', 'NN', 'O'), ('september', 'NN



### 📍 Tuple:
```python
('life', 'NN', 'O')
```

### 💡 What Each Element Means:

| Position | Value  | Meaning |
|----------|--------|---------|
| `0`      | `'life'` | The actual **word/token**. |
| `1`      | `'NN'`   | The **Part-of-Speech (POS) tag** for the word. In this case, `'NN'` = Noun, singular. |
| `2`      | `'O'`    | The **NER IOB tag**. `'O'` means the word is **not part of any named entity**. |

---

### 🔠 Common POS Tags:

| Tag | Description       |
|-----|-------------------|
| NN  | Noun, singular    |
| NNS | Noun, plural      |
| VB  | Verb, base form   |
| VBD | Verb, past tense  |
| JJ  | Adjective         |
| RB  | Adverb            |

---

### 🏷️ Common IOB Tags:

| Tag  | Meaning                                   |
|------|-------------------------------------------|
| B-LOC | Beginning of a Location entity           |
| I-LOC | Inside a Location entity                 |
| B-PER | Beginning of a Person entity             |
| I-PER | Inside a Person entity                   |
| B-ORG | Beginning of an Organization entity      |
| I-ORG | Inside an Organization entity            |
| O    | Outside any named entity (non-entity word) |

---

### ✅ Example with Entities:

```python
[('Apple', 'NNP', 'B-ORG'), 
 ('Inc.', 'NNP', 'I-ORG'), 
 ('launched', 'VBD', 'O'),
 ('iPhone', 'NNP', 'O')]
```

- `Apple Inc.` is identified as an **organization**.
- `iPhone` is **not** recognized as an ence downstream tasks like question answering or summarization?