# Text Preprocessing - NLP

In [3]:
pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.5.15-cp310-cp310-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ---------------------------------------  41.0/42.0 kB 1.9 MB/s eta 0:00:01
     -------------------------------------- 42.0/42.0 kB 675.2 kB/s eta 0:00:00
Collecting tqdm (from nltk)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.6 kB ? eta -:--:--
     ---------------------------------------- 57.6/57.6 kB 3.2 MB/s eta 0:00:00
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   --- ------------------------------------ 0.1/1.5 MB 3.5 MB/s eta 0:00:01
   ------- -------------------------------- 0.3/1.5 MB 2


[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### Step 1: Import Necessary Libraries

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import warnings
warnings.filterwarnings("ignore")

### Step 2: Sample Text

In [5]:
text = "This is a simple example: we're going to preprocess this text, removing stopwords and punctuation."

### Step 3: Tokenization

In [15]:
words = word_tokenize(text)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\DEVARSHI/nltk_data'
    - 'c:\\Users\\DEVARSHI\\AppData\\Local\\Programs\\Python\\Python310\\nltk_data'
    - 'c:\\Users\\DEVARSHI\\AppData\\Local\\Programs\\Python\\Python310\\share\\nltk_data'
    - 'c:\\Users\\DEVARSHI\\AppData\\Local\\Programs\\Python\\Python310\\lib\\nltk_data'
    - 'C:\\Users\\DEVARSHI\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [16]:
words = text.split()
words

['This',
 'is',
 'a',
 'simple',
 'example:',
 "we're",
 'going',
 'to',
 'preprocess',
 'this',
 'text,',
 'removing',
 'stopwords',
 'and',
 'punctuation.']

### Step 4: Text Cleaning

In [11]:
# Remove punctuation and convert to lowercase
cleaned_words = [word.lower() for word in words if word.isalpha()]
cleaned_words

['this',
 'is',
 'a',
 'simple',
 'going',
 'to',
 'preprocess',
 'this',
 'removing',
 'stopwords',
 'and']

### Step 5: Stop Words Removal

In [17]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in cleaned_words if word not in stop_words]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DEVARSHI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


### Step 6: Print Results

In [18]:
print("Original Text:")
print(text)

print("\nTokenized Text:")
print(words)

print("\nCleaned and Stop Words Removed:")
print(filtered_words)

Original Text:
This is a simple example: we're going to preprocess this text, removing stopwords and punctuation.

Tokenized Text:
['This', 'is', 'a', 'simple', 'example:', "we're", 'going', 'to', 'preprocess', 'this', 'text,', 'removing', 'stopwords', 'and', 'punctuation.']

Cleaned and Stop Words Removed:
['simple', 'going', 'preprocess', 'removing', 'stopwords']


In [19]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [None]:
#perticular stop words remove
#create own list
#stop_words = ['this','here']

In [None]:
#add own words in stop_words list
#stop_words.append("words")
#stop_words.except("words")