In [1]:
!pip install nltk



In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# Fetch the NLTK data used by this notebook: tokenizer models for
# word_tokenize and the stop-word lists for stopwords.words.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from the 'punkt_tab' resource
# instead of the pickled 'punkt' models; download both so word_tokenize works
# regardless of which NLTK version the unpinned install resolved to.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
def preprocess_text(text):
    """Tokenize ``text`` and return its cleaned tokens before and after stemming.

    The pipeline is: tokenize with ``word_tokenize``, lowercase every token,
    keep only purely alphabetic tokens, drop English stop words, and finally
    stem the survivors with ``PorterStemmer``.

    Args:
        text: The raw string to process.

    Returns:
        A ``(words_no_stopwords, words_stemmed)`` tuple of two lists: the
        lowercased alphabetic tokens with stop words removed, and the stemmed
        form of each of those tokens, in the same order.
    """
    raw_tokens = word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    # Single pass: normalise case, then keep alphabetic, non-stop-word tokens.
    kept_tokens = []
    for raw_token in raw_tokens:
        lowered = raw_token.lower()
        if lowered.isalpha() and lowered not in english_stop_words:
            kept_tokens.append(lowered)

    stemmed_tokens = list(map(porter.stem, kept_tokens))
    return kept_tokens, stemmed_tokens

In [10]:
# Sample sentence used to demonstrate the preprocessing pipeline.
text = """Natural Language Processing (NLP) is a field of artificial intelligence that gives the machines the ability to read, understand and derive meaning from human languages."""

words_no_stopwords, words_stemmed = preprocess_text(text)

# Show both stages of the result, one labelled line per stage.
for label, tokens in (
    ("After stop word removal:", words_no_stopwords),
    ("After stemming:", words_stemmed),
):
    print(label, tokens)

After stop word removal: ['natural', 'language', 'processing', 'nlp', 'field', 'artificial', 'intelligence', 'gives', 'machines', 'ability', 'read', 'understand', 'derive', 'meaning', 'human', 'languages']
After stemming: ['natur', 'languag', 'process', 'nlp', 'field', 'artifici', 'intellig', 'give', 'machin', 'abil', 'read', 'understand', 'deriv', 'mean', 'human', 'languag']


In [11]:
# Load the whole input document into memory as a single string.
input_file_path = 'input.txt'
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(input_file_path, 'r', encoding='utf-8') as file:
    text = file.read()

In [12]:
# Run the preprocessing pipeline on the current contents of `text`.
words_no_stopwords, words_stemmed = preprocess_text(text)

# Save the results to output files
output_no_stopwords_file_path = 'output_no_stopwords.txt'
output_stemmed_file_path = 'output_stemmed.txt'

# Tokens are written space-separated. The encoding is set explicitly because
# str.isalpha() keeps non-ASCII letters, which a non-UTF-8 default locale
# encoding may be unable to write.
with open(output_no_stopwords_file_path, 'w', encoding='utf-8') as file:
    file.write(' '.join(words_no_stopwords))

with open(output_stemmed_file_path, 'w', encoding='utf-8') as file:
    file.write(' '.join(words_stemmed))

print(f"Processed text saved to {output_no_stopwords_file_path} and {output_stemmed_file_path}")

Processed text saved to output_no_stopwords.txt and output_stemmed.txt
