In [2]:
pip install spacy

Collecting spacy
  Downloading spacy-3.8.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.12-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.5 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.4-cp311-cp311-manylinux_2_17_x86

In [6]:
import spacy
from spacy.cli import download

# Load the English model, downloading it only when it is not installed yet.
# spacy.load raises OSError for a missing model package, so re-running this
# cell does not trigger a fresh download every time.
model_name = "en_core_web_sm"
try:
    nlp = spacy.load(model_name)
except OSError:
    download(model_name)
    nlp = spacy.load(model_name)


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 168.9 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [10]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [11]:
import nltk
# Fetch the 'punkt_tab' NLTK resource; being the last expression in the cell,
# the call's return value is what the cell displays.
# NOTE(review): the cells visible here tokenize with spaCy only — confirm this
# resource is still needed.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /home/wsuser/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [12]:
import re
import spacy
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the small English spaCy pipeline once; the helper functions defined
# below all read this module-level `nlp` object.
nlp = spacy.load("en_core_web_sm")

# Three short example sentences that serve as the input corpus for the
# cleaning, preprocessing and TF-IDF steps further down.
text_data = [
    "Improving text analytics data quality with advanced NLP techniques.",
    "This is a sample text for demonstrating NLP preprocessing.",
    "Text analytics can be improved using machine learning and NLP."
]

# Text Cleaning Function
def clean_text(text):
    """Normalize raw text to lowercase words separated by single spaces.

    Steps, in order: every non-word character and every underscore becomes a
    space, runs of digits are deleted, the text is lowercased, whitespace
    runs collapse to one space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; empty if nothing but symbols, digits or
        whitespace was present.
    """
    # `\W` alone keeps "_" because the underscore counts as a word character,
    # so it is matched explicitly; otherwise "snake_case" stays one token.
    text = re.sub(r'[\W_]', ' ', text)
    text = re.sub(r'\d+', '', text)  # Drop digit runs entirely
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace left by the steps above
    return text.strip()

# Tokenization and Lemmatization Function (with SpaCy for efficiency)
def preprocess_text(text):
    """Lemmatize text with spaCy, dropping stopwords, punctuation and whitespace.

    Args:
        text: The string to process with the module-level `nlp` pipeline.

    Returns:
        The lemmas of the kept tokens joined by single spaces.
    """
    doc = nlp(text)
    # Whitespace tokens (produced by repeated spaces or newlines) are neither
    # stopwords nor punctuation, so they must be filtered explicitly or they
    # leak into the joined output as stray blanks.
    tokens = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return ' '.join(tokens)

# Advanced NLP Processing with SpaCy (Entity, Noun, Verb extraction)
def advanced_nlp_processing(text):
    """Extract named entities, nouns and verbs from a piece of text.

    Args:
        text: The string to analyze with the module-level `nlp` pipeline.

    Returns:
        A tuple `(entities, nouns, verbs)` of lists of strings: the text of
        each entity span, and the text of each token whose coarse
        part-of-speech tag is "NOUN" or "VERB" respectively.
    """
    parsed = nlp(text)
    nouns, verbs = [], []
    # A token carries exactly one POS tag, so one pass fills both lists.
    for tok in parsed:
        if tok.pos_ == "NOUN":
            nouns.append(tok.text)
        elif tok.pos_ == "VERB":
            verbs.append(tok.text)
    entities = [span.text for span in parsed.ents]
    return entities, nouns, verbs

# Apply cleaning and preprocessing to text data
cleaned_texts = [clean_text(text) for text in text_data]
preprocessed_texts = [preprocess_text(text) for text in cleaned_texts]

# Apply advanced NLP processing
advanced_results = [advanced_nlp_processing(text) for text in preprocessed_texts]

# TF-IDF Vectorization for feature extraction (with optimized stopword removal)
tfidf_vectorizer = TfidfVectorizer(max_features=10, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_texts)

# Convert TF-IDF matrix to DataFrame for better visualization
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Output results
print("Cleaned Texts:\n", cleaned_texts)
print("\nPreprocessed Texts:\n", preprocessed_texts)
print("\nAdvanced NLP Results:\n", advanced_results)
print("\nTF-IDF Matrix:\n", tfidf_df)

# Save results to a CSV file (optional)
tfidf_df.to_csv("tfidf_results.csv", index=False)


Cleaned Texts:
 ['improving text analytics data quality with advanced nlp techniques', 'this is a sample text for demonstrating nlp preprocessing', 'text analytics can be improved using machine learning and nlp']

Preprocessed Texts:
 ['improve text analytic datum quality advanced nlp technique', 'sample text demonstrate nlp preprocessing', 'text analytic improve machine learning nlp']

Advanced NLP Results:
 [([], ['text', 'datum', 'quality', 'nlp', 'technique'], ['improve']), ([], ['sample', 'text', 'nlp', 'preprocessing'], ['demonstrate']), ([], ['text', 'machine', 'nlp'], ['improve', 'learning'])]

TF-IDF Matrix:
    advanced  analytic     datum  demonstrate   improve  learning   machine  \
0  0.509353  0.387376  0.509353     0.000000  0.387376  0.000000  0.000000   
1  0.000000  0.000000  0.000000     0.608845  0.000000  0.000000  0.000000   
2  0.000000  0.387376  0.000000     0.000000  0.387376  0.509353  0.509353   

        nlp  preprocessing      text  
0  0.300832       0.00