In [5]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import spacy

# Download required resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Sample text
text = "running runs easily fair fairness"
tokens = word_tokenize(text)

# Stemming
stemmed_words = [stemmer.stem(word) for word in tokens]
print("Stemming:", stemmed_words)

# Lemmatization using NLTK
lemmatized_words_nltk = [lemmatizer.lemmatize(word) for word in tokens]
print("Lemmatization (NLTK):", lemmatized_words_nltk)

# Lemmatization using spaCy
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
lemmatized_words_spacy = [token.lemma_ for token in doc]
print("Lemmatization (spaCy):", lemmatized_words_spacy)

# Part-of-Speech (POS) Tagging
pos_tags = nltk.pos_tag(tokens)
print("POS Tagging:", pos_tags)

# Dependency Parsing
for token in doc:
    print(f"{token.text} -> {token.dep_} -> {token.head.text}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Stemming: ['run', 'run', 'easili', 'fair', 'fair']
Lemmatization (NLTK): ['running', 'run', 'easily', 'fair', 'fairness']
Lemmatization (spaCy): ['run', 'run', 'easily', 'fair', 'fairness']
POS Tagging: [('running', 'VBG'), ('runs', 'NNS'), ('easily', 'RB'), ('fair', 'JJ'), ('fairness', 'NN')]
running -> amod -> runs
runs -> ROOT -> runs
easily -> advmod -> fair
fair -> amod -> fairness
fairness -> dobj -> runs


In [9]:
pip install allennlp allennlp-models

Collecting allennlp
  Downloading allennlp-2.10.1-py3-none-any.whl.metadata (21 kB)
Collecting allennlp-models
  Downloading allennlp_models-2.10.1-py3-none-any.whl.metadata (23 kB)
INFO: pip is looking at multiple versions of allennlp to determine which version is compatible with other requirements. This could take a while.
Collecting allennlp
  Downloading allennlp-2.10.0-py3-none-any.whl.metadata (20 kB)
  Downloading allennlp-2.9.3-py3-none-any.whl.metadata (19 kB)
  Downloading allennlp-2.9.2-py3-none-any.whl.metadata (19 kB)
  Downloading allennlp-2.9.1-py3-none-any.whl.metadata (19 kB)
  Downloading allennlp-2.9.0-py3-none-any.whl.metadata (18 kB)
  Downloading allennlp-2.8.0-py3-none-any.whl.metadata (17 kB)
  Downloading allennlp-2.7.0-py3-none-any.whl.metadata (17 kB)
INFO: pip is still looking at multiple versions of allennlp to determine which version is compatible with other requirements. This could take a while.
  Downloading allennlp-2.6.0-py3-none-any.whl.metadata (17 k

In [11]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import spacy
from nltk.wsd import lesk
from collections import defaultdict
# Download required resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Sample text
text = "John met Mary in New York. He said he would visit her again next Monday."
tokens = word_tokenize(text)

# Stemming
stemmed_words = [stemmer.stem(word) for word in tokens]
print("Stemming:", stemmed_words)

# Lemmatization using NLTK
lemmatized_words_nltk = [lemmatizer.lemmatize(word) for word in tokens]
print("Lemmatization (NLTK):", lemmatized_words_nltk)

# Lemmatization using spaCy
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
lemmatized_words_spacy = [token.lemma_ for token in doc]
print("Lemmatization (spaCy):", lemmatized_words_spacy)

# Part-of-Speech (POS) Tagging
pos_tags = nltk.pos_tag(tokens)
print("POS Tagging:", pos_tags)

# Dependency Parsing
for token in doc:
    print(f"{token.text} -> {token.dep_} -> {token.head.text}")


# Named Entity Recognition (NER)
print("Named Entities:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")

# Word Sense Disambiguation (WSD)
wsd_results = {word: lesk(tokens, word) for word in tokens}
print("Word Sense Disambiguation:", {word: synset.definition() if synset else None for word, synset in wsd_results.items()})


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Stemming: ['john', 'met', 'mari', 'in', 'new', 'york', '.', 'he', 'said', 'he', 'would', 'visit', 'her', 'again', 'next', 'monday', '.']
Lemmatization (NLTK): ['John', 'met', 'Mary', 'in', 'New', 'York', '.', 'He', 'said', 'he', 'would', 'visit', 'her', 'again', 'next', 'Monday', '.']
Lemmatization (spaCy): ['John', 'meet', 'Mary', 'in', 'New', 'York', '.', 'he', 'say', 'he', 'would', 'visit', 'she', 'again', 'next', 'Monday', '.']
POS Tagging: [('John', 'NNP'), ('met', 'VBD'), ('Mary', 'NNP'), ('in', 'IN'), ('New', 'NNP'), ('York', 'NNP'), ('.', '.'), ('He', 'PRP'), ('said', 'VBD'), ('he', 'PRP'), ('would', 'MD'), ('visit', 'VB'), ('her', 'PRP$'), ('again', 'RB'), ('next', 'JJ'), ('Monday', 'NNP'), ('.', '.')]
John -> nsubj -> met
met -> ROOT -> met
Mary -> dobj -> met
in -> prep -> met
New -> compound -> York
York -> pobj -> in
. -> punct -> met
He -> nsubj -> said
said -> ROOT -> said
he -> nsubj -> visit
would -> aux -> visit
visit -> ccomp -> said
her -> dobj -> visit
again -> adv

In [13]:
import spacy
from spacy.tokens import Span
from spacy.pipeline import EntityRuler

def extract_entities_relations(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    entities = [(ent.text, ent.label_) for ent in doc.ents]
    relations = []

    for token in doc:
        if token.dep_ in ("nsubj", "dobj", "pobj") and token.head.pos_ in ("VERB", "NOUN"):
            relations.append((token.text, token.dep_, token.head.text))

    return entities, relations

if __name__ == "__main__":
    text = "I just met a shop keeper in Karachi."
    entities, relations = extract_entities_relations(text)

    print("Entities:")
    for entity in entities:
        print(entity)

    print("\nRelations:")
    for relation in relations:
        print(relation)

Entities:
('Karachi', 'GPE')

Relations:
('I', 'nsubj', 'met')
('keeper', 'dobj', 'met')


In [17]:
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

# Sample dataset for demonstration
data = pd.DataFrame({
    'text': [
        "I love this product! It's amazing.",
        "This is the worst experience ever.",
        "I feel okay about this.",
        "Exclusive deal just for you! Buy now!",
        "Hello, hope you are doing well!",
        "Congratulations! You won a lottery. Click here!"
    ],
    'sentiment': ['positive', 'negative', 'neutral', 'spam', 'not spam', 'spam']
})

# Sentiment Analysis
def sentiment_analysis():
    df = data[data['sentiment'].isin(['positive', 'negative', 'neutral'])]
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(df['text'])
    y = df['sentiment']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = MultinomialNB()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print("Sentiment Analysis Accuracy:", accuracy_score(y_test, predictions))

# Topic Modeling
def topic_modeling():
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(data['text'])
    lda = LatentDirichletAllocation(n_components=2, random_state=42)
    lda.fit(X)
    print("Topic Modeling (LDA) Results:")
    for index, topic in enumerate(lda.components_):
        print(f"Topic {index}:", [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-5:]])

# Spam Detection
def spam_detection():
    df = data[data['sentiment'].isin(['spam', 'not spam'])]
    df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'spam' else 0)
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(df['text'])
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = MultinomialNB()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print("Spam Detection Accuracy:", accuracy_score(y_test, predictions))

# Running the functions
spam_detection()

Spam Detection Accuracy: 0.0


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'spam' else 0)
