In [None]:
import re  # "re" stands for regular expression and is used for pattern matching
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk import pos_tag,ne_chunk
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import spacy
import pandas as pd
from typing import List,Dict,Union
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
import nltk

# Fetch every NLTK resource this notebook relies on up front, so that a fresh
# kernel can run the cells top to bottom. The processing cell further down
# needs all of these, not only the stop-word list. nltk.download() skips
# packages that are already up to date, so re-running this cell is cheap.
NLTK_RESOURCES = (
    'stopwords',                       # stopwords.words('english')
    'punkt_tab',                       # word_tokenize / sent_tokenize
    'wordnet',                         # WordNetLemmatizer
    'averaged_perceptron_tagger_eng',  # pos_tag
    'maxent_ne_chunker_tab',           # ne_chunk
    'words',                           # ne_chunk
    'vader_lexicon',                   # SentimentIntensityAnalyzer
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
class NLPTextProcessor:
    """Text-processing helper that combines NLTK, TextBlob and spaCy.

    Offers cleaning, tokenisation, stop-word removal, stemming,
    lemmatisation, POS tagging, named-entity extraction and sentiment
    scoring, plus a batch helper that scores a CSV file of reviews.
    """

    def __init__(self):
        """Load the stop-word set, stemmer, lemmatizer, sentiment analyzer and spaCy model."""
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.sentiment_analyzer = SentimentIntensityAnalyzer()
        # Pretrained small English pipeline; used for entity recognition.
        self.nlp = spacy.load('en_core_web_sm')

    def clean_text(self, text: str) -> str:
        """Normalise raw text for tokenisation.

        Lower-cases the text, then strips URLs, HTML tags, ASCII
        punctuation and digits, and finally collapses all runs of
        whitespace into single spaces.
        """
        text = text.lower()
        # URLs go first: once punctuation is stripped they no longer match.
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        # HTML tags
        text = re.sub(r'<.*?>', '', text)
        # Punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Numbers
        text = re.sub(r'\d+', '', text)
        # Extra whitespace (also trims both ends)
        return ' '.join(text.split())

    def tokenize_words(self, text: str) -> List[str]:
        """Split text into word tokens with NLTK's word_tokenize."""
        return word_tokenize(text)

    def tokenize_sentences(self, text: str) -> List[str]:
        """Split text into sentences with NLTK's sent_tokenize."""
        return sent_tokenize(text)

    def remove_stopwords(self, tokens: List[str]) -> List[str]:
        """Return the tokens that are not in self.stop_words, order preserved."""
        return [word for word in tokens if word not in self.stop_words]

    def perform_stemming(self, tokens: List[str]) -> List[str]:
        """Stem every token with the Porter stemmer."""
        return [self.stemmer.stem(word) for word in tokens]

    def perform_lemmatization(self, tokens: List[str]) -> List[str]:
        """Lemmatize every token with the WordNet lemmatizer (default POS)."""
        return [self.lemmatizer.lemmatize(word) for word in tokens]

    def get_pos_tags(self, tokens: List[str]) -> List[tuple]:
        """Return NLTK part-of-speech tags as (token, tag) tuples."""
        return pos_tag(tokens)

    def get_named_entities(self, text: str) -> str:
        """Run NLTK named-entity recognition on `text`.

        Pipeline: word tokenisation -> POS tagging -> ne_chunk. The
        resulting chunk tree is returned rendered as a string, matching
        the declared return type. (The tree used to be computed and then
        dropped, so this method always returned None.)
        """
        tokens = word_tokenize(text)
        pos_tags = pos_tag(tokens)
        named_entities = ne_chunk(pos_tags)
        return str(named_entities)

    def get_spacy_entities(self, text: str) -> Dict[str, List[str]]:
        """Group spaCy entities found in `text` by label.

        Only PERSON, ORG, GPE, DATE and PRODUCT are collected; entities
        with any other label are ignored. Every key is always present,
        mapped to a possibly empty list of entity strings.
        """
        doc = self.nlp(text)
        entities = {
            "PERSON": [],
            "ORG": [],
            "GPE": [],
            "DATE": [],
            "PRODUCT": [],
        }
        for ent in doc.ents:
            if ent.label_ in entities:
                entities[ent.label_].append(ent.text)
        return entities

    def get_nltk_sentiment(self, text: str) -> Dict[str, float]:
        """Return the NLTK analyzer's polarity scores for `text`."""
        return self.sentiment_analyzer.polarity_scores(text)

    def get_textblob_sentiment(self, text: str) -> Dict[str, float]:
        """Return TextBlob's 'polarity' and 'subjectivity' for `text`."""
        analysis = TextBlob(text)
        return {
            'polarity': analysis.sentiment.polarity,
            'subjectivity': analysis.sentiment.subjectivity,
        }

    def process_text(self, text: str) -> dict:
        """Run the whole pipeline on one piece of text.

        Tokens, stop-word filtering, lemmas and POS tags are derived from
        the cleaned text; sentence splitting, both sentiment scores and
        both entity extractors run on the original text, because they
        rely on casing and punctuation that cleaning removes.

        Returns a dict with the keys: original_text, cleaned_text, tokens,
        tokens_no_stopwords, lemmas, pos_tags, sentences, nltk_sentiment,
        textblob_sentiment, named_entities, nltk_name_entities.
        """
        cleaned_text = self.clean_text(text)
        words = self.tokenize_words(cleaned_text)
        words_no_stop = self.remove_stopwords(words)
        lemmas = self.perform_lemmatization(words_no_stop)
        pos_tags = self.get_pos_tags(lemmas)
        sentences = self.tokenize_sentences(text)

        nltk_sentiment = self.get_nltk_sentiment(text)
        textblob_sentiment = self.get_textblob_sentiment(text)
        spacy_entities = self.get_spacy_entities(text)
        nltk_name_entities = self.get_named_entities(text)
        return {
            "original_text": text,
            "cleaned_text": cleaned_text,
            "tokens": words,
            "tokens_no_stopwords": words_no_stop,
            "lemmas": lemmas,
            "pos_tags": pos_tags,
            "sentences": sentences,
            "nltk_sentiment": nltk_sentiment,
            "textblob_sentiment": textblob_sentiment,
            "named_entities": spacy_entities,
            'nltk_name_entities': nltk_name_entities,
        }

    def label_sentiment(self, compound: float) -> str:
        """Map a compound score to 'positive' (>= 0.05), 'negative' (<= -0.05) or 'neutral'."""
        if compound >= 0.05:
            return "positive"
        if compound <= -0.05:
            return "negative"
        return "neutral"

    def analyze_reviews(self, filepath: str) -> pd.DataFrame:
        """Score every review in a CSV file and return one result row per review.

        The CSV must have a 'review' column. An optional 'label' column
        holds the true sentiment; when at least one row carries a
        non-empty label, a classification report and confusion matrix
        for the labelled rows are printed.

        Args:
            filepath: Passed straight to pandas.read_csv.

        Returns:
            DataFrame with the columns review, nlkt_compound,
            nltk_sentiment, entities and textblob_polarity. An input
            without rows yields an empty frame with the same columns.

        Raises:
            ValueError: If the CSV has no 'review' column.
        """
        df = pd.read_csv(filepath)
        if 'review' not in df.columns:
            raise ValueError("csv file muust contain a review column")

        # Column names (spelling included) are kept as-is so existing
        # consumers of the returned frame keep working.
        result_columns = [
            'review',
            'nlkt_compound',
            'nltk_sentiment',
            'entities',
            'textblob_polarity',
        ]

        reviews = df['review'].tolist()
        # Labels are optional; without the column every row is unlabelled.
        if 'label' in df.columns:
            raw_labels = df['label'].tolist()
        else:
            raw_labels = [None] * len(reviews)

        results = []
        true_labels = []
        predicted_labels = []
        for raw_review, raw_label in zip(reviews, raw_labels):
            review = str(raw_review)
            processed = self.process_text(review)
            # Full dict of polarity scores; only its compound value picks the label.
            scores = processed['nltk_sentiment']
            predicted_label = self.label_sentiment(scores['compound'])
            results.append({
                'review': review,
                'nlkt_compound': scores,
                'nltk_sentiment': predicted_label,
                'entities': processed['named_entities'],
                'textblob_polarity': processed['textblob_sentiment'],
            })

            # Missing labels (absent column or empty cell) are skipped, and
            # the prediction is recorded together with its true label so
            # both lists always have the same length for the metrics below.
            if pd.isna(raw_label):
                continue
            true_label = str(raw_label).lower().strip()
            if true_label:
                true_labels.append(true_label)
                predicted_labels.append(predicted_label)

        # Report once, after every row has been processed, and only when
        # there is something to evaluate against.
        if true_labels:
            print(classification_report(true_labels, predicted_labels))
            print(confusion_matrix(true_labels, predicted_labels))
        return pd.DataFrame(results, columns=result_columns)



if __name__=="__main__":
   # Demo: run the whole pipeline on a short sample, then score the review CSV.
   processor=NLPTextProcessor()
   sample_text = """
    Natural language processing (NLP) is a subfield of linguistics, computer science,
    and artificial intelligence concerned with the interactions between computers and human language.
    Apple Inc. is an American multinational technology company headquartered in Cupertino, California.
    I love NLP! It's amazing how computers can understand human language.
    """
   processed_data=processor.process_text(sample_text)

   # (heading, result key) pairs, printed in this order.
   report_sections=(
      ("Original Text:","original_text"),
      ("\nCleaned Text:","cleaned_text"),
      ("\nLemmatized Tokens:","lemmas"),
      ("\nPOS Tags:","pos_tags"),
      ("\nSentiment Analysis (NLTK):","nltk_sentiment"),
      ("\nNamed Entities (SpaCy):","named_entities"),
   )
   for heading,result_key in report_sections:
      print(heading)
      print(processed_data[result_key])

   # Batch-score the reviews file and show the per-review results.
   fetched_df=processor.analyze_reviews("sample_reviews.csv")
   print(fetched_df)




Original Text:

    Natural language processing (NLP) is a subfield of linguistics, computer science,
    and artificial intelligence concerned with the interactions between computers and human language.
    Apple Inc. is an American multinational technology company headquartered in Cupertino, California.
    I love NLP! It's amazing how computers can understand human language.
    

Cleaned Text:
natural language processing nlp is a subfield of linguistics computer science and artificial intelligence concerned with the interactions between computers and human language apple inc is an american multinational technology company headquartered in cupertino california i love nlp its amazing how computers can understand human language

Lemmatized Tokens:
['natural', 'language', 'processing', 'nlp', 'subfield', 'linguistics', 'computer', 'science', 'artificial', 'intelligence', 'concerned', 'interaction', 'computer', 'human', 'language', 'apple', 'inc', 'american', 'multinational', 'technolog



In [None]:
import nltk
# Lexicon data used by NLTK's SentimentIntensityAnalyzer; it has to be
# available before NLPTextProcessor is instantiated.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
import nltk
# Tokenizer data used by word_tokenize and sent_tokenize; it has to be
# available before any text is tokenized.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import nltk
# WordNet corpus used by WordNetLemmatizer; it has to be available before
# tokens are lemmatized.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import nltk
# English tagger model used by pos_tag; it has to be available before
# part-of-speech tagging runs.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
import nltk
# Chunker model used by ne_chunk; it has to be available before NLTK
# named-entity recognition runs.
nltk.download('maxent_ne_chunker_tab')

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.


True

In [None]:
import nltk
# Word-list corpus; presumably fetched as a companion resource for ne_chunk
# (it was downloaded right after the chunker model) - confirm.
nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True