# NLP MINI PROJECT

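## Title: Feature Extraction using seven moment variants

This notebook extracts seven features from a piece of English text: word frequency, TF-IDF scores, part-of-speech tag distribution, named-entity counts, average sentence length, lexical diversity (type-token ratio), and syntactic parse depth.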

In [None]:
pip install spacy nltk scikit-learn



In [None]:
import spacy
import nltk
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk import ne_chunk
from nltk.tree import Tree
from statistics import mean

In [None]:
# Download required NLTK data files.
# The install cell does not pin NLTK, and NLTK 3.9+ looks resources up under
# new names ("punkt_tab", "averaged_perceptron_tagger_eng",
# "maxent_ne_chunker_tab") while older releases use the original package names.
# Requesting both sets keeps the tokenizer and tagger usable on either version.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

# Load SpaCy's pre-trained model
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
def preprocess_text(text):
    """Normalize ``text`` into content-word tokens and lowercase sentences.

    Args:
        text: Raw input string.

    Returns:
        A ``(tokens, sentences)`` tuple. ``tokens`` is the list of lowercase,
        purely alphabetic word tokens that are not English stopwords.
        ``sentences`` is the list of sentences of ``text``, each lowercased.
    """
    # Split into sentences BEFORE lowercasing: the Punkt sentence tokenizer
    # uses capitalization as one of its cues (e.g. after abbreviations), so
    # lowercasing first can merge sentences. Each sentence is lowercased
    # afterwards so callers still receive lowercase sentences.
    sentences = [sentence.lower() for sentence in sent_tokenize(text)]

    # Tokenize the lowercased text and keep alphabetic, non-stopword tokens.
    stop_words = set(stopwords.words("english"))
    tokens = [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    ]

    return tokens, sentences

def extract_word_frequency(tokens):
    """Count how many times each token occurs.

    Args:
        tokens: Iterable of hashable tokens.

    Returns:
        A ``collections.Counter`` mapping each distinct token to its count.
    """
    return Counter(tokens)

def extract_tfidf(texts):
    """Compute a per-term TF-IDF total over a collection of documents.

    Args:
        texts: Iterable of document strings; each one is treated as a
            separate document by the vectorizer.

    Returns:
        A 1-D array with one entry per vocabulary term of the fitted
        ``TfidfVectorizer``: that term's TF-IDF weight summed over all
        documents.
    """
    weights = TfidfVectorizer().fit_transform(texts)
    # Summing over axis 0 collapses the document dimension; flatten the
    # resulting single-row result into a plain 1-D array.
    column_totals = weights.sum(axis=0)
    return np.asarray(column_totals).ravel()

def extract_pos_tag_distribution(tokens):
    """Compute the relative frequency of each part-of-speech tag.

    Args:
        tokens: List of word tokens to tag with ``nltk.pos_tag``.

    Returns:
        A dict mapping each POS tag that occurs to its share of all tagged
        tokens. Empty when ``tokens`` yields no tags.
    """
    tag_counts = Counter()
    for _, tag in pos_tag(tokens):
        tag_counts[tag] += 1

    n_tagged = sum(tag_counts.values())

    distribution = {}
    for tag, count in tag_counts.items():
        distribution[tag] = count / n_tagged
    return distribution

def extract_named_entities(text):
    """Count the named-entity labels spaCy finds in ``text``.

    Args:
        text: Raw input string, parsed with the module-level ``nlp`` pipeline.

    Returns:
        A ``collections.Counter`` mapping each entity label to the number of
        entities carrying that label.
    """
    label_counts = Counter()
    for ent in nlp(text).ents:
        label_counts[ent.label_] += 1
    return label_counts

def extract_sentence_length_statistics(sentences):
    """Compute the average sentence length in tokens.

    Args:
        sentences: List of sentence strings; each is tokenized with
            ``word_tokenize`` and its token count is taken as its length.

    Returns:
        The mean number of tokens per sentence, or 0 when ``sentences`` is
        empty.
    """
    sentence_lengths = [len(word_tokenize(sentence)) for sentence in sentences]
    # statistics.mean raises StatisticsError on an empty sequence; return 0
    # instead, matching how the other extractors treat empty input.
    return mean(sentence_lengths) if sentence_lengths else 0

def extract_lexical_diversity(tokens):
    """Compute the type-token ratio (distinct tokens / total tokens).

    Args:
        tokens: List of tokens.

    Returns:
        The ratio of unique tokens to total tokens, or 0 when ``tokens`` is
        empty.
    """
    # Guard against division by zero for empty input.
    if len(tokens) > 0:
        return len(set(tokens)) / len(tokens)
    return 0

def extract_syntactic_parse_depth(text):
    """Compute the average dependency-parse depth of the sentences in ``text``.

    The depth of a token is the number of its syntactic ancestors (0 for a
    sentence root). The depth of a sentence is the depth of its deepest token.
    The previous implementation averaged the subtree size of punctuation
    tokens, which does not measure depth.

    Args:
        text: Raw input string, parsed with the module-level ``nlp`` pipeline.

    Returns:
        The mean sentence depth, or 0 when the parsed document contains no
        sentences.
    """
    doc = nlp(text)
    sentence_depths = [
        # default=0 covers a sentence span without tokens.
        max((sum(1 for _ in token.ancestors) for token in sent), default=0)
        for sent in doc.sents
    ]
    avg_parse_depth = mean(sentence_depths) if sentence_depths else 0
    return avg_parse_depth

def extract_features(text):
    """Extract the seven text features for ``text``.

    Args:
        text: Raw input string.

    Returns:
        A dict with the keys ``word_frequency``, ``tfidf_scores``,
        ``pos_distribution``, ``named_entities``, ``avg_sentence_length``,
        ``lexical_diversity`` and ``avg_parse_depth``, each holding the
        result of the corresponding ``extract_*`` helper.
    """
    tokens, sentences = preprocess_text(text)

    features = {
        # Feature 1: Word Frequency
        "word_frequency": extract_word_frequency(tokens),
        # Feature 2: TF-IDF. Each sentence is treated as one document.
        # Fitting on the whole text as a single document makes every term's
        # document frequency identical, so the IDF factor is constant and the
        # scores reduce to normalized term frequency.
        "tfidf_scores": extract_tfidf(sentences),
        # Feature 3: POS Tag Distribution
        "pos_distribution": extract_pos_tag_distribution(tokens),
        # Feature 4: Named Entities (uses the original, case-preserved text)
        "named_entities": extract_named_entities(text),
        # Feature 5: Sentence Length Statistics
        "avg_sentence_length": extract_sentence_length_statistics(sentences),
        # Feature 6: Lexical Diversity (Type-Token Ratio)
        "lexical_diversity": extract_lexical_diversity(tokens),
        # Feature 7: Syntactic Parse Depth
        "avg_parse_depth": extract_syntactic_parse_depth(text),
    }

    return features

In [None]:
# Example usage: run the full feature pipeline on a short sample passage.
text = """
Natural Language Processing (NLP) is a branch of artificial intelligence that focuses on the interaction between computers and human language.
It enables machines to understand, interpret, and generate human language. NLP is used in many applications like chatbots, translation services, and sentiment analysis.
"""

features = extract_features(text)

# Print every extracted feature under its own heading.
for name, value in features.items():
    print(f"\n{name}: \n{value}")


word_frequency: 
Counter({'language': 3, 'nlp': 2, 'human': 2, 'natural': 1, 'processing': 1, 'branch': 1, 'artificial': 1, 'intelligence': 1, 'focuses': 1, 'interaction': 1, 'computers': 1, 'enables': 1, 'machines': 1, 'understand': 1, 'interpret': 1, 'generate': 1, 'used': 1, 'many': 1, 'applications': 1, 'like': 1, 'chatbots': 1, 'translation': 1, 'services': 1, 'sentiment': 1, 'analysis': 1})

tfidf_scores: 
[0.12909944 0.38729833 0.12909944 0.12909944 0.12909944 0.12909944
 0.12909944 0.12909944 0.12909944 0.12909944 0.12909944 0.25819889
 0.12909944 0.12909944 0.12909944 0.12909944 0.25819889 0.12909944
 0.38729833 0.12909944 0.12909944 0.12909944 0.12909944 0.25819889
 0.12909944 0.12909944 0.12909944 0.12909944 0.12909944 0.12909944
 0.12909944 0.12909944 0.12909944 0.12909944 0.12909944]

pos_distribution: 
{'JJ': 0.20689655172413793, 'NN': 0.4827586206896552, 'VBZ': 0.06896551724137931, 'NNS': 0.1724137931034483, 'VBN': 0.034482758620689655, 'IN': 0.034482758620689655}

name

## Completed