In [11]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Bidirectional, Dropout
from transformers import pipeline
import spacy
import spacy.cli
import nltk
from nltk.corpus import stopwords

# Download three NLTK resources at import time: 'stopwords' (read a few
# lines below), 'punkt' and 'averaged_perceptron_tagger'.
# NOTE(review): 'punkt' is presumably needed for TextBlob sentence splitting
# and the tagger for nltk.pos_tag further down — confirm.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load the small English spaCy model. If spacy.load raises OSError, download
# the model once and load it again.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Model 'en_core_web_sm' not found. Downloading it now...")
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Module-level VADER analyzer, created once.
analyzer = SentimentIntensityAnalyzer()

# NLTK's English stopword list, stored as a set.
stop_words = set(stopwords.words('english'))

# Load the data
def load_data(file_name):
    """Read the CSV file at ``file_name`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_name)
    return frame

# Tokenizer for Neural Network Methods
def tokenize_and_pad(texts, max_len=128, vocab_size=10000):
    """Fit a Keras ``Tokenizer`` on ``texts`` and encode them as fixed-length sequences.

    Args:
        texts: Collection of strings; used both to fit the tokenizer and to
            produce the encoded sequences.
        max_len: Target length passed to ``pad_sequences`` as ``maxlen``.
            Padding and truncation are both applied at the end ('post').
        vocab_size: Passed to the tokenizer as ``num_words``.

    Returns:
        A tuple ``(padded_sequences, tokenizer)`` holding the output of
        ``pad_sequences`` and the fitted tokenizer.
    """
    fitted_tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
    fitted_tokenizer.fit_on_texts(texts)
    encoded = pad_sequences(
        fitted_tokenizer.texts_to_sequences(texts),
        maxlen=max_len,
        padding='post',
        truncating='post',
    )
    return encoded, fitted_tokenizer

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _approx_syllables(word):
    """Estimate the syllables in ``word`` as its number of vowel groups (at least 1).

    This is a heuristic (silent vowels are still counted), but unlike a raw
    character count it keeps the Flesch formula in its intended range.
    """
    groups = 0
    previous_was_vowel = False
    for char in word.lower():
        is_vowel = char in "aeiouy"
        if is_vowel and not previous_was_vowel:
            groups += 1
        previous_was_vowel = is_vowel
    return max(groups, 1)


# Extract NLP features
def generate_advanced_nlp_features(df):
    """Add hand-crafted lexical, syntactic and sentiment columns to ``df``.

    Reads the ``'text'`` column and writes these columns, in this order:
    ``lexical_diversity``, ``readability_score``, ``avg_sentence_length``,
    ``sentiment_score``, ``sentiment_subjectivity``, ``negation_count``,
    ``tfidf_top_keyword_sum``, ``action_word_count``, ``feature_mentions``,
    ``adjective_ratio``, ``lda_topic_similarity``, ``noun_verb_ratio``,
    ``cosine_similarity_to_template``, ``syntactic_complexity``,
    ``vader_sentiment``, ``person_count``, ``organization_count``,
    ``location_count`` and ``bigram_top_keyword_sum``.

    Missing texts (NaN) are treated as empty strings, and every per-word or
    per-sentence ratio evaluates to 0.0 for an empty text instead of raising
    ``ZeroDivisionError`` or producing NaN.

    Args:
        df: DataFrame with a ``'text'`` column. It is modified in place.

    Returns:
        The same DataFrame object, with the feature columns added.

    Raises:
        KeyError: If ``df`` has no ``'text'`` column.
    """
    # Coerce to str so a missing review is handled as empty text rather than
    # crashing on ``.split()``; the 'text' column itself is left untouched.
    texts = df['text'].fillna('').astype(str).tolist()
    tokens_per_text = [text.split() for text in texts]

    # Parse every text with TextBlob exactly once and keep only what the
    # features below need (sentence lengths in words, polarity, subjectivity).
    sentence_lengths = []
    polarities = []
    subjectivities = []
    for text in texts:
        blob = TextBlob(text)
        sentence_lengths.append([len(sentence.split()) for sentence in blob.sentences])
        sentiment = blob.sentiment
        polarities.append(sentiment.polarity)
        subjectivities.append(sentiment.subjectivity)

    # POS-tag every text exactly once; tagging is the expensive step and the
    # verb, adjective and noun counts all derive from the same tags.
    verb_counts = []
    adjective_counts = []
    noun_counts = []
    for tokens in tokens_per_text:
        tags = [tag for _, tag in nltk.pos_tag(tokens)]
        verb_counts.append(sum(tag.startswith('VB') for tag in tags))
        adjective_counts.append(sum(tag.startswith('JJ') for tag in tags))
        noun_counts.append(sum(tag.startswith('NN') for tag in tags))

    # Feature 1: Lexical Diversity (unique tokens / total tokens)
    df['lexical_diversity'] = [
        _safe_ratio(len(set(tokens)), len(tokens)) for tokens in tokens_per_text
    ]

    # Feature 2: Readability Score (Flesch Reading Ease)
    def flesch_reading_ease(tokens, lengths):
        total_words = len(tokens)
        total_sentences = len(lengths)
        if total_sentences == 0 or total_words == 0:
            return 0.0
        syllables = sum(_approx_syllables(word) for word in tokens)
        return 206.835 - 1.015 * (total_words / total_sentences) - 84.6 * (syllables / total_words)

    df['readability_score'] = [
        flesch_reading_ease(tokens, lengths)
        for tokens, lengths in zip(tokens_per_text, sentence_lengths)
    ]

    # Feature 3: Average Sentence Length (0.0 when no sentence was found)
    df['avg_sentence_length'] = [
        float(np.mean(lengths)) if lengths else 0.0 for lengths in sentence_lengths
    ]

    # Feature 4 & 5: Sentiment Polarity & Subjectivity
    df['sentiment_score'] = polarities
    df['sentiment_subjectivity'] = subjectivities

    # Feature 6: Presence of Negation Words
    negation_words = {"not", "no", "never", "none"}
    df['negation_count'] = [
        sum(word in negation_words for word in text.lower().split()) for text in texts
    ]

    # Feature 7: Per-document sum of TF-IDF weights over the 5 retained terms
    vectorizer = TfidfVectorizer(max_features=5)
    tfidf_matrix = vectorizer.fit_transform(texts)
    df['tfidf_top_keyword_sum'] = np.asarray(tfidf_matrix.sum(axis=1)).ravel()

    # Feature 8: Count of Action Words (Verbs)
    df['action_word_count'] = verb_counts

    # Feature 9: Mentions of a fixed set of domain keywords (plain token match)
    features = {"movie", "actor", "entertainment", "studio", "award"}
    df['feature_mentions'] = [
        sum(word in features for word in text.lower().split()) for text in texts
    ]

    # Feature 10: Share of tokens tagged as adjectives
    df['adjective_ratio'] = [
        _safe_ratio(adjectives, len(tokens))
        for adjectives, tokens in zip(adjective_counts, tokens_per_text)
    ]

    # Feature 11: Topic Modeling via LDA (weight of the dominant topic)
    lda = LDA(n_components=3, random_state=42)
    lda_features = lda.fit_transform(tfidf_matrix)
    df['lda_topic_similarity'] = lda_features.max(axis=1)

    # Feature 12: Noun-Verb Ratio (the +1 keeps the denominator non-zero)
    df['noun_verb_ratio'] = [
        nouns / (1 + verbs) for nouns, verbs in zip(noun_counts, verb_counts)
    ]

    # Feature 13: Similarity to Template (dot product of the TF-IDF vectors)
    template = "This is a good movie with lots of entertainment value."
    vector_template = vectorizer.transform([template])
    df['cosine_similarity_to_template'] = (tfidf_matrix @ vector_template.T).toarray().flatten()

    # Feature 14: Syntactic Complexity (sentences longer than 10 words vs the rest)
    def syntactic_complexity(lengths):
        compound_count = sum(length > 10 for length in lengths)
        simple_count = len(lengths) - compound_count
        return compound_count / (1 + simple_count)

    df['syntactic_complexity'] = [syntactic_complexity(lengths) for lengths in sentence_lengths]

    # Feature 15: Sentiment Scores with VADER
    df['vader_sentiment'] = [analyzer.polarity_scores(text)['compound'] for text in texts]

    # Feature 16: Named Entity Counts with spaCy. Each text goes through the
    # spaCy pipeline once; all three counts come from that single parse.
    entity_labels = ('PERSON', 'ORG', 'GPE')
    entity_totals = {label: [] for label in entity_labels}
    for doc in nlp.pipe(texts):
        found_labels = [ent.label_ for ent in doc.ents]
        for label in entity_labels:
            entity_totals[label].append(found_labels.count(label))

    df['person_count'] = entity_totals['PERSON']
    df['organization_count'] = entity_totals['ORG']
    df['location_count'] = entity_totals['GPE']

    # Feature 17: Bi-Gram Frequency (count of the 5 retained bigrams per text)
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=5)
    bigram_matrix = bigram_vectorizer.fit_transform(texts)
    df['bigram_top_keyword_sum'] = np.asarray(bigram_matrix.sum(axis=1)).ravel()

    return df

def _untrained_recurrent_scores(recurrent_layer, embedding_matrix, padded_sequences):
    """Score sequences with a frozen-embedding model built around ``recurrent_layer``.

    The model is compiled but never fitted, so the scores come from the
    layers' initial weights on top of the supplied embedding matrix.
    """
    vocab_size, embedding_dim = embedding_matrix.shape
    model = Sequential([
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=False,
        ),
        recurrent_layer,
        Dense(32, activation='relu'),
        Dense(1, activation='linear'),
    ])
    model.compile(optimizer='adam', loss='mse')
    return model.predict(padded_sequences).flatten()


# Generate Neural Network-based NLP features
def generate_nn_nlp_features(df):
    """Add neural-network-derived feature columns to ``df``.

    Reads the ``'text'`` column and writes ``lstm_features``,
    ``bert_embedding_mean``, ``cosine_similarity_template`` and
    ``rnn_complexity_score``. Missing texts (NaN) are treated as empty
    strings.

    If ``glove.6B.50d.txt`` cannot be opened in the working directory, a
    message is printed and ``df`` is returned without any of these columns.

    Args:
        df: DataFrame with a ``'text'`` column. It is modified in place.

    Returns:
        The same DataFrame object.
    """
    # Coerce to str so a missing review does not crash the tokenizers.
    texts = df['text'].fillna('').astype(str).tolist()

    # Feature 18: Word Embeddings (using pre-trained GloVe embeddings)
    max_len = 100
    vocab_size = 10000
    embedding_dim = 50  # must match the dimensionality of the GloVe file below
    padded_sequences, tokenizer = tokenize_and_pad(texts, max_len, vocab_size)

    # Parse the GloVe file: each line is a word followed by its coefficients.
    embeddings_index = {}
    try:
        with open('glove.6B.50d.txt', 'r', encoding='utf8') as f:
            for line in f:
                word, *coefficients = line.split()
                embeddings_index[word] = np.asarray(coefficients, dtype='float32')
    except FileNotFoundError:
        print("GloVe embeddings file not found. Please ensure 'glove.6B.50d.txt' is available.")
        return df

    # Rows for words without a GloVe vector (or beyond vocab_size) stay zero.
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, index in tokenizer.word_index.items():
        if index < vocab_size:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[index] = embedding_vector

    # Feature 19: Sentence score via Bidirectional LSTM
    lstm_features = _untrained_recurrent_scores(
        Bidirectional(LSTM(64)), embedding_matrix, padded_sequences
    )

    # Feature 20: Sentence Embeddings via BERT Pipeline with Truncation.
    # tf.test.is_gpu_available() is deprecated; list_physical_devices is the
    # documented replacement.
    use_gpu = bool(tf.config.list_physical_devices('GPU'))
    bert_embedder = pipeline(
        "feature-extraction",
        model="bert-base-uncased",
        tokenizer="bert-base-uncased",
        device=0 if use_gpu else -1,
    )

    def bert_embeddings(batch):
        # Texts are embedded one at a time, so no padding is requested:
        # averaging over [PAD] positions would dilute the mean for short
        # texts. Truncation caps the input at the tokenizer's maximum length.
        return np.array([
            np.mean(bert_embedder(text, truncation=True)[0], axis=0) for text in batch
        ])

    bert_embeds = bert_embeddings(texts)

    # Feature 21: Cosine Similarity between BERT Embeddings and Template Embedding
    # NOTE(review): this template describes a house, while the TF-IDF template
    # in generate_advanced_nlp_features describes a movie — confirm which
    # domain is intended before relying on this feature.
    template = "This is a comfortable, modern house with plenty of amenities and spacious rooms."
    template_embed = bert_embeddings([template])[0]
    cosine_similarities = cosine_similarity(bert_embeds, template_embed.reshape(1, -1)).ravel()

    # Feature 22: RNN-Based Sentence Complexity (using GRU)
    complexity_scores = _untrained_recurrent_scores(GRU(64), embedding_matrix, padded_sequences)

    # Add features to DataFrame
    df['lstm_features'] = lstm_features
    df['bert_embedding_mean'] = bert_embeds.mean(axis=1)
    df['cosine_similarity_template'] = cosine_similarities
    df['rnn_complexity_score'] = complexity_scores

    return df

# Main function to load data, process it, and save the output
def main(input_file):
    """Run both feature generators on a CSV and write the enriched copy next to it.

    Args:
        input_file: Path of the CSV to load. The output path is the same path
            with its extension replaced by ``_with_new_NLP_features.csv``.
    """
    enriched = generate_nn_nlp_features(
        generate_advanced_nlp_features(load_data(input_file))
    )

    # Derive the output name from the input path minus its extension.
    stem, _extension = os.path.splitext(input_file)
    output_file = f"{stem}_with_new_NLP_features.csv"

    enriched.to_csv(output_file, index=False)
    print(f"The updated data file with advanced NLP features has been saved as '{output_file}'.")

# Example usage: run the full pipeline on the IMDB CSV in the working directory.
main(input_file="IMDB Dataset.csv")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michaelrivera/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michaelrivera/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/michaelrivera/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


8/8 ━━━━━━━━━━━━━━━━━━━━ 2s 194ms/step
8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 19ms/step
The updated data file with advanced NLP features has been saved as 'IMDB Dataset_with_new_NLP_features.csv'.
