In [1]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Download WordNet data (needed for lemmatization) and the Punkt data needed by the tokenizer.
# Recent NLTK releases (3.9+) load 'punkt_tab' inside word_tokenize while older ones
# load 'punkt', so both are fetched to keep this cell working on a fresh environment.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

def lemmatize_text(text):
    """Return ``text`` with every token replaced by its verb lemma.

    The input is split with ``nltk.word_tokenize``; each token is passed to
    ``WordNetLemmatizer.lemmatize`` with ``pos=wordnet.VERB``; the results are
    joined back together with single spaces.
    """
    wnl = WordNetLemmatizer()

    # Tokenize lazily and lemmatize each token as a verb while joining
    return ' '.join(
        wnl.lemmatize(token, pos=wordnet.VERB)
        for token in nltk.word_tokenize(text)
    )

# Sample sentence used to demonstrate the lemmatizer
text = "The dogs are barking loudly outside."

# Run the sample through lemmatize_text and show the resulting string
lemmatized_text = lemmatize_text(text=text)
print(lemmatized_text)


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alexraudvee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alexraudvee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


The dog be bark loudly outside .


In [8]:
from functions import tokenizer, remove_stop_words, remove_upercase, remove_punctuation

import itertools

# Define your functions
def function_1(text):
    """Return ``text`` converted to upper case."""
    uppercased = text.upper()
    return uppercased

def function_2(text):
    """Return ``text`` converted to lower case."""
    lowercased = text.lower()
    return lowercased

# Define more functions (function_3 to function_16) in a similar manner

# Preprocessing steps to be combined
function_list = [remove_stop_words, remove_upercase, remove_punctuation]  # Add all your functions here

# Sentence every combination is applied to
input_text = "This is an example text."

# Enumerate every non-empty combination of the steps, smallest combinations first.
# Within one combination the steps are chained: each receives the previous step's output.
subset_sizes = range(1, len(function_list) + 1)
all_combinations = itertools.chain.from_iterable(
    itertools.combinations(function_list, size) for size in subset_sizes
)

for combination in all_combinations:
    processed_text = input_text
    for func in combination:
        processed_text = func(processed_text)

    # A result that is still a plain string is tokenized before printing
    if type(processed_text) is str:
        processed_text = tokenizer(processed_text)

    print(f"Processed text with {' -> '.join(f.__name__ for f in combination)}:", processed_text)




Processed text with remove_stop_words: ['example', 'text', '.']
Processed text with remove_upercase: ['this', 'is', 'an', 'example', 'text', '.']
Processed text with remove_punctuation: ['This', 'is', 'an', 'example', 'text']
Processed text with remove_stop_words -> remove_upercase: ['example', 'text', '.']
Processed text with remove_stop_words -> remove_punctuation: ['example', 'text']
Processed text with remove_upercase -> remove_punctuation: ['this', 'is', 'an', 'example', 'text']
Processed text with remove_stop_words -> remove_upercase -> remove_punctuation: ['example', 'text']


In [2]:
import gensim.downloader as api

# Load the pretrained vectors through gensim's downloader.
# Author's note: model trained on lower case words, use lower case tokens
model_name = "word2vec-google-news-300"
word2vec_model = api.load(model_name)


In [11]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from functions_preprocessing import flow_preprocessing_1
from sklearn.linear_model import LogisticRegression

# Toy corpus: three documents with one label each
X = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
]
y = [1, 1, 0]

# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Custom Word2VecVectorizer
class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    """Represent each document as the mean of its tokens' word embeddings.

    Parameters
    ----------
    word2vec : mapping-like
        Embedding lookup that supports ``token in word2vec`` and
        ``word2vec[token]``. If it exposes a ``vector_size`` attribute, that
        value is used as the embedding dimension for documents that contain no
        known token.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return one averaged embedding per document in ``X``.

        Each document may be an iterable of tokens or a single string. A string
        is split on whitespace first; iterating it directly would yield
        characters and average character embeddings instead of word embeddings.
        Tokens missing from ``self.word2vec`` are ignored. A document with no
        known token is mapped to a zero vector rather than NaN.
        """
        token_lists = [
            document.split() if isinstance(document, str) else document
            for document in X
        ]

        # Collect the vectors of the tokens that are in the model, per document
        vectors_per_document = [
            [self.word2vec[token] for token in tokens if token in self.word2vec]
            for tokens in token_lists
        ]

        # Dimension for the zero-vector fallback: prefer the model's own
        # attribute, otherwise infer it from any vector found in this batch.
        dimension = getattr(self.word2vec, "vector_size", None)
        if dimension is None:
            dimension = next(
                (len(vectors[0]) for vectors in vectors_per_document if vectors),
                0,
            )

        document_embeddings = [
            np.mean(vectors, axis=0) if vectors else np.zeros(dimension)
            for vectors in vectors_per_document
        ]

        return np.array(document_embeddings)

# Text Preprocessing Transformer
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless transformer that preprocesses each text with ``flow_preprocessing_1``.

    The tokens returned for a text are joined with single spaces, so
    ``transform`` yields a list containing one string per input text.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def preprocess_text(self, text):
        """Run ``flow_preprocessing_1`` on one text and space-join its tokens."""
        return " ".join(flow_preprocessing_1(text=text))

    def transform(self, X):
        """Apply ``preprocess_text`` to every element of ``X``."""
        return list(map(self.preprocess_text, X))

# Create the pipeline: preprocess text -> average word embeddings -> classify
word2vec_pipeline = Pipeline([
    ('preprocess', TextPreprocessor()),
    ('vectorizer', Word2VecVectorizer(word2vec_model)),
    # Seeded so repeated runs build the same forest (same seed as the train/test split)
    ('model', RandomForestClassifier(random_state=42))
])

# NOTE: both keys refer to the same Pipeline object, so fitting one entry fits the other.
pipelines = {"pipe": word2vec_pipeline, 'pipe2': word2vec_pipeline}

# Fit the pipeline on the training split, then report accuracy on the held-out split
pipelines['pipe2'].fit(X_train, y_train)
word2vec_accuracy = word2vec_pipeline.score(X_test, y_test)
print(word2vec_accuracy)

0.0


KeyboardInterrupt: 