In [1]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Fetch the NLTK resources used below: WordNet (needed for lemmatization)
# and punkt (needed by the tokenizer).
for resource in ('wordnet', 'punkt'):
    nltk.download(resource)

def lemmatize_text(text):
    """Return ``text`` with every token replaced by its verb lemma.

    The text is split with ``nltk.word_tokenize``; each token is passed to
    ``WordNetLemmatizer.lemmatize`` with ``pos=wordnet.VERB``; the results are
    joined back together with single spaces.
    """
    wnl = WordNetLemmatizer()

    lemmas = []
    for token in nltk.word_tokenize(text):
        # Every token is lemmatized as a verb, whatever its real part of speech.
        lemmas.append(wnl.lemmatize(token, pos=wordnet.VERB))

    return ' '.join(lemmas)

# Example text used to demonstrate lemmatize_text
text = "The dogs are barking loudly outside."

# Lemmatize the text and print the space-joined result
lemmatized_text = lemmatize_text(text)
print(lemmatized_text)


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alexraudvee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alexraudvee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


The dog be bark loudly outside .


In [8]:
from functions import tokenizer, remove_stop_words, remove_upercase, remove_punctuation

import itertools

# Define your functions
def function_1(text):
    """Placeholder step: return ``text`` converted to upper case."""
    uppercased = text.upper()
    return uppercased

def function_2(text):
    """Placeholder step: return ``text`` converted to lower case."""
    lowercased = text.lower()
    return lowercased

# Define more functions (function_3 to function_16) in a similar manner

# Preprocessing steps to combine (add further steps to this list)
function_list = [remove_stop_words, remove_upercase, remove_punctuation]

# Example text
input_text = "This is an example text."

# Try every combination of steps, for all sizes from 1 to len(function_list).
# Within a combination the steps run one after another (not in parallel), in
# the order they appear in function_list, each receiving the previous output.
for r in range(1, len(function_list) + 1):
    for combination in itertools.combinations(function_list, r):
        processed_text = input_text
        for func in combination:
            processed_text = func(processed_text)

        # Tokenize only when the last step still produced a string.
        # isinstance (instead of an exact type comparison) also accepts
        # str subclasses.
        if isinstance(processed_text, str):
            processed_text = tokenizer(processed_text)

        print(f"Processed text with {' -> '.join(f.__name__ for f in combination)}:", processed_text)




Processed text with remove_stop_words: ['example', 'text', '.']
Processed text with remove_upercase: ['this', 'is', 'an', 'example', 'text', '.']
Processed text with remove_punctuation: ['This', 'is', 'an', 'example', 'text']
Processed text with remove_stop_words -> remove_upercase: ['example', 'text', '.']
Processed text with remove_stop_words -> remove_punctuation: ['example', 'text']
Processed text with remove_upercase -> remove_punctuation: ['this', 'is', 'an', 'example', 'text']
Processed text with remove_stop_words -> remove_upercase -> remove_punctuation: ['example', 'text']


In [34]:
import gensim.downloader as api

# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader API.
# NOTE(review): the trailing note claims the model was trained on lower-case words —
# confirm against the model's vocabulary before lower-casing tokens for lookup.
word2vec_model = api.load("word2vec-google-news-300") # model trained on lower case words, use lower case tokens


In [21]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from functions_preprocessing import flow_preprocessing_1
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from functions_preprocessing import remove_emoji, remove_usernames, remove_hashtags, remove_url, tokenizer

# Sample data: three toy documents with binary labels
X = ["This is the first document.", "This document is the second document.", "And this is the third one."]
y = [1, 1, 0]

# Train-test split (fixed random_state keeps the split reproducible)
# NOTE(review): with only three samples, test_size=0.2 presumably leaves a single
# test document, so any score computed on it is not informative — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Custom Word2VecVectorizer
class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    """Represent each tokenized document as the mean of its word embeddings.

    Parameters
    ----------
    word2vec : mapping-like
        Object supporting ``token in word2vec`` and ``word2vec[token]``, where
        indexing yields the embedding vector. If it exposes a ``vector_size``
        attribute, that is used as the embedding width when no document
        contains a known token.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the class fits in a Pipeline."""
        return self

    def transform(self, X):
        """Return an array with one averaged embedding per document.

        ``X`` is an iterable of documents, each an iterable of tokens. A plain
        string is iterated character by character, so tokenize first. Tokens
        missing from ``self.word2vec`` are skipped.

        A document without any known token gets an all-zero vector. Averaging
        an empty list would instead produce a scalar NaN (plus a
        RuntimeWarning) and break the rectangular shape of the result.
        """
        averaged = []
        for document in X:
            # keep only the tokens that are in the model
            known = [self.word2vec[token] for token in document if token in self.word2vec]
            averaged.append(np.mean(known, axis=0) if known else None)

        # Build the zero fallback with the same shape/dtype as a real embedding
        # when one is available, so the stacked array keeps its dtype.
        reference = next((vec for vec in averaged if vec is not None), None)
        if reference is not None:
            fallback = np.zeros_like(reference)
        else:
            fallback = np.zeros(getattr(self.word2vec, "vector_size", 0))

        return np.array([fallback if vec is None else vec for vec in averaged])

# Text Preprocessing Transformer
class TextPreprocessor_flow_1(BaseEstimator, TransformerMixin):
    """Stateless transformer that runs each text through four cleaning helpers.

    The helpers come from ``functions_preprocessing``; judging by their names
    they presumably strip URLs, hashtags, usernames and emoji — verify there.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X):
        """Return a list with ``preprocess_text`` applied to every item of ``X``."""
        cleaned = []
        for text in X:
            cleaned.append(self.preprocess_text(text))
        return cleaned

    def preprocess_text(self, text):
        """Apply the helpers in order: URL, hashtags, usernames, emoji."""
        for step in (remove_url, remove_hashtags, remove_usernames, remove_emoji):
            text = step(text)
        return text

# Create the pipeline: preprocessing -> TF-IDF features -> logistic regression
# NOTE(review): despite its name, this pipeline vectorizes with TfidfVectorizer;
# Word2VecVectorizer is not used here.
word2vec_pipeline = Pipeline([
    ('preprocess', TextPreprocessor_flow_1()),
    ('vectorizer', TfidfVectorizer()),
    ('model', LogisticRegression())
])

# NOTE(review): both keys refer to the same Pipeline object, not to two pipelines.
pipelines = {"pipe": word2vec_pipeline, 'pipe2': word2vec_pipeline}

# Fit on the training split, then print the score on the held-out split
word2vec_pipeline.fit(X_train, y_train)
word2vec_accuracy = word2vec_pipeline.score(X_test, y_test)
print(word2vec_accuracy)

1.0


In [24]:
import csv

# New row to add in each iteration (example): a pipeline name and a list of scores.
# csv.writer stringifies non-string fields, so the nested list is stored as its
# str() form (e.g. "['scores']").
new_people = ['pipe name', ["scores"]]

# File path of the existing CSV file
file_path = "data.csv"

# Append the row to the CSV file. Mode 'a' means every run of this cell adds
# another row; newline='' lets the csv module control line endings.
with open(file_path, 'a', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(new_people)

In [26]:
import pandas as pd 

# Read back the CSV that the append cell writes to and display it.
# NOTE(review): the recorded output has an "Unnamed: 0" column, which usually means a
# saved index column — consider index_col=0; confirm against how the file was created.
df = pd.read_csv('data.csv')
df

Unnamed: 0,pipeline_name,score_data
0,pipe name,['scores']
1,pipe name,['scores']
2,pipe name,['scores']


In [13]:
# Try the preprocessing on a single sentence containing a hashtag.
# NOTE(review): the recorded output is a token list, but the visible preprocess_text
# does not call a tokenizer — the output may come from an older definition; re-run to confirm.
TextPreprocessor_flow_1().preprocess_text(text = "This is the first document#chill.")

['This', 'is', 'the', 'first', 'document', '.']

In [59]:
import pandas as pd

# Load the preprocessed gender dataset from a JSON file in the working directory
preprocessed_data_2 = pd.read_json('gender_df_preprocessed_0')

In [60]:
# Show the name of the second column (used as the label column in the split below)
preprocessed_data_2.columns[1]

'female'

In [61]:
# Extract the texts ('post') and labels ('female') as plain Python lists
X_gender = preprocessed_data_2['post'].tolist()
y_gender = preprocessed_data_2['female'].tolist()

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train_gender, X_test_gender, y_train_gender, y_test_gender = train_test_split(X_gender, y_gender, test_size=0.2, random_state=0)

In [62]:
# Sanity check: features and labels should have equal lengths within each split
len(X_train_gender), len(y_train_gender), len(X_test_gender), len(y_test_gender)

(35708, 35708, 8927, 8927)

In [63]:
# Display the loaded frame (columns seen in the recorded output: 'post', 'female')
preprocessed_data_2

Unnamed: 0,post,female
0,Good on you for being responsible! I know self...,1
1,"must go to the grocery store with their child,...",1
2,"things on her videos, and YouTube took the vid...",1
3,their app. There's also a program called SYNC ...,1
4,"side. If the cops don't take your side, you'll...",1
...,...,...
44630,if smegma kept her kids away just out of spite...,1
44631,PhDs to change the time on my microwave. I did...,1
44632,HiLIARy could even think of doing! I think Car...,1
44633,of the hand is a breeze. It swells after thoug...,1


In [65]:
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): this reload and split repeat the steps of earlier cells verbatim.
preprocessed_data_2 = pd.read_json('gender_df_preprocessed_0')

# Extract the texts ('post') and labels ('female') as plain Python lists
X_gender = preprocessed_data_2['post'].tolist()
y_gender = preprocessed_data_2['female'].tolist()

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train_gender, X_test_gender, y_train_gender, y_test_gender = train_test_split(X_gender, y_gender, test_size=0.2, random_state=0)

# Create the baseline pipeline: TF-IDF features -> logistic regression
_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('model', LogisticRegression())
])

# fit the pipeline on the training split
_pipeline.fit(X_train_gender, y_train_gender)

# test pipeline: score on the held-out split (displayed as the cell output)
_pipeline.score(X_test_gender, y_test_gender)

0.9035510249803965

In [66]:
import spacy
# Load spaCy's "en_core_web_sm" pipeline (the model must be installed separately).
# NOTE(review): `nlp` is not referenced by the code visible in this cell.
nlp = spacy.load("en_core_web_sm")
from urllib.parse import urlparse
import re
import demoji
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

def flow_preprocessing_12(text: str) -> str:
    """Strip URLs, hashtags, @usernames and emoji from ``text``, then Porter-stem it.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The whitespace-split, stemmed words joined by single spaces.

    Notes
    -----
    URLs are removed with a regular expression wherever they occur. Parsing
    the whole text with ``urlparse`` only recognises a URL at the very start;
    for any other text it yields an empty scheme and host, so the replacement
    deleted bare "://" sequences and left the URL in the text.
    """
    # remove url: "<scheme>://..." or "www...." up to the next whitespace
    text = re.sub(r'(?:[a-z][a-z0-9+.\-]*://|www\.)\S+', '', text, flags=re.IGNORECASE)
    # remove hashtags
    text = re.sub(r'#\w+\b', '', text)
    # remove usernames
    text = re.sub(r'@\w+\b', '', text)
    # remove emoji
    text = demoji.replace(text, '')

    porter = PorterStemmer()
    # split() without arguments also collapses the gaps left by the removals
    return " ".join(porter.stem(word) for word in text.split())

In [6]:
import nltk
from nltk import word_tokenize, pos_tag

def unite_noun_neighbors(words, tagger=None):
    """Merge runs of adjacent nouns into single space-joined tokens.

    Parameters
    ----------
    words : list of str
        Tokenized sentence.
    tagger : callable, optional
        Function mapping a token list to ``(token, tag)`` pairs. Defaults to
        ``nltk.pos_tag``. A token counts as a noun when its tag starts with
        ``'NN'``.

    Returns
    -------
    list of str
        The tokens in their original order, with each run of consecutive
        nouns collapsed into one entry (e.g. ``"Soviet Union"``).
    """
    if tagger is None:
        tagger = pos_tag

    unified_tokens = []
    previous_was_noun = False
    for word, tag in tagger(words):
        is_noun = tag.startswith('NN')
        if is_noun and previous_was_noun:
            # Extend the noun group that the previous token started or continued.
            unified_tokens[-1] = f"{unified_tokens[-1]} {word}"
        else:
            unified_tokens.append(word)
        previous_was_noun = is_noun

    # Return the merged tokens; a bare ``return`` here would yield None.
    return unified_tokens

# Example sentence as a list of tokenized words
tokenized_sentence = ["the", "big", "cat", "is", "running", "in", "the", "park", "Soviet", "Union"]

# NOTE(review): the recorded output is a plain space-joined string, which is not what
# printing this function's return value would produce — re-run the cell to refresh it.
unified_tokens = unite_noun_neighbors(tokenized_sentence)
print("Unified Tokens:", unified_tokens)


Unified Tokens: the big cat is running in the park Soviet Union
