In [5]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def preprocess(text, stop_words_removal=True, lemmatize=True, min_word_len=2):
    """
    Returns the preprocessed form of the input `text` as a string.

    Steps, in order:
      1. Every character outside a-z / A-Z is replaced by a space.
      2. The result is lower-cased and split on whitespace.
      3. Words that are not strictly longer than `min_word_len` are dropped.
      4. If `stop_words_removal` is true, words found in NLTK's English
         stop-word list are dropped.
      5. If `lemmatize` is true, each remaining word is passed through
         `WordNetLemmatizer.lemmatize` with its default arguments.

    Parameters:
        text: the raw input string.
        stop_words_removal: whether to drop English stop words.
        lemmatize: whether to lemmatize the surviving words.
        min_word_len: length threshold; only words with MORE than this
            many characters are kept (the default 2 keeps words of 3+).

    Returns:
        The surviving words joined by single spaces ('' if none survive).
    """
    # Strip non-letters BEFORE lower-casing (same order as always, so that
    # non-ASCII characters never get a chance to lower-case into ASCII ones).
    # str.split() already collapses whitespace runs, so no separate
    # "squeeze spaces" pass is needed.
    text_words = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()

    # Honour the caller-supplied threshold instead of a hard-coded 2.
    text_words = [w for w in text_words if len(w) > min_word_len]

    if stop_words_removal:
        # Loaded only when needed; a set gives O(1) membership tests.
        stop_words = set(stopwords.words('english'))
        text_words = [w for w in text_words if w not in stop_words]

    if lemmatize:
        # One lemmatizer instance for the whole text, not one per word.
        lemmatizer = WordNetLemmatizer()
        text_words = [lemmatizer.lemmatize(w) for w in text_words]

    return ' '.join(text_words)


In [8]:
def preprocess_text_feature(df, feature, ngram=2, vectorizer=None):
    """
    Encodes the text column `feature` of `df` as a DataFrame of TF-IDF values.

    Parameters:
        df: DataFrame holding the text column.
        feature: name of the text column in `df`.
        ngram: n-gram size; only n-grams of exactly this many words are used
            (passed as ngram_range=(ngram, ngram)). Ignored when a
            `vectorizer` is supplied.
        vectorizer: an already fitted vectorizer to reuse. When None, a new
            TfidfVectorizer (using `preprocess` as preprocessor, min_df=2,
            at most 300 features) is fitted on the column.

    Returns:
        (transformed_df, vectorizer): `transformed_df` has one row per row of
        `df` (in the same order, with a fresh 0..n-1 index) and one column
        per vocabulary term, labelled with the term; `vectorizer` is the
        fitted vectorizer that produced it, so it can be passed back in to
        encode another frame with the same vocabulary.
    """
    corpus = df[feature]

    if vectorizer is None:
        vectorizer = TfidfVectorizer(
            preprocessor=preprocess,
            min_df=2,
            max_df=len(df),
            ngram_range=(ngram, ngram),
            max_features=300,
        ).fit(corpus)

    transformed_matrix = vectorizer.transform(corpus)

    # `vocabulary_` maps term -> column index. Its iteration order is NOT the
    # column order, so the terms must be ordered by their stored index to
    # label the matrix columns correctly.
    vocabulary = vectorizer.vocabulary_
    feature_names = sorted(vocabulary, key=vocabulary.get)

    # Name the columns at construction time; `set_axis(..., inplace=False)`
    # is rejected by pandas >= 2.0.
    transformed_df = pd.DataFrame(transformed_matrix.toarray(), columns=feature_names)

    return transformed_df, vectorizer
