# Text Cleaner


In [10]:
import pandas as pd
from typing import Union
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from bs4 import BeautifulSoup
from contractions import contractions_dict


In [11]:
def remove_repetitive_words(dataframe: pd.DataFrame, column: Union[str, int]) -> pd.DataFrame:
    """Drop repeated words from every text in ``column``, keeping first occurrences.

    Each non-missing value is split on whitespace and re-joined with single
    spaces, so runs of whitespace are collapsed as a side effect. Words are
    compared case-sensitively (``"The"`` and ``"the"`` are different words).
    Missing values are passed through untouched.

    Args:
        dataframe: Source frame; it is copied, never modified.
        column: Label of the text column to clean.

    Returns:
        A copy of ``dataframe`` whose ``column`` has duplicate words removed.

    Raises:
        ValueError: If ``column`` is not a column of ``dataframe``.
        AssertionError: If ``pd.api.types.is_string_dtype`` rejects the column.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")
    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped under ``python -O``; the exception type is kept for callers.
    if not pd.api.types.is_string_dtype(dataframe[column]):
        raise AssertionError(f"Column '{column}' is not string type")

    def remove_duplicates(text):
        if pd.isna(text):
            return text
        # dict keys are unique and preserve insertion order, which gives
        # "first occurrence wins" de-duplication in a single pass.
        return ' '.join(dict.fromkeys(text.split()))

    df_copy = dataframe.copy()
    df_copy[column] = df_copy[column].apply(remove_duplicates)
    return df_copy


In [12]:
def remove_stop_words(dataframe: pd.DataFrame, column: Union[str, int]) -> pd.DataFrame:
    """Remove English stop words from every text in ``column``.

    Each non-missing value is split on whitespace; words whose lower-cased
    form is in NLTK's English stop-word list are dropped and the remaining
    words are re-joined with single spaces. Missing values are passed
    through untouched.

    Args:
        dataframe: Source frame; it is copied, never modified.
        column: Label of the text column to clean.

    Returns:
        A copy of ``dataframe`` whose ``column`` has stop words removed.

    Raises:
        ValueError: If ``column`` is not a column of ``dataframe``.
        AssertionError: If ``pd.api.types.is_string_dtype`` rejects the column.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")
    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped under ``python -O``; the exception type is kept for callers.
    if not pd.api.types.is_string_dtype(dataframe[column]):
        raise AssertionError(f"Column '{column}' is not string type")

    # Build the set once: O(1) membership tests for every word.
    stop_words = set(stopwords.words('english'))

    def drop_stop_words(text):
        # Missing values have no ``split``; leave them as they are.
        if pd.isna(text):
            return text
        return ' '.join(word for word in text.split() if word.lower() not in stop_words)

    df_copy = dataframe.copy()
    df_copy[column] = df_copy[column].apply(drop_stop_words)
    return df_copy


In [13]:
def remove_punctuation(dataframe: pd.DataFrame, column: Union[str, int]) -> pd.DataFrame:
    """Strip punctuation from every text in ``column``.

    Every character that is neither a word character (``\\w``) nor
    whitespace (``\\s``) is deleted. Because ``\\w`` includes the
    underscore, underscores are retained.

    Args:
        dataframe: Source frame; it is copied, never modified.
        column: Label of the text column to clean.

    Returns:
        A copy of ``dataframe`` whose ``column`` has punctuation removed.

    Raises:
        ValueError: If ``column`` is not a column of ``dataframe``.
        AssertionError: If ``pd.api.types.is_string_dtype`` rejects the column.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")
    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped under ``python -O``; the exception type is kept for callers.
    if not pd.api.types.is_string_dtype(dataframe[column]):
        raise AssertionError(f"Column '{column}' is not string type")

    df_copy = dataframe.copy()
    df_copy[column] = df_copy[column].str.replace(r'[^\w\s]', '', regex=True)
    return df_copy


In [14]:
def filter_words(
    dataframe: pd.DataFrame,
    column: Union[str, int],
    remove=('fword',),
) -> pd.DataFrame:
    """Remove the given words from every text in ``column``, ignoring case.

    Each non-missing value is split on whitespace; words whose lower-cased
    form equals the lower-cased form of any entry in ``remove`` are dropped
    and the remaining words are re-joined with single spaces. Only whole
    whitespace-delimited tokens are compared, so ``"gun."`` is not removed
    by ``remove=['gun']``. Missing values are passed through untouched.

    Args:
        dataframe: Source frame; it is copied, never modified.
        column: Label of the text column to clean.
        remove: Iterable of words to drop. The default is an immutable
            tuple so no mutable object is shared between calls.

    Returns:
        A copy of ``dataframe`` whose ``column`` has the words removed.

    Raises:
        ValueError: If ``column`` is not a column of ``dataframe``.
        AssertionError: If ``pd.api.types.is_string_dtype`` rejects the column.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")
    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped under ``python -O``; the exception type is kept for callers.
    if not pd.api.types.is_string_dtype(dataframe[column]):
        raise AssertionError(f"Column '{column}' is not string type")

    # Words are lower-cased before lookup, so the lookup set must be
    # lower-cased too; otherwise an entry such as 'Gun' could never match.
    remove_set = {word.lower() for word in remove}

    def drop_unwanted(text):
        # Missing values have no ``split``; leave them as they are.
        if pd.isna(text):
            return text
        return ' '.join(word for word in text.split() if word.lower() not in remove_set)

    df_copy = dataframe.copy()
    df_copy[column] = df_copy[column].apply(drop_unwanted)
    return df_copy


In [15]:
def expand_contractions(dataframe: pd.DataFrame, column: Union[str, int]) -> pd.DataFrame:
    """Replace contractions in every text of ``column`` with their expansions.

    Every occurrence of a key of ``contractions_dict`` is substituted by the
    corresponding value. Matching is case-sensitive and is not anchored to
    word boundaries. Missing values are passed through untouched.

    Args:
        dataframe: Source frame; it is copied, never modified.
        column: Label of the text column to clean.

    Returns:
        A copy of ``dataframe`` whose ``column`` has contractions expanded.

    Raises:
        ValueError: If ``column`` is not a column of ``dataframe``.
        AssertionError: If ``pd.api.types.is_string_dtype`` rejects the column.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")
    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped under ``python -O``; the exception type is kept for callers.
    if not pd.api.types.is_string_dtype(dataframe[column]):
        raise AssertionError(f"Column '{column}' is not string type")

    # Regex alternation takes the first alternative that matches, so longer
    # keys go first to avoid being shadowed by a shorter key that is their
    # prefix. Keys are escaped so any regex metacharacters match literally.
    keys_longest_first = sorted(contractions_dict, key=len, reverse=True)
    contraction_re = re.compile('|'.join(map(re.escape, keys_longest_first)))

    def replace(match):
        return contractions_dict[match.group(0)]

    def expand_text(text):
        # Missing values cannot be searched by the regex; leave them as is.
        if pd.isna(text):
            return text
        return contraction_re.sub(replace, text)

    df_copy = dataframe.copy()
    df_copy[column] = df_copy[column].apply(expand_text)
    return df_copy


In [16]:
def lemmatization(dataframe: pd.DataFrame, column: Union[str, int]) -> pd.DataFrame:
    """Lemmatize every word of every text in ``column``.

    Each non-missing value is split on whitespace, every word is passed to
    ``WordNetLemmatizer.lemmatize`` (without a part-of-speech argument, so
    the lemmatizer's default is used) and the results are re-joined with
    single spaces. Missing values are passed through untouched.

    Args:
        dataframe: Source frame; it is copied, never modified.
        column: Label of the text column to clean.

    Returns:
        A copy of ``dataframe`` whose ``column`` has been lemmatized.

    Raises:
        ValueError: If ``column`` is not a column of ``dataframe``.
        AssertionError: If ``pd.api.types.is_string_dtype`` rejects the column.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")
    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped under ``python -O``; the exception type is kept for callers.
    if not pd.api.types.is_string_dtype(dataframe[column]):
        raise AssertionError(f"Column '{column}' is not string type")

    lemmatizer = WordNetLemmatizer()

    def lemmatize_text(text):
        # Missing values have no ``split``; leave them as they are.
        if pd.isna(text):
            return text
        return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

    df_copy = dataframe.copy()
    df_copy[column] = df_copy[column].apply(lemmatize_text)
    return df_copy


### Test


In [18]:
# Manual smoke test: load the movies dataset and run one cleaner on it.
df = pd.read_csv("dataset/movies.csv")
# errors='coerce' turns unparseable dates into NaT instead of raising.
df['Release Date'] = pd.to_datetime(df['Release Date'], errors='coerce')
# Drop the word 'gun' from every title. filter_words works on a copy, so
# `df` itself is left unchanged; the call is kept as a bare last expression,
# presumably so the notebook displays the returned frame.
filter_words(df, 'Title', ['gun'])


Unnamed: 0,Title,Director,Genre,Release Date,Duration,Rating
0,Key entire popular.,Anthony Becker,Horror,1981-05-12,102,6.8
1,husband reveal.,William Johnson,Documentary,2016-06-13,92,7.6
2,Crime cover.,Amy Le,Drama,1988-03-22,144,5.5
3,Challenge.,Andrea Martinez,Romance,2013-04-01,161,2.0
4,Close study.,Michael Rodgers,Fantasy,2012-10-18,177,3.7
...,...,...,...,...,...,...
29995,Daughter.,Richard Nelson,Romance,2007-03-12,177,8.0
29996,Simply.,Jeffrey Hatfield,Fantasy,2011-08-16,126,5.7
29997,Also authority nor.,Ryan Brown,Action,1998-05-07,73,4.9
29998,Total report upon.,Melissa Stephenson,Comedy,2008-06-06,145,6.9
