In [25]:
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import spacy

In [26]:
# Load the raw training sheet into the working frame.
df = pd.read_excel(io='./data/Task-2/train.xlsx')

In [34]:
# Per-process caches: both lookups are expensive and were previously repeated
# for every single row passed through clean_string.
_STOPWORD_SETS = {}   # language -> frozenset of stop words
_SPACY_MODELS = {}    # model name -> loaded spaCy pipeline


def _stop_words(language="english"):
    """Return the NLTK stop words for `language` plus 'hi'/'im', as a cached set."""
    if language not in _STOPWORD_SETS:
        words = nltk.corpus.stopwords.words(language)
        _STOPWORD_SETS[language] = frozenset(words) | {'hi', 'im'}
    return _STOPWORD_SETS[language]


def _spacy_model(name='en_core_web_sm'):
    """Return the spaCy pipeline `name`, loading it only on first use."""
    if name not in _SPACY_MODELS:
        _SPACY_MODELS[name] = spacy.load(name)
    return _SPACY_MODELS[name]


def clean_string(text, stem="None"):
    """Normalise one piece of text into a space-separated string of tokens.

    Steps, in order: lower-case, turn line breaks into spaces, strip ASCII
    punctuation, drop stop words, drop any token containing a digit, then
    optionally stem or lemmatize.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and NaN (as produced by pandas for empty cells)
        are treated as empty text; any other non-string is passed through
        ``str()`` first.
    stem : str, default "None"
        ``'Stem'`` -> NLTK ``PorterStemmer``; ``'Lem'`` -> NLTK
        ``WordNetLemmatizer``; ``'Spacy'`` -> lemmas from the
        ``en_core_web_sm`` pipeline; any other value leaves tokens as they are.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (``""`` if nothing is left).
    """
    # Missing cells arrive as None / float NaN; NaN is the only value != itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # Make lower
    text = text.lower()

    # Replace line breaks with a space so the words on either side of the
    # break are not fused into a single token.
    text = text.replace('\n', ' ')

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove stop words (set membership instead of scanning a list per word)
    useless_words = _stop_words("english")
    tokens = [word for word in text.split() if word not in useless_words]

    # Remove numbers: any run of word characters containing a digit is cut
    # out; tokens that end up empty are dropped so they do not leave stray
    # double/trailing spaces in the result.
    tokens = [re.sub(r'\w*\d\w*', '', word) for word in tokens]
    tokens = [word for word in tokens if word]

    # Stem or Lemmatize
    if stem == 'Stem':
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
    elif stem == 'Lem':
        lem = WordNetLemmatizer()
        tokens = [lem.lemmatize(word) for word in tokens]
    elif stem == 'Spacy':
        doc = _spacy_model('en_core_web_sm')(' '.join(tokens))
        tokens = [token.lemma_ for token in doc]

    return ' '.join(tokens)

def clean_pandas(df, colunm_name='text', stem="Stem"):
    """Clean one text column of ``df`` in place and return the same frame.

    Every value of ``df[colunm_name]`` is passed through ``clean_string``
    with the given ``stem`` mode, and the column is overwritten with the
    results. The frame passed in is modified; no copy is made.
    """
    cleaned_column = df[colunm_name].apply(clean_string, stem=stem)
    df[colunm_name] = cleaned_column
    return df

In [35]:
# Reload the raw sheet, stem its text column, then save text + label.
df = clean_pandas(
    pd.read_excel('./data/Task-2/train.xlsx'),
    colunm_name='text',
    stem="Stem",
)
df.to_excel('./data/Task-2/train_processed_stem.xlsx', columns=['text', 'label'])

In [28]:
# Start from the raw sheet: when the notebook is run top to bottom, `df` has
# already been cleaned by an earlier cell, and stemming text a second time
# can alter tokens again. Reloading keeps this cell idempotent.
df = pd.read_excel('./data/Task-2/train.xlsx')
df['text'] = df['text'].apply(clean_string, stem='Stem')

In [29]:
# Write only the cleaned text and its label to the processed workbook.
df.to_excel(
    './data/Task-2/train_processed_stem.xlsx',
    columns=['text', 'label'],
)

In [31]:
# Display the cleaned frame to eyeball the stemmed text next to its label.
df

Unnamed: 0,text,label
0,fiskar strong portfolio intern brand includ fi...,1
1,metalszinc surg pct glencor cut output fuell m...,1
2,accord scanfil demand telecommun network produ...,-1
3,db launch new bank api develop platform india ...,1
4,theodosopoulo say tellab could valu nokia siem...,1
...,...,...
4333,airvana umt home base station femto cell use s...,1
4334,malton net profit jump four time gain revok deal,1
4335,accord ceo kai telann compani newspap achiev g...,1
4336,addit cramo peab sign exclus fiveyear rental a...,1


In [32]:
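# One-time setup: on a fresh environment, uncomment and run these before the
# cleaning cells. clean_string needs the NLTK stop-word list, and the 'Lem'
# mode needs the WordNet data.
# nltk.download('stopwords')
# nltk.download('wordnet')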

In [None]:
import re
import string
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
from sklearn.base import BaseEstimator, TransformerMixin

class CleanText(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that normalises raw text.

    Each input string is lower-cased, has line breaks turned into spaces and
    ASCII punctuation stripped, loses stop words and digit-bearing tokens,
    and is then optionally stemmed or lemmatized.

    Parameters
    ----------
    stem : str, default "None"
        ``'Stem'`` -> NLTK ``PorterStemmer``; ``'Lem'`` -> NLTK
        ``WordNetLemmatizer``; ``'Spacy'`` -> lemmas from the
        ``en_core_web_sm`` pipeline; any other value leaves tokens as they are.
    """

    # Class-level caches: kept off the instance so they are never part of
    # get_params()/clone()/pickling, and so the expensive lookups happen once
    # per process instead of once per document.
    _stop_word_set = None   # frozenset of stop words, filled on first use
    _spacy_models = {}      # model name -> loaded spaCy pipeline

    def __init__(self, stem="None"):
        self.stem = stem

    @classmethod
    def _stop_words(cls):
        """Return the English NLTK stop words plus 'hi'/'im' as a cached set."""
        if cls._stop_word_set is None:
            words = nltk.corpus.stopwords.words("english")
            cls._stop_word_set = frozenset(words) | {'hi', 'im'}
        return cls._stop_word_set

    @classmethod
    def _spacy_model(cls, name='en_core_web_sm'):
        """Return the spaCy pipeline `name`, loading it only on first use."""
        if name not in cls._spacy_models:
            cls._spacy_models[name] = spacy.load(name)
        return cls._spacy_models[name]

    def clean_string(self, text):
        """Clean a single document and return its tokens joined by spaces.

        ``None`` and NaN are treated as empty text; any other non-string is
        passed through ``str()`` first.
        """
        # Missing values arrive as None / float NaN; NaN is the only value != itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

        # Make lower
        text = text.lower()

        # Replace line breaks with a space so the words on either side of the
        # break are not fused into a single token.
        text = text.replace('\n', ' ')

        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Remove stop words (set membership instead of scanning a list per word)
        useless_words = self._stop_words()
        tokens = [word for word in text.split() if word not in useless_words]

        # Remove numbers: any run of word characters containing a digit is
        # cut out; tokens that end up empty are dropped so they do not leave
        # stray double/trailing spaces in the result.
        tokens = [re.sub(r'\w*\d\w*', '', word) for word in tokens]
        tokens = [word for word in tokens if word]

        # Stem or Lemmatize
        if self.stem == 'Stem':
            stemmer = PorterStemmer()
            tokens = [stemmer.stem(word) for word in tokens]
        elif self.stem == 'Lem':
            lem = WordNetLemmatizer()
            tokens = [lem.lemmatize(word) for word in tokens]
        elif self.stem == 'Spacy':
            doc = self._spacy_model('en_core_web_sm')(' '.join(tokens))
            tokens = [token.lemma_ for token in doc]

        return ' '.join(tokens)

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self."""
        return self

    def transform(self, X):
        """Return a list with the cleaned version of every document in ``X``."""
        return [self.clean_string(text) for text in X]