In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> imports >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
!pip install inflect
!pip install contractions 
!pip install krovetzstemmer
import contractions
from contractionsOne import CONTRACTION_MAP

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem.lancaster import LancasterStemmer
from krovetzstemmer import Stemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')


import inflect
import numpy as np
import pandas as pd
import sys, setuptools, tokenize
import re, unicodedata, string  
from bs4 import BeautifulSoup

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> loading the dataset >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None
df = pd.read_csv('/content/testing.csv')
# Take an independent copy: the cleaning cells overwrite df['reviewText'] in place,
# and a plain `review_df = df` would only alias the same object, so the original
# review text would be overwritten in review_df as well.
review_df = df.copy()
review_df

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> removing null values >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

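# Remove rows whose reviewText is null, then display the frame to check the remaining rows.
# NOTE: this step is currently disabled (the two lines below are commented out).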
# df.dropna(subset=['reviewText'], inplace=True)
# df

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> removing URL >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

def remove_URL(reviewText):
    """Return the text with ``http(s)://`` and ``www.`` links deleted.

    The argument is passed through ``str()`` first, so non-string values
    come back as their string form with any links removed.
    """
    link_pattern = r'https?://\S+|www\.\S+'
    return re.sub(link_pattern, r'', str(reviewText))
# Overwrite the reviewText column in place with its URL-free version, then preview it.
df['reviewText'] = df['reviewText'].map(remove_URL)
df.head(10)

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> removing HTML tags >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

def remove_html(reviewText):
    """Return the text with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed separately; the argument
    is converted with ``str()`` before matching.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, r'', str(reviewText))
# Overwrite the reviewText column in place with its tag-free version, then preview it.
df['reviewText'] = df['reviewText'].map(remove_html)
df.head(10)

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> removing square brackets and the inside of the square bracket >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

def remove_between_square_brackets(reviewText):
    """Return the text with every ``[...]`` span (brackets included) removed.

    The argument is converted with ``str()`` first. A span runs from a ``[``
    to the first following ``]``; an unmatched ``[`` is left untouched.
    """
    # Raw string: in a normal literal ``\[`` and ``\]`` are invalid escape
    # sequences, which newer Python versions warn about at compile time.
    return re.sub(r'\[[^]]*\]', '', str(reviewText))
# Overwrite the reviewText column in place with bracketed spans removed, then preview it.
df['reviewText'] = df['reviewText'].map(remove_between_square_brackets)
df.head(10)

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> removing Pictures/Tags/Symbols/Emojis >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

def remove_emoji(reviewText):
    """Return the text with emoji, pictographs and related symbols removed.

    The argument is converted with ``str()`` first; every run of characters
    that falls in one of the code-point ranges below is deleted.
    """
    # NOTE: several ranges overlap, and U+24C2-U+1F251 is very wide: besides
    # symbols it also spans CJK and other non-Latin scripts.
    symbol_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        u"\U00002500-\U00002BEF"  # box drawing through misc symbols and arrows
        u"\U00002702-\U000027B0"  # dingbats
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"  # every code point outside the BMP
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"  # zero width joiner
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector-16
        u"\u3030"
    )
    emoji_pattern = re.compile("[" + symbol_ranges + "]+", re.UNICODE)
    return emoji_pattern.sub('', str(reviewText))

# Overwrite the reviewText column in place with emoji/symbols removed, then preview it.
df['reviewText'] = df['reviewText'].map(remove_emoji)
df.head(10)

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> removing non ascii >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

# Drop every character that cannot be encoded as ASCII ("ignore" discards it silently),
# writing the result back to the same column.
df['reviewText'] = df['reviewText'].map(
    lambda text: text.encode("ascii", "ignore").decode()
)
df

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> convert to lower case >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

# Lower-case every review, writing the result back to the same column.
df['reviewText'] = df['reviewText'].map(lambda text: text.lower())
df

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> expand contractions >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

def expand_contractions(reviewText):
    """Expand contractions using ``CONTRACTION_MAP`` and drop remaining apostrophes.

    The argument is converted with ``str()`` first. Every occurrence of a key
    of ``CONTRACTION_MAP`` (matched case-insensitively) is replaced by its
    mapped expansion, keeping the first character of the matched text. All
    ``'`` characters still present afterwards are then removed.

    Returns:
        The expanded text as a ``str``.
    """
    # Longest keys first, so a key that is a prefix of a longer one cannot
    # pre-empt it in the alternation. re.escape keeps each key literal, which
    # is what the dictionary lookups below assume.
    contraction_keys = sorted(CONTRACTION_MAP.keys(), key=len, reverse=True)

    def expand_match(contraction):
        match = contraction.group(0)
        expandedContraction = (CONTRACTION_MAP.get(match)
                               or CONTRACTION_MAP.get(match.lower()))
        if expandedContraction is None:
            # No entry under the exact or lower-cased spelling (for example a
            # mixed-case key): leave the text unchanged instead of failing.
            return match
        return match[0] + expandedContraction[1:]

    reviewText = str(reviewText)
    if contraction_keys:
        # An empty map would produce a pattern that matches the empty string
        # everywhere, so substitution is skipped in that case.
        contractionsPattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in contraction_keys)),
            flags=re.IGNORECASE | re.DOTALL,
        )
        reviewText = contractionsPattern.sub(expand_match, reviewText)
    reviewText = re.sub("'", "", reviewText)
    return reviewText

# Overwrite the reviewText column in place with contractions expanded.
df['reviewText'] = df['reviewText'].map(expand_contractions)
df

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> removing all punctuations >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

import string
def clear_punctuation(s):
  """Return ``s`` without any character listed in ``string.punctuation``.

  All other characters, whitespace included, are kept in their original order.
  """
  kept_symbols = [symbol for symbol in s if symbol not in string.punctuation]
  return "".join(kept_symbols)
# Overwrite the reviewText column in place with punctuation removed, then preview it.
df['reviewText'] = df['reviewText'].map(clear_punctuation)
df.head(10)

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> tokenizing sentence >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

# Split each review into a list of sentences; df_sentences holds one list per review
# in its single 'reviewText' column.
# NOTE(review): punctuation is stripped from reviewText in an earlier cell, so
# sent_tokenize has little to split on here - confirm the intended step order.
sentence_all = [list(sent_tokenize(review)) for review in df['reviewText']]

# One 1-tuple per review keeps each sentence list inside a single cell.
list_of_tuples = [(review_sentences,) for review_sentences in sentence_all]
df_sentences = pd.DataFrame(list_of_tuples, columns=['reviewText'])
df_sentences

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> tokenizing words >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

def wordTokenize(sentence):
    """Tokenize every element of ``sentence`` and return one flat token list.

    ``sentence`` must provide ``.apply`` (it is called with a pandas Series
    of sentence strings); ``word_tokenize`` is applied to each element and
    the per-element token lists are concatenated in order.
    """
    tokens = []
    for sentence_tokens in sentence.apply(word_tokenize):
        tokens.extend(sentence_tokens)
    return tokens

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> replacing numbers with string  >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

def stringNumbers(sentence):
  """Return the tokens with all-digit tokens replaced by their English words.

  Tokens for which ``str.isdigit()`` is true are converted with inflect's
  ``number_to_words``; every other token is passed through unchanged.
  """
  number_speller = inflect.engine()
  return [
    number_speller.number_to_words(token) if token.isdigit() else token
    for token in sentence
  ]

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> removing stopwords >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

from nltk.corpus import stopwords
def stopWords(sentence):
  """Return the tokens of ``sentence`` that are not NLTK English stopwords, in order."""
  english_stopwords = set(stopwords.words('english'))
  kept_tokens = []
  for token in sentence:
    if token in english_stopwords:
      continue
    kept_tokens.append(token)
  return kept_tokens

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> stemming words >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

def stemmers(sentence):
  """Return the stem of every token in ``sentence``, in order.

  Uses the Krovetz ``Stemmer``; PorterStemmer and LancasterStemmer are the
  alternatives that were tried here.
  """
  krovetz = Stemmer()
  return [krovetz.stem(token) for token in sentence]

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> lemmatize_verbs >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

def lemmatizers(sentence):
  """Return the WordNet lemma of every token in ``sentence``, treating each as a verb."""
  wordnet = WordNetLemmatizer()
  return [wordnet.lemmatize(token, pos='v') for token in sentence]

In [None]:
from nltk.tokenize import word_tokenize
# Run every review through: word-tokenise -> spell out numbers -> drop stopwords
# -> stem -> lemmatise. alltokens collects one cleaned token list per review.
alltokens = []
for review_sentences in df_sentences['reviewText']:
  # wordTokenize calls .apply on its argument, so wrap this review's sentence
  # list in a Series directly rather than building a one-column DataFrame per row.
  sentence_series = pd.Series(review_sentences, dtype=object)

  word_tokens = wordTokenize(sentence_series)
  spelled_tokens = stringNumbers(word_tokens)
  content_tokens = stopWords(spelled_tokens)
  stemmed_tokens = stemmers(content_tokens)
  # Lemmatisation is applied to the already-stemmed tokens.
  lemmatized_tokens = lemmatizers(stemmed_tokens)
  alltokens.append(lemmatized_tokens)
alltokens

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> de-tokenise >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import word_tokenize

# Join each review's cleaned tokens back into a single string and store the
# result in a new 'cleaned' column of review_df.
treebank_detokenizer = TreebankWordDetokenizer()
detokenizeall = [treebank_detokenizer.detokenize(review_tokens)
                 for review_tokens in alltokens]

review_df['cleaned'] = detokenizeall
review_df

In [None]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> saving the cleaned data >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

# Write the final frame to both output files. Both calls save the same
# DataFrame, so the two files have identical contents.
for output_name in ('testing_cleaned_with_lemmatized.csv',
                    'testing_cleaned_without_lemmatized.csv'):
    review_df.to_csv(output_name)

review_df