# Text_Preprocessing_Notebook

# Importing Libraries

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import string

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.summarization.textcleaner import split_sentences

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE

from textblob import TextBlob, Word, Blobber
# to install textblob in your conda packages:
# 1. go to anaconda prompt
# 2. cd Anaconda3>Scripts>conda install -c conda-forge textblob

import nltk
nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')
from nltk import word_tokenize,sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\farnaz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\farnaz\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\farnaz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Text Cleaning

1. Tokenization
2. Punctuation removal
3. Removing Stop Words
4. Stemming words
5. Other preprocessing 

In [4]:
# a sample text to check the cleaning process:
text = 'GREAT Boutique, GREAT  service & GREAT  clothing line.....If your looking for unique,different and adorable dresses...this is the place to go. I found this boutique 2 years ago when i was walking back to my car after i had my hair done for an a very special  party and   wondering what to wear... and  then by accident I was in frontof the Kishas Studio What a great accident.......I got my perfect dress and I have been a happy customer since then:)'
text

'GREAT Boutique, GREAT  service & GREAT  clothing line.....If your looking for unique,different and adorable dresses...this is the place to go. I found this boutique 2 years ago when i was walking back to my car after i had my hair done for an a very special  party and   wondering what to wear... and  then by accident I was in frontof the Kishas Studio What a great accident.......I got my perfect dress and I have been a happy customer since then:)'

In [5]:
#print(string.punctuation)
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
#define a function to fully clean the text:
def text_cleaning(text):
    """
    This function cleans a block of text.
    Input:text = the text to be cleaned.
    Output: the text stripped of punctuation and made lowercase.
    """
    # u'\xa0' represents a non-breaking space in the text block that needs to be removed.
    text = text.replace(u'\xa0', u' ')
    
    #remove multiple fullstops and make a single fullstop
    text = re.sub('\.+', '. ', text)
    
    #the code line "text = re.sub('\.+', ' ', text)" will remove the "." itself too    
    #text = text.replace('...',' ')
    #text= text.replace('..','. ')
   
    
    #remove multiple spaces and make a single space.
    text = re.sub(' +', ' ', text)
    
    #remove all tokens that are not alphabetic
    text = re.sub(r'\d+', '', text)
    
    #normalization
    text= text.lower()
    
    #remove punctuations
    #punctuation marks, add . if needs be
    punctuations = '''!()-[]{};:'"\,<>/?@#$%^&*_~'''
    #traverse the given string and if any punctuation marks occur replace it with null 
    for i in text: 
        if i in punctuations: 
            text = text.replace(i, "") 
                    
    return text

    #if removing stopwords and stemming is required:
    
    #tokenize the text,split it into tokens/words
    #tokens = word_tokenize(text)
    
    #remove stopwords
    #tokens = [token for token in tokens if not token in stop_words]
    
    #stem words
    #porter = PorterStemmer()
    #stemmed = [porter.stem(token) for token in tokens] 

    #return the cleaned text in a sentence format, and normalize them all with lowercase method.
    #cleaned_text=' '.join([''.join(token).lower() for token in tokens])

    #return clened_text
    
clean_text= text_cleaning(text)
clean_text

'great boutique great service  great clothing line. if your looking for uniquedifferent and adorable dresses. this is the place to go. i found this boutique  years ago when i was walking back to my car after i had my hair done for an a very special party and wondering what to wear. and then by accident i was in frontof the kishas studio what a great accident. i got my perfect dress and i have been a happy customer since then'

In [7]:
def organize_review(text):
    """
    This method will return the review text in a sentence format with the beginning of each
    sentence capitalized.
    """
    text = split_sentences(text)
    
    #considering that sentence ends with period, apastrophe or other Separation punctuation marks,?,!: 
    return ' '.join([sentence.capitalize() for sentence in text])
    #return ''.join([''.join(sentence).capitalize() for sentence in text])
    
organize_review(clean_text)

'Great boutique great service  great clothing line. If your looking for uniquedifferent and adorable dresses. This is the place to go. I found this boutique  years ago when i was walking back to my car after i had my hair done for an a very special party and wondering what to wear. And then by accident i was in frontof the kishas studio what a great accident. I got my perfect dress and i have been a happy customer since then'

In [8]:
def get_all_tokens(text):
    return [token for sentence in text for token in sentence]

print(get_all_tokens(text))

['G', 'R', 'E', 'A', 'T', ' ', 'B', 'o', 'u', 't', 'i', 'q', 'u', 'e', ',', ' ', 'G', 'R', 'E', 'A', 'T', ' ', ' ', 's', 'e', 'r', 'v', 'i', 'c', 'e', ' ', '&', ' ', 'G', 'R', 'E', 'A', 'T', ' ', ' ', 'c', 'l', 'o', 't', 'h', 'i', 'n', 'g', ' ', 'l', 'i', 'n', 'e', '.', '.', '.', '.', '.', 'I', 'f', ' ', 'y', 'o', 'u', 'r', ' ', 'l', 'o', 'o', 'k', 'i', 'n', 'g', ' ', 'f', 'o', 'r', ' ', 'u', 'n', 'i', 'q', 'u', 'e', ',', 'd', 'i', 'f', 'f', 'e', 'r', 'e', 'n', 't', ' ', 'a', 'n', 'd', ' ', 'a', 'd', 'o', 'r', 'a', 'b', 'l', 'e', ' ', 'd', 'r', 'e', 's', 's', 'e', 's', '.', '.', '.', 't', 'h', 'i', 's', ' ', 'i', 's', ' ', 't', 'h', 'e', ' ', 'p', 'l', 'a', 'c', 'e', ' ', 't', 'o', ' ', 'g', 'o', '.', ' ', 'I', ' ', 'f', 'o', 'u', 'n', 'd', ' ', 't', 'h', 'i', 's', ' ', 'b', 'o', 'u', 't', 'i', 'q', 'u', 'e', ' ', '2', ' ', 'y', 'e', 'a', 'r', 's', ' ', 'a', 'g', 'o', ' ', 'w', 'h', 'e', 'n', ' ', 'i', ' ', 'w', 'a', 's', ' ', 'w', 'a', 'l', 'k', 'i', 'n', 'g', ' ', 'b', 'a', 'c', 'k',

In [9]:
#[' '.join([word for word in sent_tokenize(text) if word not in stopwords])]

# vocab = set(w.lower() for w in nltk.corpus.words.words())
# [' '.join([word for word in sentence.split(sep=" ") if word in vocab]) for sentence in text]

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 
        
def remove_stopwords(text, stop_words):
    """
    This method removes stopwords from the review text.
    INPUT: stopwords = List of stopwords to be removed.
    """
    tokens = word_tokenize(text)
    #remove stopwords
    tokens = [token for token in tokens if not token in stop_words]
    #return the cleaned text in a sentence format.
    return ' '.join([''.join(token) for token in tokens])

remove_stopwords(text, stop_words)

'GREAT Boutique , GREAT service & GREAT clothing line ... ..If looking unique , different adorable dresses ... place go . I found boutique 2 years ago walking back car hair done special party wondering wear ... accident I frontof Kishas Studio What great accident ... ... .I got perfect dress I happy customer since : )'

In [10]:
#use the sentiment function and returns two properties - polarity and subjectivity.
text_sentiment= TextBlob(clean_text)
print (text_sentiment)
text_sentiment.sentiment

#sentiment = TextBlob(clean_text).sentiment
#sentiment
#polarity = TextBlob(clean_text).sentiment.polarity
#polarity

great boutique great service  great clothing line. if your looking for uniquedifferent and adorable dresses. this is the place to go. i found this boutique  years ago when i was walking back to my car after i had my hair done for an a very special party and wondering what to wear. and then by accident i was in frontof the kishas studio what a great accident. i got my perfect dress and i have been a happy customer since then


Sentiment(polarity=0.6626984126984127, subjectivity=0.7492063492063492)

In [None]:
#Define Review Class
'''
Define real words according to nltk corpus english vocabulary.
This function is used for the remove_gibberish() method in the Review class.
'''
vocab_en = set(w.lower() for w in nltk.corpus.words.words())


'''
Define punctuations according to nltk corpus.
This function is used for the text_cleaning method in the review class.
'''
punctuations = string.punctuation


'''
Define stopwords
'''
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 


'''
Define sentence
'''
sentence = split_sentences(text)
#or:sentence = sent_tokenize(text)

