In [34]:
import PyPDF2
# First Import the libraries to use. I will keep all of the imports in the top box.
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
from itertools import chain

import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
import string
import inflect


from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Import Counter class of collection containers library (to analyze the data):
from collections import Counter
from nltk.sentiment import SentimentIntensityAnalyzer

import gensim
import spacy



In [35]:
# Define functions that will be used here
# https://www.geeksforgeeks.org/text-preprocessing-in-python-set-1/
def text_lowercase(text):
    return text.lower()

# Remove numbers
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result

# remove punctuation
def remove_punctuation(text):
    return re.sub(r',|\.|\:|;|-|\'|/|&|!|\?|\(|\)|\+|@|<|>|#|~|=|\$|\*|[|]|{|}','',text)

# remove whitespace from text
def remove_whitespace(text):
    return  " ".join(text.split())


# remove stopwords function
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text

# stem words in the list of tokenized words
# Instantiate an object of the PorterStemmer() class:
stemmer = PorterStemmer()
def stem_words(text):
    word_tokens = word_tokenize(text)
    stems = [stemmer.stem(word) for word in word_tokens]
    return stems

# lemmatize string
# Instantiate an object of the WordNetLemmatizer() class:
lemmatizer = WordNetLemmatizer()
def lemmatize_word(text):
    word_tokens = word_tokenize(text)
    # provide context i.e. part-of-speech
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return lemmas

# toeknize words
def toeknize_words(text):
    return word_tokenize(text)

# convert number into words
# Instantiate an object of the inflect.engine() class:
p = inflect.engine()
def convert_number(text):
    # split string into list of words
    temp_str = text.split()
    # initialise empty list
    new_string = []
 
    for word in temp_str:
        # if word is a digit, convert the digit
        # to numbers and append into the new_string list
        if word.isdigit():
            temp = p.number_to_words(word)
            new_string.append(temp)
 
        # append the word as it is
        else:
            new_string.append(word)
 
    # join the words of new_string to form a string
    temp_str = ' '.join(new_string)
    return temp_str

def get_part_of_speech(word):
    probable_part_of_speech = wordnet.synsets(word)
    pos_counts = Counter()
    pos_counts["n"] = len([item for item in probable_part_of_speech if item.pos()=="n"])
    pos_counts["v"] = len([item for item in probable_part_of_speech if item.pos()=="v"])
    pos_counts["a"] = len([item for item in probable_part_of_speech if item.pos()=="a"])
    pos_counts["r"] = len([item for item in probable_part_of_speech if item.pos()=="r"])
    most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
    return most_likely_part_of_speech



In [36]:
# copy and paste the text here
content = "We are happy that you have chosen to be a part of our community. To maintain property values\
and to ensure that Breakwater Cove continues to be a desirable place to live, we all must do\
our part. To that end all residents must have their lawn mowed, edged and weeded to ensure a\
neat appearance."

In [37]:
#convert each sentence to a list of words. split on whitespace
sentences = sent_tokenize(content)
sentences

['We are happy that you have chosen to be a part of our community.',
 'To maintain property valuesand to ensure that Breakwater Cove continues to be a desirable place to live, we all must doour part.',
 'To that end all residents must have their lawn mowed, edged and weeded to ensure aneat appearance.']

In [38]:
# form dataframe from sentences. 
df = pd.DataFrame(sentences, columns=['sentences'])
df

Unnamed: 0,sentences
0,We are happy that you have chosen to be a part...
1,To maintain property valuesand to ensure that ...
2,To that end all residents must have their lawn...


In [39]:
# Remove punctiation
sentences_n0_punct = [remove_punctuation(sentence) for sentence in sentences]
sentences_n0_punct

['We are happy that you have chosen to be a part of our community',
 'To maintain property valuesand to ensure that Breakwater Cove continues to be a desirable place to live we all must doour part',
 'To that end all residents must have their lawn mowed edged and weeded to ensure aneat appearance']

In [40]:
# Make Lower case
content_lower_no_punct = [text_lowercase(sentence) for sentence in sentences_n0_punct]
content_lower_no_punct

['we are happy that you have chosen to be a part of our community',
 'to maintain property valuesand to ensure that breakwater cove continues to be a desirable place to live we all must doour part',
 'to that end all residents must have their lawn mowed edged and weeded to ensure aneat appearance']

In [41]:
# Remove the white space
content_lower_no_punct_white = [remove_whitespace(sentence) for sentence in content_lower_no_punct]
content_lower_no_punct_white

['we are happy that you have chosen to be a part of our community',
 'to maintain property valuesand to ensure that breakwater cove continues to be a desirable place to live we all must doour part',
 'to that end all residents must have their lawn mowed edged and weeded to ensure aneat appearance']

In [42]:
# Convert the numbers to words
content_text = [convert_number(sentence) for sentence in content_lower_no_punct]
content_text

['we are happy that you have chosen to be a part of our community',
 'to maintain property valuesand to ensure that breakwater cove continues to be a desirable place to live we all must doour part',
 'to that end all residents must have their lawn mowed edged and weeded to ensure aneat appearance']

In [43]:
# Initialize SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [44]:
# Function to take SIA score and return "Positive" or "Negative"
def sentiment_calc(column):
    """
        Return a float for sentiment strength based on the input text.
        Positive values are positive valence, negative value are negative
        valence.
    """
    temp_dict = sia.polarity_scores(column)
    for row in column:
        for item in row:
            if (temp_dict["compound"] > 0):
                item = "Positive"
            elif (temp_dict["compound"] <= 0):
                item = "Negative"
        
        # return item
        return temp_dict["compound"]

In [45]:
# Run the above function on every line in the dataframe
content_text_sentiment = [sentiment_calc(sentence) for sentence in content_text]
content_text_sentiment

[0.5719, 0.5994, 0.3818]

In [46]:
# Form an array to display results as "Positive" or "Negative"
content_text_sentiment_classify = []

for item in content_text_sentiment:
    if (item > 0):
        item_to_append = "Positive"
    elif (item <= 0):
        item_to_append = "Negative"
    content_text_sentiment_classify.append(item_to_append)

content_text_sentiment_classify

['Positive', 'Positive', 'Positive']

In [47]:
content_text

['we are happy that you have chosen to be a part of our community',
 'to maintain property valuesand to ensure that breakwater cove continues to be a desirable place to live we all must doour part',
 'to that end all residents must have their lawn mowed edged and weeded to ensure aneat appearance']

In [48]:
# Compare original with score forming new data frame for export
# df['content_text'] = content_text
df['content_text_sentiment_classify'] = content_text_sentiment_classify
df['content_text_sentiment'] = content_text_sentiment

df

Unnamed: 0,sentences,content_text_sentiment_classify,content_text_sentiment
0,We are happy that you have chosen to be a part...,Positive,0.5719
1,To maintain property valuesand to ensure that ...,Positive,0.5994
2,To that end all residents must have their lawn...,Positive,0.3818


In [49]:
# Export to csv
# df.to_csv('pressure_wash_2.csv')