# Import Libraries


In [2]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import spacy

import pandas as pd

from wordcloud import WordCloud

## Create a function to calculate the Jaccard Similarity

In [3]:
def jaccardSimilarity(doc_1, doc_2):
    words_doc1 = set(doc_1.lower().split())
    words_doc2 = set(doc_2.lower().split())
    print(doc_1)
    print(doc_2)
    print(words_doc1)
    print(words_doc2)
    #List the unique words in a document

    intersection = words_doc1.intersection(words_doc2)
    #Find the intersection of words in doc_1 and doc_2

    union = words_doc1.union(words_doc2)
    #Find the union of words in doc_1 and doc_2

    return float(len(intersection)/len(union))
    #Calculate Jaccard Similarity score
    #using length of intersection set divided by length of union set


## Create a function to calculate the Semantic Similarity Score

In [4]:
def semanticSimilarity(doc_1, doc_2):
    spc = spacy.load('en_core_web_lg')
    spacyDoc_1 = spc(doc_1)
    spacyDoc_2 = spc(doc_2)
    return spacyDoc_1.similarity(spacyDoc_2)

## Create a function for Sentiment Analysis

In [5]:
def sentimentAnalysis(doc):
    sia = SentimentIntensityAnalyzer()
    sentiment = sia.polarity_scores(doc)
    return sentiment

## Create function for text cleaning

In [6]:
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('wordnet')

def cleanText(text):
    tokenizer = RegexpTokenizer(r'\w+')
    allTokens = tokenizer.tokenize(text)
    #Removes puncuation while tokenizing

    tokens = [token for token in allTokens if token not in stopwords.words('English')]
    #Removes the stop words

    tokens = [token.lower() for token in tokens]
    #Convert tokens to lowercase

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    #Lemmatizes the tokens

    text = ''
    for token in tokens:
        text = text + token + ' '
    return text
    # Create a text string from the token list



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/andrew/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andrew/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/andrew/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Create the functions to Load and Clean text

In [13]:
def loadText(fileName):
    f = open(fileName, 'r', encoding='utf-8')
    text = f.read()
    return text

def loadCleanText(fileName):
    text = loadText(fileName)
    return cleanText(text)

article_2 = loadCleanText('/Users/andrew/Downloads/UW courses/ECON 607/Assignment 3/badwords.txt')
commentsArticle_2 = loadCleanText('CommentsA2.txt')
#Loads the data

print("Jaccard: Article 2 vs Article 2 comments", jaccardSimilarity(article_2, commentsArticle_2))
#performs the Jaccard similarity calculation

print("Semantic: Article 2 vs Article 2 comments", semanticSimilarity(article_2, commentsArticle_2))
#performs the Semantic similarity calculation

print("Article 2", sentimentAnalysis(article_2))
print(type(sentimentAnalysis(article_2)))
#performs the Semantic Intensity calculation
#prints the type of the semantic calculation

comments = loadText('commentsA2.txt').splitlines()
for comment in comments:
    comment = cleanText(comment)
    s = sentimentAnalysis(comment)
    print(comment + ',', s["compound"])

bad i hate fuck shit i hate everything everything suck i fucking hate evertything i sad i cant believe stupid idiotic 
this article tell system work i closer understanding become ash let u decompose like fruit vegetable compost pile i want supportive i need detail 100 behind i care but like normal burial cremation cost borne person want composted state bill aubuchon i funeral director we pleasure presentation katrina price question asked answered bill aubuchon it would bit direct cremation way le burial the bill passed i seen article suggesting cost 5500 i would love grow plant food gramma dogmy ex not sure i feel overall doesn happen anyway nichole welp that one way dispose body do think human remains would left compost saying thats bad would use compost fertilize cemetery marcelle baldwin give source number i assume got 4 year old new york time article we see i hope right guess least double figure good politics state may bee progressive national stage it freak little i lifelong liber

## Cooccurance script

In [85]:
def Preprocessing(corpus):
    # Convert corpus (lines of comments) to an array of comments
    documentText = []
    corpus = corpus.splitlines()
    for row in corpus:
        documentText.append(row)      
    return documentText

In [86]:
def GenerateMatrix(cleanData, fileName):
    # sklearn countvectorizer
    from sklearn.feature_extraction.text import CountVectorizer
    # Convert a collection of text documents to a matrix of token counts
    cv = CountVectorizer(ngram_range=(1,1), stop_words = 'english', max_features=20)
    # matrix of token counts
    X = cv.fit_transform(cleanData)
    Xc = (X.T * X) # matrix manipulation
    Xc.setdiag(0) # set the diagonals to be zeroes as it's pointless to be 1
    names = cv.get_feature_names_out() # This are the entity names (i.e. keywords)
    df = pd.DataFrame(data = Xc.toarray(), columns = names, index = names)
    #print(df)
    print(fileName)
    df.to_csv(fileName)

In [97]:
def ProcessCorpus():
    fileName = "positiveComments.csv"
    with open (fileName, "r", encoding='unicode_escape') as myfile:
        data = myfile.read()
        cleanData = Preprocessing(data)
        GenerateMatrix(cleanData, "PositiveCommentsMatrix.csv")

    fileName = "negativeComments.csv"
    with open (fileName, "r", encoding='unicode_escape') as myfile:
        data = myfile.read()
        cleanData = Preprocessing(data)
        GenerateMatrix(cleanData, "NegativeCommentsMatrix.csv")

In [98]:
ProcessCorpus()

PositiveCommentsMatrix.csv
NegativeCommentsMatrix.csv
