In [None]:
# Standard library
import os
import re
from pprint import pprint

# Numerics, data handling and HTTP
import numpy as np
import pandas as pd
import requests

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess

# spaCy for preprocessing
import spacy

# Plotting tools
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim

# NLTK stop words
import nltk
from nltk.corpus import stopwords

# scikit-learn
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

%matplotlib inline
nltk.download('stopwords')

In [None]:
# Location of the labelled spam dataset. Set the SPAM_CSV_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original local path is used.
SPAM_CSV_PATH = os.environ.get(
    'SPAM_CSV_PATH',
    'C:\\Users\\danih\\OneDrive\\Desktop\\Digital Futures Academy\\Capstone\\spam.csv',
)
spam = pd.read_csv(SPAM_CSV_PATH)

In [None]:
# Split the dataset into classifier inputs and targets: column 'v2' is what
# gets vectorised below and column 'v1' is what the classifier is fitted on.
X, y = spam['v2'], spam['v1']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=25)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Learn the vocabulary from the training messages only, then encode those
# same messages as a matrix of token counts.
cv = CountVectorizer()
cv.fit(X_train)
features = cv.transform(X_train)

In [None]:
# Fit a support-vector classifier (library defaults) on the count features.
# fit() returns the estimator itself, so it can be bound in one step.
spam_filter = svm.SVC().fit(features, y_train)
# Keep the fitted estimator as the cell's displayed value.
spam_filter

In [None]:
# Encode the held-out messages with the vocabulary learned during training,
# then report the classifier's score on them.
features_test = cv.transform(X_test)
accuracy = spam_filter.score(features_test, y_test)
print(f"Accuracy: {accuracy}")

In [None]:
# Location names to query; each one is lower-cased and substituted for the
# '{}' placeholder in base_url when the data is fetched.
locations = [
    'Buxton',
    'Bollington',
    'Biddulph',
    'Leek',
    'Cheadle',
]
base_url = 'https://www.{}hearingcentre.co.uk/wp-json/wp/v2/submissions'

In [None]:
def fetch_json_data(url, timeout=30):
    """Fetch JSON data from the given URL.

    Argument:
    url - Address to send the GET request to.
    timeout - Seconds to wait for the server before giving up (default 30).

    Returns:
    The decoded JSON body of the response.

    Raises:
    requests.HTTPError - If the server answers with a 4xx/5xx status.
    requests.RequestException - On connection problems or a timeout.
    """
    # Without a timeout the request can wait indefinitely on a silent server.
    response = requests.get(url, timeout=timeout)
    # Fail here on an HTTP error instead of decoding an error page as data.
    response.raise_for_status()
    return response.json()

In [None]:
def generate_location_data(locations, base_url, timeout=30, verify=False):
    """
    Generate a dictionary with location names as keys and JSON data as entries.

    Argument:
    locations - List of location names.
    base_url - Base URL for fetching JSON data, with '{}' placeholder for location name.
    timeout - Seconds to wait for each server before giving up (default 30).
    verify - Passed to requests.get() to control TLS certificate verification.
             Defaults to False to keep the notebook's existing behaviour.

    Returns:
    dict - Dictionary with location names as keys and JSON data as entries.

    Raises:
    requests.HTTPError - If any server answers with a 4xx/5xx status.
    requests.RequestException - On connection problems or a timeout.
    """
    hearing_centre_data = {}

    for location in locations:
        # Replace placeholder with the lower-cased location name
        url = base_url.replace('{}', location.lower())
        # NOTE(review): verify=False disables TLS certificate checking, so the
        # responses are not authenticated. Pass verify=True once the sites'
        # certificates validate.
        response = requests.get(url, verify=verify, timeout=timeout)
        # Fail loudly on an HTTP error rather than storing an error payload
        # as if it were submission data.
        response.raise_for_status()
        hearing_centre_data[location] = response.json()

    return hearing_centre_data

In [None]:
# Download the submissions for every configured location.
hearing_centre_data = generate_location_data(
    locations=locations,
    base_url=base_url,
)

In [None]:
# Evaluate the fetched data without showing it: the trailing ';' suppresses
# the cell output.
hearing_centre_data;

In [None]:
def _extract_message_lines(lines):
    """Return the lines from the fourth one up to (not including) the first all-digit line."""
    message = []
    for line in lines[3:]:
        # Test the stripped text: with '\r\n' line endings each piece keeps a
        # trailing '\r', and '42\r'.isdigit() is False.
        if line.strip().isdigit():
            break
        message.append(line)
    return message


def _parse_post_date(raw_date):
    """Convert a raw 'post_date' value to a Timestamp; NaT when missing or unparseable."""
    if not raw_date:
        return pd.NaT
    try:
        return pd.to_datetime(raw_date)
    except (ValueError, TypeError, OverflowError):
        return pd.NaT


def process_messages(location_data):
    """
    Process the JSON data to extract messages and clean them.

    Each usable item is a dict whose 'post_content' is a string. That string is
    split on newlines: the second line is taken as the email, and the lines
    from the fourth one up to the first all-digit line form the message.
    Items that are not dicts, or whose 'post_content' is missing or not a
    string, are skipped.

    Argument:
    location_data - Dictionary with location names as keys and JSON data as entries.

    Returns:
    df - DataFrame containing the post date, post content message, email, and location.
         Emails are replaced by 'anonymised_email_<n>' labels (the same email
         always gets the same label). 'post_date' is normalised to midnight and
         is NaT when the date is missing or cannot be parsed. An input without
         usable items gives an empty DataFrame with the same four columns.
    """
    columns = ['post_date', 'post_content', 'email', 'location']
    messages_data = []
    email_map = {}

    for location, data in location_data.items():
        for item in data:
            # Skip anything that is not a submission record (for example the
            # string keys seen when a payload is a dict instead of a list).
            if not isinstance(item, dict):
                continue
            raw_content = item.get('post_content')
            if not isinstance(raw_content, str):
                continue

            post_content = raw_content.split('\n')

            # Extract the email; strip it so '\r\n' and '\n' variants of the
            # same address share one anonymised label.
            email = post_content[1].strip() if len(post_content) > 1 else 'N/A'

            # Anonymise the email: labels are numbered in order of first appearance.
            if email not in email_map:
                email_map[email] = f'anonymised_email_{len(email_map) + 1}'

            # Join the message parts and flatten them onto a single line.
            message_content = (
                '\n'.join(_extract_message_lines(post_content))
                .replace('\r', '')
                .replace('\n', ' ')
                .strip()
            )

            messages_data.append({
                'post_date': _parse_post_date(item.get('post_date')),
                'post_content': message_content,
                'email': email_map[email],
                'location': location
            })

    # Naming the columns keeps the frame well-formed when nothing was collected.
    df = pd.DataFrame(messages_data, columns=columns)

    # Coerce to datetime first so '.dt' also works when the column is empty or
    # all-missing, then drop the time of day (normalised to '00:00:00').
    df['post_date'] = pd.to_datetime(df['post_date'], errors='coerce').dt.normalize()

    return df

In [None]:
# Turn the raw per-location submissions into one table of messages.
df = process_messages(location_data=hearing_centre_data)

In [None]:
# Show the dtype of each column in the messages frame.
df.dtypes

In [None]:
# Show the number of non-null values per column before cleaning.
df.count()

In [None]:
def clean_data(df, cv, spam_filter):
    """
    Clean the DataFrame by removing duplicate entries and filtering out spam.

    Three filters are applied in order: rows duplicating an earlier row's
    'post_content' and 'email' are dropped, rows whose 'post_content' contains
    an http:// or https:// link are dropped, and of the remaining rows only
    those the classifier labels 'ham' are kept.

    Argument:
    df - The DataFrame to clean, with 'post_content' and 'email' columns.
    cv - The CountVectorizer used to transform the text data.
    spam_filter - The trained SVM model to predict spam.

    Returns:
    pd.DataFrame: The cleaned DataFrame (original index labels are kept).
    """
    # Remove duplicates based on 'post_content' and 'email' columns
    df = df.drop_duplicates(subset=['post_content', 'email'], keep='first')

    # Remove messages that contain links (http or https)
    has_link = df['post_content'].str.contains(r'http://|https://',
                                               case=False,
                                               na=False)
    df = df[~has_link]

    # Nothing left to classify: return the empty frame instead of asking the
    # model to predict on zero samples, which raises.
    if df.empty:
        return df

    # Transform the 'post_content' using the CountVectorizer
    post_content_features = cv.transform(df['post_content'])

    # Predict spam; asarray guarantees an element-wise comparison below even
    # if the model returns a plain list.
    spam_predictions = np.asarray(spam_filter.predict(post_content_features))

    # Keep only the entries labelled as ham
    return df[spam_predictions == 'ham']

In [None]:
# De-duplicate the messages and drop those with links or classified as spam.
df_clean = clean_data(df=df, cv=cv, spam_filter=spam_filter)

In [None]:
# Show the number of non-null values per column after cleaning.
df_clean.count()

In [None]:
def sent_to_words(sentences):
    """Lazily yield one token list per item of `sentences`.

    Each item is converted with str() and tokenised by gensim's
    simple_preprocess with deacc=True (accent-mark removal, per the gensim
    documentation).
    """
    tokenise = gensim.utils.simple_preprocess
    yield from (tokenise(str(sentence), deacc=True) for sentence in sentences)

In [None]:
# Tokenise every cleaned message, then keep a copy of the frame that also
# stores each message's tokens re-joined into one space-separated string.
data_words = [tokens for tokens in sent_to_words(df_clean['post_content'])]
df_token = df_clean.copy()
df_token['cleaned_post_content'] = list(map(' '.join, data_words))

In [None]:
# Fit the phrase detectors: bigrams on the tokenised messages, then trigrams
# on the bigram-transformed messages.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Wrap each fitted detector in a Phraser, which is what gets applied to documents.
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(phrase_model)
    for phrase_model in (bigram, trigram)
)

In [None]:
# Define helper functions for stop-word removal, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return one token list per document with English stop words removed.

    Each document is converted with str() and re-tokenised by
    simple_preprocess before the stop words are filtered out.
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    cleaned = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        cleaned.append([token for token in tokens if token not in stop_words])
    return cleaned

def make_bigrams(texts):
    """Apply the module-level bigram phraser (bigram_mod) to each document."""
    merged = []
    for doc in texts:
        merged.append(bigram_mod[doc])
    return merged

def make_trigrams(texts):
    """Apply the bigram phraser, then the trigram phraser, to each document."""
    merged = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise tokenised documents, keeping only tokens with an allowed POS tag.

    Each document's tokens are joined with spaces and run through the
    module-level spaCy pipeline `nlp`, which must be loaded before this
    function is called.

    Argument:
    texts - Iterable of documents, each a list of string tokens.
    allowed_postags - Coarse POS tags (token.pos_) to keep; any iterable of
                      strings. See https://spacy.io/api/annotation

    Returns:
    list - One list of lemmas per input document, in input order.
    """
    # A frozenset gives constant-time membership tests; the immutable default
    # also avoids sharing a mutable list between calls.
    allowed = frozenset(allowed_postags)
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the texts as a batched stream and yields the parsed
    # documents in input order, instead of one nlp() call per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed]
        for doc in nlp.pipe(joined_docs)
    ]

In [None]:
# Load the small English spaCy pipeline with the parser and NER components
# disabled (for efficiency).
# Install the model with: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Remove stop words, then apply the bigram phraser to what is left.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Lemmatise, keeping only nouns, adjectives, verbs and adverbs.
kept_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=kept_postags)

In [None]:
# Dictionary built from the lemmatised documents.
id2word = corpora.Dictionary(data_lemmatized)
# The lemmatised documents are the texts the corpus is built from.
texts = data_lemmatized
# Bag-of-words corpus: one doc2bow encoding per document.
corpus = list(map(id2word.doc2bow, texts))

In [None]:
# Train the LDA topic model. The tunable settings are gathered in one mapping
# so they can be reviewed and changed in a single place.
lda_settings = {
    'num_topics': 30,
    'random_state': 25,
    'update_every': 1,
    'chunksize': 100,
    'passes': 10,
    'alpha': 'auto',
    'per_word_topics': True,
}
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           **lda_settings)

In [None]:
# log_perplexity() returns the per-word likelihood bound, not the perplexity:
# for the bound, higher (closer to zero) is better. The perplexity itself is
# 2 ** (-bound), and for that value lower is better.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('\nPerplexity: ', np.exp2(-per_word_bound))

# Compute Coherence Score ('c_v' measure) from the lemmatised texts.
coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=data_lemmatized,
                                     dictionary=id2word,
                                     coherence='c_v')

coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Visualize the topics: prepare the pyLDAvis view from the fitted model, the
# bag-of-words corpus and the dictionary, then save it as an HTML file.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Alternative kept for reference (renders inline instead of saving to a file):
#pyLDAvis.display(vis, template_type='notebook')
pyLDAvis.save_html(vis, 'hearingcentretopics.html')