In [None]:
# Install the required packages (conda and pip) into the environment of the current Jupyter kernel
import sys

# !conda install --yes --prefix {sys.prefix} dill
# !conda install --yes --prefix {sys.prefix} spacy
# !conda install --yes --prefix {sys.prefix} numpy
# !conda install --yes --prefix {sys.prefix} spacy
# !{sys.executable} -m spacy download en
# !{sys.executable} -m pip install contractions
# !conda install --yes --prefix {sys.prefix} -c glemaitre imbalanced-learn
# !conda install --yes --prefix {sys.prefix} gensim

# import nltk
# nltk.download()



In [None]:
import os
import pandas as pd
import dill as pickle
import spacy
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import contractions
from bs4 import BeautifulSoup
import unicodedata
from string import punctuation
import re
import numpy as np
import math
from collections import defaultdict
pd.options.display.max_colwidth = 200

In [None]:
# These might need to stay global until I can figure out how to include in the sklearn Pipeline.
# nlp = spacy.load('en', parse = False, tag=False, entity=False)
nlp = spacy.load('en')
tokenizer = ToktokTokenizer()
stopword_list = stopwords.words('english')
#we want the negatives
stopword_list.remove('no')
stopword_list.remove('not')

def strip_html_tags(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed by BeautifulSoup using the "html.parser" backend and the
    result of ``get_text()`` on the parsed tree is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def replace_accented_chars(text):
    """Return ``text`` reduced to its ASCII characters.

    The string is first put into Unicode normal form KD (NFKD), which applies the
    compatibility decomposition so accented letters split into a base letter plus
    combining marks. Encoding to ASCII with errors ignored then drops everything
    that is not plain ASCII, including the combining marks.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def expand_contractions(text, contraction_mapping=contractions.contractions_dict):
    """Expand contractions in ``text`` and strip any remaining apostrophes.

    Parameters
    ----------
    text : str
        Text to process.
    contraction_mapping : dict
        Maps a contraction (e.g. "can't") to its expansion (e.g. "cannot").
        Matching is case-insensitive.

    Returns
    -------
    str
        ``text`` with every recognised contraction replaced by its expansion and
        all remaining "'" characters removed.

    Notes
    -----
    * Keys are escaped before being joined into the pattern, so keys containing
      regex metacharacters (such as ".") are matched literally.
    * Longer keys are tried first so that "can't've" is not consumed as "can't"
      followed by a stray "'ve".
    * A match that cannot be resolved in the mapping is left untouched. The original
      callback returned None in that case, which did not put the matched text back.
    """
    # Empty keys would make the alternation match the empty string everywhere.
    keys = [key for key in contraction_mapping.keys() if key]
    if not keys:
        return re.sub("'", "", text)

    # Longest first: regex alternation takes the first alternative that matches.
    keys.sort(key=len, reverse=True)
    contractions_pattern = re.compile('({})'.format('|'.join(map(re.escape, keys))),
                                      flags=re.IGNORECASE|re.DOTALL)

    # Fallback for case-insensitive matches whose exact and lower-cased spellings are
    # both missing from the mapping (e.g. the key is "I'm" and the text has "i'm").
    lowered_mapping = {key.lower(): value for key, value in contraction_mapping.items()}

    def expand_match(contraction):
        match = contraction.group(0)
        # Same precedence as before (exact spelling, then lower-cased spelling),
        # plus the case-folded fallback table.
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower())
                                or lowered_mapping.get(match.lower()))
        if not expanded_contraction:
            # Nothing to expand to: keep the original text.
            return match
        first_char = match[0]
        # Carry over the casing of the first character, but only when it is the same
        # letter; otherwise "'cause" -> "because" would turn into "'ecause".
        if first_char.lower() == expanded_contraction[0].lower():
            expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def lemmatize_text(text):
    """Return ``text`` with each token replaced by its lemma.

    The text is run through the module-level ``nlp`` pipeline. Tokens whose lemma is
    the placeholder '-PRON-' keep their original text. The resulting pieces are
    joined with single spaces.
    """
    pieces = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return ' '.join(pieces)

def remove_stopwords(text, is_lower_case=False):
    """Return ``text`` with stopwords removed.

    The text is split with the module-level ``tokenizer``; each token is stripped of
    surrounding whitespace and dropped when it appears in ``stopword_list``.

    Parameters
    ----------
    text : str
        Text to filter.
    is_lower_case : bool
        When True, tokens are compared to the stopword list as-is. When False,
        each token is lower-cased for the comparison only; kept tokens retain
        their original casing.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        candidate = token if is_lower_case else token.lower()
        if candidate not in stopword_list:
            kept.append(token)
    return ' '.join(kept)
    
# Patterns used by normalize_corpus. They are compiled once when this cell runs rather
# than on every call, because normalize_corpus is applied to one comment at a time.
#url regex
_URL_RE = re.compile(r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""")
#email address regex; not anchored with ^...$, so addresses inside a longer comment match too
_EMAIL_RE = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
#phone number regex
_PHONE_RE = re.compile(r'(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?')
#ssn regex; digit lookarounds replace ^...$ so an SSN inside a longer comment matches too
_SSN_RE = re.compile(r'(?<!\d)(?!219-09-9999|078-05-1120)(?!666|000|9\d{2})\d{3}[-]?(?!00)\d{2}[-]?(?!0{4})\d{4}(?!\d)')
# words of at least three letters a-z (so nothing containing a digit)
_WORD_RE = re.compile(r'\b[a-z][a-z][a-z]+\b')
# Compiled profanity pattern (or None when the list is empty), keyed by CSV path.
_PROFANITY_RE_CACHE = {}


def _get_profanity_regex():
    """Return the compiled profanity pattern, or None when there are no usable terms.

    Terms are read from ``<cwd>/corpora/profanity.csv`` the first time a given path is
    seen and cached afterwards; re-run this cell to pick up edits to the file.
    """
    file_path = os.path.join(os.getcwd(),"corpora","profanity.csv")
    if file_path not in _PROFANITY_RE_CACHE:
        raw_terms = pd.read_csv(file_path).values.ravel().tolist()
        # Skip NaN/non-string cells (re.escape rejects them) and blank cells, which
        # would produce a pattern that matches between ordinary words.
        terms = {term for term in raw_terms if isinstance(term, str) and term.strip()}
        if terms:
            # Longest first (ties broken alphabetically) so multi-word terms win over
            # their prefixes and the result does not depend on set iteration order.
            ordered = sorted(terms, key=lambda term: (-len(term), term))
            compiled = re.compile(r'\b%s\b' % r'\b|\b'.join(map(re.escape, ordered)))
        else:
            compiled = None
        _PROFANITY_RE_CACHE[file_path] = compiled
    return _PROFANITY_RE_CACHE[file_path]


def normalize_corpus(doc, html_stripping=True, contraction_expansion=True, text_lemmatization=True, 
                     stopword_removal=True):
    """Normalise one document (a single comment string) for classification.

    Steps, in order:
      1. lower-case the text;
      2. replace profanity with the token "criticaster";
      3. replace email addresses, phone numbers, SSNs and URLs with "blatherskite";
      4. optionally strip HTML, then optionally expand contractions;
      5. keep only words made of 3 to 17 letters a-z, dropping the literal "nan";
      6. optionally lemmatize, then optionally remove stopwords;
      7. return "spam" if nothing is left.

    Parameters
    ----------
    doc : str
        The raw document.
    html_stripping, contraction_expansion, text_lemmatization, stopword_removal : bool
        Switches for the optional steps above; all default to True.

    Returns
    -------
    str
        The normalised document.

    Raises
    ------
    FileNotFoundError
        If ``corpora/profanity.csv`` does not exist under the current directory.

    Notes
    -----
    The email and SSN patterns used to be anchored with ``^...$`` and therefore only
    matched when the whole document was an email address or SSN. They now match
    inside longer text as well, which changes the tokens produced for such comments.
    NOTE(review): a model trained on the previous output may warrant retraining.
    """
    doc = doc.lower()

    profanity_regex = _get_profanity_regex()
    if profanity_regex is not None:
        doc = profanity_regex.sub("criticaster", doc)
    doc = _EMAIL_RE.sub('blatherskite',doc)
    doc = _PHONE_RE.sub('blatherskite',doc)
    doc = _SSN_RE.sub('blatherskite',doc)
    doc = _URL_RE.sub('blatherskite',doc)

    # strip HTML
    if html_stripping:
        doc = strip_html_tags(doc)
    # expand contractions
    if contraction_expansion:
        doc = expand_contractions(doc)
    # at least three characters long, cannot contain a number, and no more than 17 chars long
    words = _WORD_RE.findall(doc)
    doc = ' '.join(w for w in words if w != 'nan' and len(w) <= 17)
    # lemmatize text
    if text_lemmatization:
        doc = lemmatize_text(doc)
    # remove stopwords
    if stopword_removal:
        doc = remove_stopwords(doc, is_lower_case=True)
    # never return an empty document
    if len(doc) == 0:
        doc = "spam"
    return doc

In [None]:
class NewData():
    """Classify survey comments that are newer than the most recent labeled data.

    Parameters
    ----------
    recent_data_path : str
        Excel workbook of already-labeled data; its latest 'Date' is the cut-off.
    new_data_path : str
        CSV export containing the new, unlabeled comments.
    normalize_corpus : callable
        Function applied to each comment to build 'Normalized Value Comment'.
    """
    # Default location of the pickled estimator; can be overridden on an instance.
    best_model_path = os.path.join('best_estimators','label_best_estimator.pkl')

    def __init__(self, recent_data_path, new_data_path, normalize_corpus):
        self.recent_data_path = recent_data_path
        self.new_data_path = new_data_path
        self.normalize_corpus = normalize_corpus

    def get_data(self):
        """Load the new comments dated after the latest labeled row.

        Returns
        -------
        pandas.DataFrame
            Rows of the new-data CSV whose 'Date' is later than the maximum 'Date' in
            the labeled workbook and whose 'Value Comment' is not missing, with an
            added 'Normalized Value Comment' column.
        """
        recent_data = pd.read_excel(self.recent_data_path)
        last_date = pd.to_datetime(recent_data['Date']).max()

        # Use the path stored on the instance; this previously read a global variable
        # named new_data_path and only worked when the notebook happened to define one.
        new_data = pd.read_csv(self.new_data_path)
        new_data = new_data.rename({'Please tell us what you value most about this website.':'Value Comment'},axis=1)
        new_data['Date'] = pd.to_datetime(new_data['Date'])
        # .copy() so the column added below is written to an independent frame.
        new_data = new_data[new_data['Date'] > last_date].dropna(subset=['Value Comment']).copy()
        new_data['Normalized Value Comment'] = new_data['Value Comment'].apply(self.normalize_corpus)

        return new_data

    def predict(self):
        """Score the new comments and write the results to an Excel workbook.

        Loads the estimator from ``best_model_path``, predicts on the frame returned
        by ``get_data()`` and writes the sorted results to
        ``results/ClassificationResults.xlsx`` (creating the directory if needed).

        Returns
        -------
        pandas.DataFrame
            'Value Comment', 'Predictions' and 'Prediction Probabilities Delta',
            sorted by ascending delta.
        """
        # NOTE(security): unpickling can execute arbitrary code; only load estimator
        # files that come from a trusted source.
        with open(self.best_model_path, 'rb') as f:
            pickled_model = pickle.load(f)

        new_data = self.get_data()
        preds = pickled_model.predict(new_data)
        pred_probs = np.asarray(pickled_model.predict_proba(new_data))

        new_data['Predictions'] = preds
        # Presumably column 0 is the ham class and column 1 the spam class -
        # TODO confirm against the estimator's class order.
        new_data['Ham Prediction Probability'] = pred_probs[:, 0]
        new_data['Spam Prediction Probability'] = pred_probs[:, 1]
        new_data['Prediction Probabilities Delta'] = (
            new_data['Ham Prediction Probability'] - new_data['Spam Prediction Probability']
        ).abs()

        classifier_results = new_data.sort_values(by='Prediction Probabilities Delta')[['Value Comment','Predictions','Prediction Probabilities Delta']]

        results_path = os.path.join('results','ClassificationResults.xlsx')
        os.makedirs(os.path.dirname(results_path), exist_ok=True)
        # The context manager saves and closes the workbook (ExcelWriter.save() was
        # removed in pandas 2.0).
        with pd.ExcelWriter(results_path) as writer:
            classifier_results.to_excel(writer, sheet_name='Classification Results')
        return classifier_results

In [None]:
# Input locations. The labeled workbook supplies the cut-off date; the CSV holds the
# newest export to classify.
# insert name of newest data file below
new_data_path = os.path.join('unlabeled_data','201806121828-USA.gov_Cus-1.1.csv')
recent_data_path = os.path.join('labeled_data','labeled_values.xlsx')

nd = NewData(
    recent_data_path=recent_data_path,
    new_data_path=new_data_path,
    normalize_corpus=normalize_corpus,
)
# Estimator to load when predicting.
nd.best_model_path = os.path.join('best_estimators','label_best_estimator.pkl')

In [None]:
# Load the new comments for inspection. nd.predict() calls get_data() again on its
# own, so this cell is not a prerequisite for the prediction step.
new_data = nd.get_data()

In [None]:
# Score the new comments; this also writes results/ClassificationResults.xlsx.
classifier_results = nd.predict()

In [None]:
# Show the results, sorted so the smallest probability deltas come first.
classifier_results