In [1]:
import string
import pandas as pd
import codecs
import re

In [2]:
# Load the claims/articles table. Forward slashes are accepted on Windows as
# well as POSIX, and avoid backslash pairs such as "\d" or "\e" being parsed
# as (invalid) string escape sequences.
data = pd.read_csv("mscproject/data/emergent/my_claims_csv.csv")

In [3]:
# Preview the first rows of the frame as loaded, before any cleaning.
data.head()

Unnamed: 0,claimId,claimTruthiness,claimHeadline,articleId,articleVersion,articleHeadline,articleHeadlineStance
0,1,unknown,"Meijer is offering $100 off ""Back to School"" c...",0,1,$100 Meijer Coupon - Snopes.com,for
1,1,unknown,"Meijer is offering $100 off ""Back to School"" c...",1,1,Fake Meijer $100 back-to-school coupon goes vi...,for
2,1,unknown,"Meijer is offering $100 off ""Back to School"" c...",2,1,$100 Meijer Coupon - Hoax - Trendolizer,for
3,1,unknown,"Meijer is offering $100 off ""Back to School"" c...",3,1,[$100 OFF] Meijer Coupon & Deals April 2017 | ...,for
4,1,unknown,"Meijer is offering $100 off ""Back to School"" c...",4,1,$10 off $100 in Visa Gift Cards at Meijer - Fr...,for


In [4]:
def _clean_text(data):
    data = data.copy()
    # Remove specific articles with known issues
    to_drop = [
        '58eacfb0-7af4-11e4-b794-93ed794d7b91',  # Contains: "Sorry - this page has been removed."
        '8ac9a380-c383-11e4-9435-a96703525a9e'   # Contains only phrase: "Jonathan S. Geller"
    ]
    data = data[~data.articleId.isin(to_drop)]

    # Strip Claim: prefix from claim headline
    def strip_claim_suffix(s):
        if s.startswith('Claim:'):
            return s[7:]
        else:
            return s
    data['claimHeadline'] = data.claimHeadline.apply(strip_claim_suffix)

    # Collection of funcs to apply to claim and article, in order
    funcs = []

    # Convert chars to UTF
    def convert_to_utf(s):
        #return s.decode('utf8')
        return s.decode('utf8', errors = 'replace')

    funcs.append(convert_to_utf)

    # Strip words containing article source prefix
    _strip_words = ['REPORT', 'REPORTS', 'PODCAST',
                    'CNN', 'CNBC', 'Net Extra', 'WSJ']

    def strip_source_prefix(s):
        for w in _strip_words:
            s = re.sub(w + ':', '', s, flags=re.IGNORECASE)
        return s

    funcs.append(strip_source_prefix)

    # Clean up some unicode quotations poop
    utf_quotation_marks = [u'\u2032', u'\u2019', u'\u2018',
                           u'\u201C', u'\u201D']

    def convert_quotations(s):
        for c in utf_quotation_marks:
            s = s.replace(c, "'")
        return s

    funcs.append(convert_quotations)

    # Re-introduce 's and 't which were encoded as ?s and ?t in some cases
    def convert_bad_apostrophe(s):
        s = s.replace('?s', '\'s')
        return s.replace('?t', '\'t')

    funcs.append(convert_bad_apostrophe)

    # Drop remaining non-ascii stuff we don't want
    def drop_non_ascii(s):
        return s.encode('utf-8').decode('ascii', 'ignore')

    funcs.append(drop_non_ascii)

    # Expand contractions
    def expand_contractions(s):
        for c, e in get_contraction_mappings().items():
            s = re.sub(c, e, s, flags=re.IGNORECASE)
        return s

    # funcs.append(expand_contractions)

    my_punctuation = ''.join(set(string.punctuation).difference(['?', ',', '.', ':', '-', '\'']))

    # Strip out any spurious punctuation
    # This should preserve apostrophes, comma, question-mark and full-stop.
    def strip_punctuation(s):
        s = [w.translate(dict.fromkeys(map(ord, my_punctuation))) for w in s.split(" ")]
        s = filter(None, s)
        return ' '.join(s)

    funcs.append(strip_punctuation)

    # Strip out any nested quotation marks
    def strip_internal_quotations(s):
        s = re.sub(r"^'", '', s)
        s = re.sub(r"\s'", " ", s)
        s = re.sub(r"'\s", " ", s)
        s = re.sub(r"'$", '', s)
        s = re.sub(r"[:.,;]'", lambda m: m.group(0)[0], s)
        s = re.sub(r"'[:.,;]", lambda m: m.group(0)[1], s)
        return s

    funcs.append(strip_internal_quotations)

    # Fix problem with words like ?abc?
    def drop_bracketing_question_marks(s):
        s = re.sub(r'\?\w[\w\s]*\?', lambda m: m.group(0)[1: -1], s)
        return s

    funcs.append(drop_bracketing_question_marks)

    for f in funcs:
        print 'Applying function:', f.__name__
        data['articleHeadline'] = data.articleHeadline.apply(f)
        data['claimHeadline'] = data.claimHeadline.apply(f)

    return data

In [5]:
# Run the cleaning pipeline; `data` is rebound to the frame it returns.
data = _clean_text(data)

Applying function: convert_to_utf
Applying function: strip_source_prefix
Applying function: convert_quotations
Applying function: convert_bad_apostrophe
Applying function: drop_non_ascii
Applying function: strip_punctuation
Applying function: strip_internal_quotations
Applying function: drop_bracketing_question_marks


In [6]:
# Preview the first rows again to check the effect of the cleaning steps.
data.head()

Unnamed: 0,claimId,claimTruthiness,claimHeadline,articleId,articleVersion,articleHeadline,articleHeadlineStance
0,1,unknown,Meijer is offering 100 off Back to School coup...,0,1,100 Meijer Coupon - Snopes.com,for
1,1,unknown,Meijer is offering 100 off Back to School coup...,1,1,Fake Meijer 100 back-to-school coupon goes vir...,for
2,1,unknown,Meijer is offering 100 off Back to School coup...,2,1,100 Meijer Coupon - Hoax - Trendolizer,for
3,1,unknown,Meijer is offering 100 off Back to School coup...,3,1,100 OFF Meijer Coupon Deals April 2017 HotDeal...,for
4,1,unknown,Meijer is offering 100 off Back to School coup...,4,1,10 off 100 in Visa Gift Cards at Meijer - Freq...,for


In [7]:
# Write the cleaned frame to my_claims_csv.csv (path is relative to the
# current working directory).
# NOTE(review): to_csv writes the row index by default, which shows up as an
# extra unnamed leading column when the file is read back; consider
# index=False if downstream readers do not rely on that column — confirm.
data.to_csv("my_claims_csv.csv")