In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from os.path import join, dirname
import nltk
import spacy
# Requires the model once per environment: python3 -m spacy download en_core_web_sm
spacy_nlp = spacy.load('en_core_web_sm')

#sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Base directory for all relative paths. '__file__' is a plain string literal here,
# so realpath() resolves it against the current working directory and dirname()
# strips that last component off again.
BASE_DIR = dirname(os.path.realpath('__file__'))

In [9]:
# Load the combined corpus; the path is assembled component-wise so it works on any OS.
# The file is semicolon-delimited.
data = pd.read_csv(
    join(BASE_DIR, 'data', 'combined_data.csv'),
    sep=";",
)

In [10]:
# Peek at the first five rows of the freshly loaded frame.
print(data.head(n=5))

                                                text  label
0  Image copyright Getty Images\nOn Sunday mornin...      1
1  LONDON (Reuters) - “Last Flag Flying”, a comed...      1
2  The feud broke into public view last week when...      1
3  MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...      1
4  Country singer Jason Aldean, who was performin...      1


In [11]:
# Class balance before any cleaning.
print(data['label'].value_counts())

0    18013
1    12219
Name: label, dtype: int64


In [12]:
# keep=False discards every row whose text occurs more than once, including the
# first occurrence.
# NOTE(review): confirm that dropping all copies (rather than keeping one) is intended.
data.drop_duplicates(subset="text", keep=False, inplace=True)
# Class balance after de-duplication.
print(data['label'].value_counts())

0    10110
1     5569
Name: label, dtype: int64


In [8]:
# Hold out 10% of the rows as a test set, stratified on the label so both splits
# keep the same class proportions; the fixed seed makes the split repeatable.
X = data["text"]
y = data["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=0,
)

In [13]:
from nltk.tokenize.toktok import ToktokTokenizer
from spacy.lang.en.stop_words import STOP_WORDS

# Build a private copy of spaCy's English stop words with the negations 'no' and
# 'not' excluded. Copying (instead of calling .remove() on STOP_WORDS itself) leaves
# spaCy's shared set untouched and lets this cell be re-run without a KeyError.
stopword_list = set(STOP_WORDS) - {'no', 'not'}
tokenizer = ToktokTokenizer()


In [14]:
import unicodedata
def remove_accented_chars(text):
    """Return ``text`` reduced to its ASCII characters.

    The string is NFKD-normalised first, which splits accented letters into a
    base letter plus combining marks; encoding to ASCII with ``'ignore'`` then
    drops the marks and any other non-ASCII character.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [17]:
from lib.contractions import CONTRACTION_MAP
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Replace contractions in ``text`` with their expanded forms.

    Args:
        text: Input string.
        contraction_mapping: Dict mapping a contraction to its expansion.

    Returns:
        ``text`` with every known contraction expanded (matching is
        case-insensitive and the first character of the matched text is
        preserved), and with all remaining apostrophes removed.
    """
    # Imported locally because this cell sits above the notebook's `import re` cell.
    import re

    # Try longer keys first: regex alternation takes the first alternative that
    # matches, so a short key must not shadow a longer key it is a prefix of.
    ordered_keys = sorted(contraction_mapping, key=len, reverse=True)

    if ordered_keys:
        # Escape keys so they are matched literally, never as regex syntax.
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
            flags=re.IGNORECASE | re.DOTALL)
        # Case-insensitive fallback for matches whose exact casing is not a key.
        lowercase_lookup = {key.lower(): value
                            for key, value in contraction_mapping.items()}

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or lowercase_lookup.get(match.lower()))
            if not expanded_contraction:
                # No usable expansion: leave the matched text as it was.
                return match
            # Keep the casing of the first character as written in the text.
            return match[0] + expanded_contraction[1:]

        text = contractions_pattern.sub(expand_match, text)

    # Drop any apostrophes that were not part of a known contraction.
    return re.sub("'", "", text)

In [18]:
import re

def remove_special_characters(text, remove_digits=False):
    """Strip everything except ASCII letters, digits and whitespace from ``text``.

    Args:
        text: Input string.
        remove_digits: When True, digits are removed as well.

    Returns:
        The filtered string.
    """
    # The upper-case range must end at 'Z': the range 'A-z' also spans the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [19]:
def simple_stemmer(text):
    """Return ``text`` with each whitespace-separated word Porter-stemmed.

    Words are re-joined with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stemmed_words = map(stemmer.stem, text.split())
    return ' '.join(stemmed_words)

In [20]:
def lemmatize_text(text):
    """Return ``text`` with every token replaced by its spaCy lemma.

    Tokens whose lemma is the placeholder ``'-PRON-'`` keep their original
    surface form. Tokens are re-joined with single spaces.
    """
    lemmas = []
    for token in spacy_nlp(text):
        lemma = token.lemma_
        lemmas.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(lemmas)

In [21]:
def remove_stopwords(text, is_lower_case=False):
    """Return ``text`` without the tokens found in ``stopword_list``.

    Args:
        text: Input string; it is split with the module-level ``tokenizer``.
        is_lower_case: When True, tokens are compared to the stop-word list as
            they are; otherwise each token is lower-cased for the comparison
            only (the kept tokens retain their casing).

    Returns:
        The surviving tokens joined with single spaces.
    """
    kept_tokens = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        probe = token if is_lower_case else token.lower()
        if probe not in stopword_list:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [29]:
#function to preprocess each row of text
def process_text(doc):
    """Run the text-cleaning pipeline on a single document.

    Args:
        doc: The document text. Empty strings, None and any other non-string
            value (for example the float NaN that pandas uses for a missing
            cell) are returned unchanged.

    Returns:
        The cleaned text: accents removed, special characters stripped, then
        lemmatized. Stemming and stop-word removal are available but disabled.
    """
    # Only real, non-empty strings can go through the string-based steps below;
    # a bare truthiness check would let NaN through because it is truthy.
    if not isinstance(doc, str) or not doc:
        return doc

    # Comment out whichever step is not required
    doc = remove_accented_chars(doc)
    doc = remove_special_characters(doc)
    #doc = simple_stemmer(doc)
    doc = lemmatize_text(doc)
    #doc = remove_stopwords(doc)

    return doc
    

In [34]:
#data.text=data.text.apply(process_text)
#print(data.head(10))

In [28]:
# Quick check of the pipeline on one hand-written sentence.
print(
    process_text("Well this was fun! What do you think? 123#@! I'm going to try this")
)

well thi wa fun what do you think 123 I be go to tri thi


In [30]:
# Clean every document; the 'text' column is overwritten with the processed text.
data['text'] = data['text'].map(process_text)

In [31]:
# Peek at the first five rows after preprocessing.
print(data.head(n=5))

                                                text  label
0  imag copyright getti imag On sunday morn donal...      1
1  london reuter last flag fli a comedydrama abou...      1
2  the feud break into public view last week when...      1
3  mexico citi reuter egypt cheiron hold limit wi...      1
6  In 2012 kansa lawmak lead by gov sam brownback...      1


In [36]:
# Persist the processed frame next to the input file, using the same
# semicolon delimiter and without the index column.
data.to_csv(
    join(BASE_DIR, 'data', 'processed.csv'),
    sep=';',
    encoding='utf-8',
    index=False,
)