In [1]:
# Imports

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


import numpy as np
import os
import pandas as pd
import re
import string
import regex

import spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS

import gensim
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
import pyLDAvis.gensim

import time



In [2]:
# --- Locate the cleaned and preprocessed data:

# Folder that holds the cleaned CSVs; change this path to wherever they are stored.
data_location_cleaned = '.\\data_cleaned_test\\'
# os.listdir returns entries in arbitrary order, so sort them to get a
# reproducible processing order from run to run.
my_directory_cleaned = sorted(os.listdir(data_location_cleaned))
print(my_directory_cleaned)

['_cleaned_1864_test.csv', '_cleaned_1900_test.csv']


In [3]:
# Load spaCy's small English pipeline; `nlp` is the shared NLP object for this notebook.
nlp = spacy.load("en_core_web_sm")

# Report the installed spaCy version alongside the results.
print('spaCy Version: %s' % spacy.__version__)

spaCy Version: 2.1.8


In [4]:
# To add a single stopword:
#nlp.Defaults.stop_words.add("nan") 

# Check pre-defined stop words in spacy:
#print(nlp.Defaults.stop_words)

# Check whether a particular word is a stopword:
#print(nlp.vocab["nan"].is_stop)

In [5]:
# Inspect spaCy's built-in English stop-word list (STOP_WORDS is imported in the first cell).
print(STOP_WORDS)

# A bare expression in the middle of a cell is evaluated but never displayed,
# so print the size of the list explicitly.
print(len(STOP_WORDS))

# Check whether a particular word is flagged as a stop word.
# As the last expression of the cell, its value is shown as the cell output.
nlp.vocab["yet"].is_stop

{'doing', 'forty', 'he', 'already', 'him', 'call', 'thence', 'say', 'its', 'whatever', 'n‘t', 'other', 'against', 'above', "'ve", 'via', 'wherein', 'became', 'moreover', 'too', 'itself', 'somewhere', 'herein', 'beforehand', 'nor', 'amount', 'towards', "'s", 'a', 'part', '‘re', 'most', 'perhaps', 'thereafter', 'take', 'show', 'please', 'yourselves', 'mine', 'within', 'few', 'go', 'be', 'bottom', 'per', 'cannot', 'whereafter', 'her', 'who', 'we', 'much', 'each', 'besides', 'thereby', 'side', 'everywhere', 'nevertheless', 'hence', 'beside', 'they', 'in', 'hereafter', 'hers', 'an', 'has', 'been', 'less', 'made', 'others', 'still', 'were', 'should', 'have', 'from', 'wherever', 'eleven', 'formerly', 'until', 'just', 'along', 'as', 'being', 'first', 'using', 'hereby', 'sixty', 'anyone', 'becomes', 'although', 'six', 'these', 'anyway', 'further', 'down', 'out', "'ll", 'used', 'since', 'least', 'this', 'otherwise', 'sometime', 'either', 'may', 'noone', "'re", 'into', 'whenever', 'same', '’s', '

True

### Adding custom "stop-words":
*  For example, because we analyze a newspaper corpus, it makes sense to remove words which are common in newspapers, such as "today", "Mr", "Ms", "said", etc., as they carry no significant value for topic modelling.



In [6]:
# Custom stop words to remove in addition to spaCy's defaults.
custom_stopwords = {"the", "it"}

# Combine the custom stop words with spaCy's stop words.
# Tokens are lower-cased before they are checked against this set, so the custom
# entries are folded to lower case as well; otherwise an entry such as "Mr"
# could never match a token.
my_stopwords = STOP_WORDS | {word.lower() for word in custom_stopwords}

In [7]:
# Preprocess the cleaned data for use in topic modelling.

def preproscess_text(docs, logging=False):
    """Lemmatize documents with spaCy and remove stop words.

    Each document is passed through the module-level ``nlp`` pipeline with the
    parser and NER components disabled. Every token's lemma is lower-cased and
    stripped of surrounding whitespace. The following are dropped:

    * tokens whose lemma is ``'-PRON-'`` (the pronoun placeholder lemma),
    * lemmas contained in the module-level ``my_stopwords`` set,
    * lemmas that are empty after stripping (whitespace-only tokens).

    The remaining lemmas are joined with single spaces.

    Parameters
    ----------
    docs : iterable of str
        Document texts. ``len(docs)`` is only needed when a progress line is
        actually printed.
    logging : bool, default False
        If True, print a progress line after every 1000 processed documents.

    Returns
    -------
    pandas.Series
        One processed string per input document, in input order. The Series
        has a fresh default integer index; the index of ``docs`` is not kept.
    """
    texts = []
    for count, doc in enumerate(docs, start=1):
        parsed = nlp(doc, disable=['parser', 'ner'])
        lemmas = (
            tok.lemma_.lower().strip()
            for tok in parsed
            if tok.lemma_ != '-PRON-'
        )
        # Whitespace-only tokens strip to '' and would otherwise leave runs of
        # spaces in the joined text, so empty lemmas are filtered out as well.
        kept = [lemma for lemma in lemmas if lemma and lemma not in my_stopwords]
        texts.append(' '.join(kept))
        # Report after the document is finished so the count reflects completed work.
        if logging and count % 1000 == 0:
            print("Processed %d out of %d documents." % (count, len(docs)))
    return pd.Series(texts)

In [8]:
# === Preprocess the cleaned data and save it to new CSVs
#     for later use in topic modelling.

# Output folder, built with os.path.join so the separator matches the host OS.
path_cleaned = os.path.join('.', 'data_cleaned_processed')
# DataFrame.to_csv does not create missing parent folders, so make sure it exists.
os.makedirs(path_cleaned, exist_ok=True)

processing_start_time_total = time.time()
for y in my_directory_cleaned:
    # The directory listing can contain entries that are not CSV files
    # (for example a .ipynb_checkpoints folder); skip them instead of crashing.
    if not y.lower().endswith('.csv'):
        continue

    processing_start_time_year = time.time()
    doc_text = pd.read_csv(os.path.join(data_location_cleaned, y), encoding='utf-8', dtype=str)
    # Drop rows with a missing value in any column, then renumber the rows so
    # they line up with the 0..n-1 index of the Series returned by preproscess_text.
    doc_text = doc_text.dropna(how='any', axis=0).reset_index(drop=True)

    # Combine the "title" and "article" columns: [OPTIONAL]
    doc_text["article"] = doc_text["title"].map(str) + " " + doc_text["article"].map(str)

    print("Processing year: ", str(y), " ...")
    cleaned_ = preproscess_text(doc_text["article"], logging=True)
    # Replace the raw text with the processed text.
    doc_text['article'] = cleaned_

    processing_end_time_year = time.time()
    print("-"*100)
    print("Processed Time for: ", str(y), "-->", processing_end_time_year-processing_start_time_year)

    # Save the preprocessed data to CSV:
    print("Saving processed data ... ")
    # '_cleaned_1864_test.csv' -> '1864_test'
    title_ = os.path.splitext(y)[0].replace('_cleaned_', '')
    doc_text.to_csv(os.path.join(path_cleaned, '_cleaned_'+'spacy_'+title_+'.csv'), index=False)


processing_end_time_total = time.time()
print("*"*100)
print("Total time: ", processing_end_time_total-processing_start_time_total)

Processing year:  _cleaned_1864_test.csv  ...
Processed 1000 out of 2998 documents.
Processed 2000 out of 2998 documents.
----------------------------------------------------------------------------------------------------
Processed Time for:  _cleaned_1864_test.csv --> 73.08807158470154
Saving processed data ... 
Processing year:  _cleaned_1900_test.csv  ...
----------------------------------------------------------------------------------------------------
Processed Time for:  _cleaned_1900_test.csv --> 5.750996112823486
Saving processed data ... 
****************************************************************************************************
Total time:  79.18606686592102


In [11]:
#print(doc_text.head())