# Clean & Lemmatise Corpus

In [1]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
import json
from nltk.tokenize import RegexpTokenizer
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
# Read the raw corpus: a JSON document stored in a .txt file.
# The `with` block closes the file on exit, so no explicit close() is needed.
# JSON is UTF-8 by specification, so decode explicitly rather than relying on
# the platform default encoding.
with open('phys_all.txt', 'r', encoding='utf-8') as infile:
    data_dic = json.load(infile)

In [10]:
# Stop-word list: NLTK's English stop words plus a few extra tokens.
from nltk.corpus import stopwords
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

# Collect abstracts into three-year windows centred on 1995, 2005 and 2015.
# Records whose year falls outside every window are skipped.
abstracts_1995, abstracts_2005, abstracts_2015 = [], [], []
year_buckets = (
    ((1994, 1995, 1996), abstracts_1995),
    ((2004, 2005, 2006), abstracts_2005),
    ((2014, 2015, 2016), abstracts_2015),
)
for record in data_dic.values():
    for window, bucket in year_buckets:
        if record["year"] in window:
            bucket.append(record["abstract"])
            break

In [3]:
def tokenise_n_grams_lemma(abstracts_list, min_count=10):
    """Tokenise, lemmatise and n-gram-augment a list of abstracts.

    Processing steps, in order:

    1. Lower-case each abstract and split it into runs of word characters.
    2. Drop purely numeric tokens and single-character tokens.
    3. Lemmatise every token with the WordNet lemmatiser.
    4. Train bigram and trigram phrase models on the lemmatised corpus and
       append every detected phrase (a token containing ``'_'``) to the
       document it was found in.
    5. Remove stop words (the module-level ``stop_words`` list).

    The input list is not modified.

    Parameters
    ----------
    abstracts_list : list of str
        Raw abstract texts, one string per document.
    min_count : int, optional
        ``min_count`` passed to both phrase models; defaults to 10.

    Returns
    -------
    list of list of str
        One token list per input abstract, in the same order.
    """
    # Nothing to tokenise and nothing to train phrase models on.
    if not abstracts_list:
        return []

    from nltk.stem.wordnet import WordNetLemmatizer
    from gensim.models import Phrases
    from gensim.models.phrases import Phraser

    # Lower-case and split into word tokens. New lists are built so that the
    # caller's list of strings is left untouched and the calling cell can be
    # re-run safely.
    tokenizer = RegexpTokenizer(r'\w+')
    docs_tokens = [tokenizer.tokenize(abstract.lower()) for abstract in abstracts_list]

    # Remove numbers and words shorter than 2 characters.
    docs_filtered = [
        [token for token in doc if not token.isnumeric() and len(token) > 1]
        for doc in docs_tokens
    ]

    # Lemmatise.
    lemmatizer = WordNetLemmatizer()
    docs_lemm = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs_filtered]

    # Train the bigram model, then the trigram model on the bigram-merged corpus.
    bigram = Phrases(docs_lemm, min_count=min_count)
    trigram = Phrases(bigram[docs_lemm], min_count=min_count)

    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)

    # Append the detected bigrams/trigrams to the document they occur in.
    # NOTE: the '\w+' tokeniser keeps underscores, so a raw token that already
    # contains '_' (e.g. 'n_') also passes this check and is appended again.
    for doc in docs_lemm:
        # Materialise the phrases first so the document is not extended while
        # its transformed form is being iterated.
        phrases = [token for token in trigram_mod[bigram_mod[doc]] if '_' in token]
        doc.extend(phrases)

    # Remove stop words. This happens after phrase detection, so phrases that
    # contain a stop word (e.g. 'show_that') are kept.
    stop_set = set(stop_words)
    docs_lemm_clean = [[token for token in doc if token not in stop_set] for doc in docs_lemm]

    return docs_lemm_clean

In [12]:
# Build one cleaned, lemmatised, n-gram-augmented corpus per period.
# Each call trains its own phrase models on that period's abstracts only.
docs_lemm_clean_phys_1995 = tokenise_n_grams_lemma(abstracts_1995)
docs_lemm_clean_phys_2005 = tokenise_n_grams_lemma(abstracts_2005)
docs_lemm_clean_phys_2015 = tokenise_n_grams_lemma(abstracts_2015)

In [13]:
# Spot-check one processed abstract (index 1 of the 1994-1996 corpus).
docs_lemm_clean_phys_1995[1]

['describe',
 'simple',
 'model',
 'evolution',
 'incorporates',
 'branching',
 'extinction',
 'specie',
 'line',
 'also',
 'includes',
 'abiotic',
 'influence',
 'first',
 'principle',
 'approach',
 'taken',
 'probability',
 'speciation',
 'extinction',
 'defined',
 'purely',
 'term',
 'fitness',
 'landscape',
 'specie',
 'numerical',
 'simulation',
 'show',
 'total',
 'diversity',
 'fluctuates',
 'around',
 'natural',
 'system',
 'size',
 'n_',
 'rm',
 'nat',
 'weakly',
 'depends',
 'upon',
 'number',
 'connection',
 'per',
 'specie',
 'agreement',
 'known',
 'data',
 'real',
 'multispecies',
 'community',
 'numerical',
 'result',
 'confirmed',
 'approximate',
 'mean',
 'field',
 'analysis',
 'which_incorporates',
 'first_principle',
 'numerical_simulation',
 'show_that',
 'n__rm',
 'only_weakly',
 'depends_upon',
 'agreement_with',
 'numerical_result',
 'mean_field']

In [16]:
# Number of documents in each period's corpus, in the order 1995, 2005, 2015.
tuple(
    len(corpus)
    for corpus in (
        docs_lemm_clean_phys_1995,
        docs_lemm_clean_phys_2005,
        docs_lemm_clean_phys_2015,
    )
)

(30145, 111737, 176383)

#### How many words does the corpus have? 

In [18]:
# Total token count per period (unigrams plus appended n-grams, after
# stop-word removal), summed over every document in the corpus.
years = [1995, 2005, 2015]
datas = [
    docs_lemm_clean_phys_1995,
    docs_lemm_clean_phys_2005,
    docs_lemm_clean_phys_2015,
]

print("Words in corpus for different years:")
for year, data in zip(years, datas):
    counter = sum(len(doc) for doc in data)
    print(year,":", counter)

Words in corpus for different years:
1995 : 2265384
2005 : 9783946
2015 : 19108495


#### Save data to file 

In [12]:
# Write the 1994-1996 corpus to disk as JSON. The `with` block closes the
# file on exit, so no explicit close() is needed.
with open('docs_lemm_clean_phys_1995.txt', 'w') as outfile:
    json.dump(docs_lemm_clean_phys_1995, outfile)

In [13]:
# Write the 2004-2006 corpus to disk as JSON. The `with` block closes the
# file on exit, so no explicit close() is needed.
with open('docs_lemm_clean_phys_2005.txt', 'w') as outfile:
    json.dump(docs_lemm_clean_phys_2005, outfile)

In [14]:
# Write the 2014-2016 corpus to disk as JSON. The `with` block closes the
# file on exit, so no explicit close() is needed.
with open('docs_lemm_clean_phys_2015.txt', 'w') as outfile:
    json.dump(docs_lemm_clean_phys_2015, outfile)

### Which documents were published when?

In [4]:
# Stop-word list: NLTK's English stop words plus a few extra tokens.
from nltk.corpus import stopwords
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

# Map a running document number to its processed abstract and publication year.
# NOTE(review): every abstract is passed to tokenise_n_grams_lemma as a
# one-element list, so the phrase models are trained on that single abstract
# alone. Confirm this is intended; it differs from the per-period corpora,
# where phrases are learned across all abstracts of the period.
abs2year_dic = {}
for num, record in enumerate(data_dic.values()):
    lemmatised = tokenise_n_grams_lemma([record["abstract"]])
    abs2year_dic[num] = {"abstract": lemmatised[0], "year": record["year"]}

In [5]:
# Save the document-number -> {abstract, year} mapping as JSON.
# JSON object keys are always strings, so the integer keys of abs2year_dic
# come back as '0', '1', ... when the file is loaded again.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('abs2year_dic.txt', 'w') as outfile:
    json.dump(abs2year_dic, outfile)