In [10]:
import pandas as pd

papers = pd.read_csv(
    "../rawData/sampler_10ktexts_perdecade.ALL2.tsv",
    #"../rawData/sampler_50ktexts_perdecade.headed.ALL.tsv",
    sep='\t',
    names=["timePeriod", "index", "rawText"]
)

papers.drop(papers[~papers['timePeriod'].str.endswith('POS.rand')].index, inplace=True)

papers.head()

Unnamed: 0,timePeriod,index,rawText
0,180X.POS.rand,1,The_DT hon_NN ._SENT
1,180X.POS.rand,2,The_DT gallant_JJ general_NN who_WP commanded_...
2,180X.POS.rand,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d..."
3,180X.POS.rand,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ..."
4,180X.POS.rand,5,In_IN former_JJ times_NNS and_CC in_IN former_...


In [11]:
papers['timePeriod'] = papers['timePeriod'].map(lambda x: x.rstrip('X.POS.rand'))
papers['timePeriod'] = papers['timePeriod'].astype(str) + '0'
papers.head()

Unnamed: 0,timePeriod,index,rawText
0,1800,1,The_DT hon_NN ._SENT
1,1800,2,The_DT gallant_JJ general_NN who_WP commanded_...
2,1800,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d..."
3,1800,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ..."
4,1800,5,In_IN former_JJ times_NNS and_CC in_IN former_...


In [12]:
papers['timePeriod'] = pd.to_datetime(papers['timePeriod'], format='%Y')
papers['timePeriod'] = pd.DatetimeIndex(papers['timePeriod'])
papers.head()

Unnamed: 0,timePeriod,index,rawText
0,1800-01-01,1,The_DT hon_NN ._SENT
1,1800-01-01,2,The_DT gallant_JJ general_NN who_WP commanded_...
2,1800-01-01,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d..."
3,1800-01-01,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ..."
4,1800-01-01,5,In_IN former_JJ times_NNS and_CC in_IN former_...


In [13]:
#reducedPapers = papers[papers["timePeriod"].isin(pd.date_range("1800-01-01", "1830-01-01"))]
#reducedPapers = papers[papers["timePeriod"].isin(pd.date_range("1800-01-01", "1890-01-01"))]
#reducedPapers = papers[papers["timePeriod"].isin(pd.date_range("1800-01-01", "1820-01-01"))]

reducedPapers = papers.groupby("timePeriod").sample(frac=0.1, random_state=42)
#reducedPapers = papers
reducedPapers.head()
#reducedPapers.tail()

Unnamed: 0,timePeriod,index,rawText
4963,1800-01-01,6909,In_IN Wales_NP the_DT reformation_NN had_VBD b...
6125,1800-01-01,8466,Will_MD the_DT negroes_NNS be_VB indifferent_J...
6377,1800-01-01,8884,gent_NN ._SENT
3647,1800-01-01,4943,But_CC when_WRB the_DT noble_JJ lord_NN talked...
1615,1800-01-01,2054,"Of_IN the_DT two_CD other_JJ parties_NNS ,_, t..."


In [14]:
import nltk
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop_words = set(nltk.corpus.stopwords.words('english'))

stop_words.update({
    'hon',
    'mr',
    'gentleman',
    'gent',
    'gen',
    'right',
    'house',
    'opinion',
    'sir',
    'say',
    'friend',
    'member',
    'think',
    'go',
    'motion'
    'noble',
    'lord',
    'case',
    'year',
    'speak',
    'speech'
})

wn = WordNetLemmatizer()

def lemmatize(word):
    if '_' not in word: return ""
    token, posStart = word.split('_')[0].lower(), word.split('_')[1].upper()
    if posStart.startswith('N'):
        return wn.lemmatize(token, wordnet.NOUN)
    elif posStart.startswith('V'):
        return wn.lemmatize(token, wordnet.VERB)
    elif posStart.startswith('R'):
        return wn.lemmatize(token, wordnet.ADV)
    elif posStart.startswith('J'):
        return wn.lemmatize(token, wordnet.ADJ)
    return ""

def preprocess_text(text):
    tokens = simple_preprocess(text)
    clean = [lemmatize(word) for word in tokens if lemmatize(word)!=""]
    words = [word for word in clean if word.isalpha()]
    lemmas = [word for word in words if word not in stop_words]
    return " ".join(lemmas)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
reducedPapers.loc[:, 'processedText'] = reducedPapers['rawText'].apply(preprocess_text).astype(str)
reducedPapers.loc[:, 'processedText'] = reducedPapers['processedText'].astype("str")
reducedPapers.dropna(subset = ['processedText'], inplace = True, how='any')
reducedPapers.reset_index(drop=True, inplace=True)
reducedPapers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16154 entries, 0 to 16153
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   timePeriod     16154 non-null  datetime64[ns]
 1   index          16154 non-null  int64         
 2   rawText        16154 non-null  object        
 3   processedText  16154 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 504.9+ KB


In [16]:
reducedPapers.to_csv('../processedData/processedData.csv', sep=',', columns=[ 'index','timePeriod', 'processedText'])

In [17]:
test = pd.read_csv(
    "../processedData/processedData.csv",
    sep=',',
    index_col=0
)
test.tail()


Unnamed: 0,index,timePeriod,processedText
16149,7692,2000-01-01,see qualitative difference code extradition ac...
16150,9008,2000-01-01,many noble mention blocking lack care home cau...
16151,2769,2000-01-01,select committee note large proportion politic...
16152,3990,2000-01-01,raise issue profound importance evolve constit...
16153,852,2000-01-01,however devil detail
