In [1]:
import pandas as pd
import os

papers = pd.read_csv(
    "./rawData/sampler_10ktexts_perdecade.ALL2.tsv",
    sep='\t',
    names=["timePeriod", "index", "rawText"]
)

papers.head()

Unnamed: 0,timePeriod,index,rawText
0,180X.POS.rand,1,The_DT hon_NN ._SENT
1,180X.POS.rand,2,The_DT gallant_JJ general_NN who_WP commanded_...
2,180X.POS.rand,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d..."
3,180X.POS.rand,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ..."
4,180X.POS.rand,5,In_IN former_JJ times_NNS and_CC in_IN former_...


In [2]:
papers['timePeriod'] = papers['timePeriod'].map(lambda x: x.rstrip('X.POS.rand'))
papers['timePeriod'] = papers['timePeriod'].astype(str) + '0'
papers.head()

Unnamed: 0,timePeriod,index,rawText
0,1800,1,The_DT hon_NN ._SENT
1,1800,2,The_DT gallant_JJ general_NN who_WP commanded_...
2,1800,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d..."
3,1800,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ..."
4,1800,5,In_IN former_JJ times_NNS and_CC in_IN former_...


In [3]:
papers['timePeriod'] = pd.to_datetime(papers['timePeriod'], format='%Y')
papers['timePeriod'] = pd.DatetimeIndex(papers['timePeriod']) #.year
papers.head()

Unnamed: 0,timePeriod,index,rawText
0,1800-01-01,1,The_DT hon_NN ._SENT
1,1800-01-01,2,The_DT gallant_JJ general_NN who_WP commanded_...
2,1800-01-01,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d..."
3,1800-01-01,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ..."
4,1800-01-01,5,In_IN former_JJ times_NNS and_CC in_IN former_...


In [4]:
reducedPapers = papers[papers["timePeriod"].isin(pd.date_range("1800-01-01", "1830-01-01"))]
reducedPapers.head()

Unnamed: 0,timePeriod,index,rawText
0,1800-01-01,1,The_DT hon_NN ._SENT
1,1800-01-01,2,The_DT gallant_JJ general_NN who_WP commanded_...
2,1800-01-01,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d..."
3,1800-01-01,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ..."
4,1800-01-01,5,In_IN former_JJ times_NNS and_CC in_IN former_...


In [5]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop_words = set(nltk.corpus.stopwords.words('english'))
wn = WordNetLemmatizer()

def preprocess_text(text):
    tokens = [word.split('_')[0] for word in nltk.word_tokenize(text.lower())]
    words = [word for word in tokens if word.isalpha()]
    lemmas = [wn.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(lemmas)

reducedPapers['processedText'] = reducedPapers['rawText'].apply(preprocess_text)

reducedPapers.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reducedPapers['processedText'] = reducedPapers['rawText'].apply(preprocess_text)


Unnamed: 0,timePeriod,index,rawText,processedText
0,1800-01-01,1,The_DT hon_NN ._SENT,hon
1,1800-01-01,2,The_DT gallant_JJ general_NN who_WP commanded_...,gallant general commanded well knew reinforcem...
2,1800-01-01,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d...",pitt said doubted whether would necessary inse...
3,1800-01-01,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ...",hussey informs u romish bishop waterford appoi...
4,1800-01-01,5,In_IN former_JJ times_NNS and_CC in_IN former_...,former time former war invasion often threaten...


In [6]:
reducedPapers.to_csv('./processedData/processedData.csv', sep=',')
test = pd.read_csv(
    "./processedData/processedData.csv",
    sep=',',
    names=["timePeriod", "index", "rawText"]
)

test.head(100)

Unnamed: 0,Unnamed: 1,timePeriod,index,rawText
,timePeriod,index,rawText,processedText
0.0,1800-01-01,1,The_DT hon_NN ._SENT,hon
1.0,1800-01-01,2,The_DT gallant_JJ general_NN who_WP commanded_...,gallant general commanded well knew reinforcem...
2.0,1800-01-01,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d...",pitt said doubted whether would necessary inse...
3.0,1800-01-01,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ...",hussey informs u romish bishop waterford appoi...
...,...,...,...,...
94.0,1800-01-01,129,If_IN the_DT house_NN was_VBD not_RB to_TO be_...,house composed representative people would inf...
95.0,1800-01-01,130,Having_VBG thus_RB far_RB examined_VBD the_DT ...,thus far examined nature charge let u inquire ...
96.0,1800-01-01,131,Their_PP$ falsehood_NN would_MD be_VB detected...,falsehood would detected would softer punishme...
97.0,1800-01-01,132,They_PP may_MD find_VB cause_VB unequivocally_...,may find cause unequivocally withdraw barrier ...
