In [1]:
import re
import os
import pandas as pd
import unidecode
import gensim
from stop_words import get_stop_words
from gensim.models import TfidfModel
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
import altair as alt
import copy

In [2]:
# Store data
# Read every file in ./data into a DataFrame indexed by the part of the file
# name before the first '.', stored under the index name 'year'.
data_dir = './data'
# os.listdir() returns entries in arbitrary order; sort them so the row order
# (and every later positional slice of it) is the same on every run.
data_list = sorted(os.listdir(data_dir))
sum_list = []
for file_name in data_list:
    file_path = os.path.join(data_dir, file_name)
    # Skip hidden entries and sub-directories (e.g. .ipynb_checkpoints), which
    # are not documents and which open() may not be able to read as text.
    if file_name.startswith('.') or not os.path.isfile(file_path):
        continue
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    sum_list.append([file_name.split('.')[0], content])

df = pd.DataFrame(sum_list, columns=['year', 'values'])
df = df.set_index('year')
time_list = df.index.values.tolist()  # row labels, in load order
value_data = df['values'].values.tolist()  # raw file contents, in load order

In [3]:
# Clean data
# (pattern, replacement) pairs, applied in this order to every document after
# its non-word characters have been replaced by spaces.
remove_pattern = [
    (r'\s+[a-zA-Z]\s+', ' '),  # Remove single characters surrounded by whitespace
    # Remove a single character at the start of the text. The caret must stay
    # unescaped: '\^' would look for a literal '^', which can never be present
    # because the \W substitution below has already removed it.
    (r'^[a-zA-Z]\s+', ' '),
    (r'\d+', ''),  # Remove all numbers
    (r'\s+', ' '),  # Substitute multiple white spaces with single space
    (r'^b\s+', ''),  # Remove prefixed 'b'
]
word_data = []

for raw_text in value_data:
    # Replace every non-word character with a space, then transliterate the
    # remaining text to ASCII.
    temp = unidecode.unidecode(re.sub(r'\W', ' ', str(raw_text)))
    for p, r in remove_pattern:
        temp = re.sub(p, r, temp)
    word_data.append(temp)


def sentence_to_words(abstract):
    """Tokenise one document with gensim's ``simple_preprocess``.

    The input is converted to ``str`` first. ``deacc=True`` asks gensim to
    strip accent marks from the tokens. Returns whatever token list
    ``simple_preprocess`` produces.
    """
    text = str(abstract)
    tokens = gensim.utils.simple_preprocess(text, deacc=True)
    return tokens

# Tokenise every cleaned document, keeping the document order.
data_words = list(map(sentence_to_words, word_data))

In [8]:
# Get stop words list
# Copy the list before extending it. Extending the object returned by
# get_stop_words() in place would also grow any list the package keeps
# internally (NOTE(review): the package appears to cache and hand back the
# same list object -- confirm), so re-running this cell would keep appending
# the same extra words again.
stop_words = list(get_stop_words('english'))

# Corpus-specific additions: filler words plus tokens left over from e-mail
# headers and URLs.
stop_words.extend(['actually', 'afterwards', 'almost', 'already', 'also', 'although', 'among', 'amongst', 'another', 'apart', 'around', 'aside', 'author', 'authors', 'away',
                   'become', 'becoming', 'better', 'beyond', 'certain', 'could', 'commonly', 'considerable', 'consider', 'de', 'definitely', 'eg', 'either', 'etc',
                   'f', 'g', 'h', 'hence', 'hereafter', 'herein', 'however', 'indeed', 'instead', 'illustrate', 'illustrates', 'demonstrate', 'demonstrates',
                   'important',  'j', 'k', 'likely', 'many', 'may', 'maybe', 'meanwhile', 'might', 'moreover', 'much', 'n', 'nevertheless', 'neither', 'normally', 'often',
                   'otherwise', 'particular', 'q', 'quite', 'understand', 'understood',
                   'rather', 'regardless', 'relatively', 'respectively', 'reveal', 'since', 'suggest', 'suggests', 'specifically', 'particularly', 'therefore', 'therein',
                   'though', 'thus', 'together', 'toward', 'towards', 'unless', 'upon', 'using', 'well', 'whereas', 'whereafter',
                   'whether',  'amico', 'edt', 'amp','mccarty_at_kcl', 'est', 'know', 'willard',
                   'htm',  'com', 'new', 'like', 'fqs',  'use', 'text', 'cch', 'bitnet', 'mccarty', 'kcl', 'html',
                    'org', 'ac', 'uk', 'edu', 'num', 'vol', 'date', 'href', 'www', 'http', 'will', 'ninch'])

def remove_stopwords(words, stopwords=None):
    """Return the tokens of ``words`` that are not stop words, in original order.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document.
    stopwords : iterable of str, optional
        Words to drop. When omitted, the module-level ``stop_words`` list is used.

    Returns
    -------
    list of str
        The tokens that were kept.
    """
    if stopwords is None:
        stopwords = stop_words
    # A set gives constant-time membership tests; checking against the list
    # would rescan the whole list for every token.
    blocked = set(stopwords)
    return [x for x in words if x not in blocked]

# Drop the stop words from every tokenised document.
data_words_nostops = list(map(remove_stopwords, data_words))

# Re-join each token list into one space-separated string per document.
test = [" ".join(tokens) for tokens in data_words_nostops]

# Work on an independent copy and split it by position:
# a_list is the last 8 documents, b_list is everything before them.
sum_list = copy.deepcopy(test)
b_list = sum_list[:-8]
a_list = sum_list[-8:]

In [9]:
def helper1(list1):
    """Fit a TF-IDF model on ``list1`` and print its ten highest-weighted terms.

    Parameters
    ----------
    list1 : iterable of str
        Training texts, one string per document.

    Returns
    -------
    word : list of str
        Vocabulary terms, in the column order of ``tfidf_matrix``.
    tfidf_matrix : numpy.ndarray
        Dense TF-IDF matrix; each row represents one text.
    """
    # norm=None: the vectorizer is asked not to normalise the rows.
    model = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)
    # Input training set matrix, each row represents one text
    model_fit = model.fit_transform(list1)
    # get_feature_names() was removed in scikit-learn 1.2. Use its replacement
    # when available and fall back only on releases that predate it.
    if hasattr(model, 'get_feature_names_out'):
        # The replacement returns an array; convert it to a list so callers
        # can keep using word.index(...).
        word = model.get_feature_names_out().tolist()
    else:
        word = model.get_feature_names()
    tfidf_matrix = model_fit.toarray()
    # Largest weight each term reaches in any single document.
    max_tfidf = tfidf_matrix.max(axis=0).tolist()
    # Sort ascending by weight and keep the last ten, i.e. the ten largest.
    ranked = sorted(zip(word, max_tfidf), key=lambda pair: pair[1])
    sd = ranked[-10:]
    print(sd)
    return word, tfidf_matrix

In [10]:
# Run helper1 on the whole corpus, then on each of its two positional slices
# (a_list and b_list). Each call prints its own top-ten list.
but, buts = helper1(sum_list)
but_a, buts_a = helper1(a_list)
but_b, buts_b = helper1(b_list)



[('utorepas', 1329.120536739557), ('research', 1399.0), ('computer', 1969.0), ('information', 2006.0), ('can', 2288.0), ('humanities', 2299.0), ('one', 2404.0), ('university', 2443.0), ('subject', 3604.0), ('humanist', 4495.0)]
[('digital', 1207.0), ('utorepas', 1246.4125319481634), ('information', 1318.0), ('humanities', 1538.0), ('computer', 1969.0), ('can', 2218.0), ('one', 2299.0), ('university', 2443.0), ('subject', 3321.0), ('humanist', 4495.0)]
[('re', 1273.0), ('language', 1294.0), ('research', 1399.0), ('information', 2006.0), ('can', 2288.0), ('humanities', 2299.0), ('university', 2327.0), ('one', 2404.0), ('subject', 3604.0), ('humanist', 4451.0)]


In [11]:
def helper2(word, but, buts, time_list):
    """Build a line chart of the ``buts`` column that belongs to ``word``.

    Parameters
    ----------
    word : str
        Term to plot; must be present in ``but``.
    but : sequence of str
        Vocabulary; the position of ``word`` in it selects the column of ``buts``.
    buts : 2-D array-like
        Matrix indexed as ``buts[row][column]``.
    time_list : sequence
        One label per row of ``buts``; plotted on the x axis as ``year``.

    Returns
    -------
    altair.Chart
        Line chart (with point markers) of ``value`` against ``year``.

    Raises
    ------
    ValueError
        If ``word`` is not in ``but``.
    """
    # list() lets the lookup also work when the vocabulary is a NumPy array,
    # which has no .index() method.
    vocabulary = list(but)
    try:
        num = vocabulary.index(word)
    except ValueError:
        raise ValueError("{!r} is not in the vocabulary".format(word)) from None
    q = [{'year': time_list[j], 'value': buts[j][num]} for j in range(len(buts))]
    df = pd.DataFrame(q)
    # Return the chart. A bare expression inside a function is discarded, so
    # without the return the caller received None and nothing was displayed.
    return alt.Chart(df).mark_line(point=True).encode(
        x='year',
        y='value'
    )
    
# Chart the term 'sun' across all documents, labelled by time_list.
helper2('sun', but, buts, time_list)