In [67]:
!pip install seaborn
!pip install ipympl
%matplotlib inline



You should consider upgrading via the 'c:\py\pyproj\2022env387mit808\scripts\python.exe -m pip install --upgrade pip' command.


# Global Variables

In [1]:
# Set to True to enable the extra diagnostic prints in the `if _debug:` cells.
_debug=False
# Notebook identifier; only referenced by the commented-out savefig call in the plots section.
nb_name = "1.0-sej-initial-data-exploration"
# File name of the raw dataset; it is appended to '../../data/raw/' when the data is read.
fn_data = "query-sdg-full-regexp_2022-03-15.csv"

# Import Data

In [2]:
# Read the raw source file: a pipe-delimited CSV whose first row is the header.
import pandas as pd
# issn and doi are read as strings so pandas does not infer a numeric dtype for them.
df = pd.read_csv('../../data/raw/' + fn_data, sep='|', header=0, 
                dtype = {'issn': str, 'doi': str})

In [3]:
# Source sample
df.head(2)

Unnamed: 0,articletype_id,sdg_lst,ptr_id,authors,title,content,keywords,lang,date,issn,doi,handle,institution_id,institution,active
0,13.0,"2, 14",13,"{""Jansen, Jonathan D.""}",Autonomy and accountability in the regulation ...,This article examines the struggles of the Sou...,"{Autonomy,Learning,Performance,Teaching,""Gover...",en,2006-01-27,,,http://hdl.handle.net/2263/116,1,University of Pretoria,t
1,14.0,2,14,"{""Jansen, Jonathan D.""}",Intellectuals under fire,Looks at the status of intellectuals in South ...,"{""Cultural policy"",Democracy,""Political system...",en,2006-01-27,,,http://hdl.handle.net/2263/117,1,University of Pretoria,t


In [4]:
n=0
print('Title:\n', df.title[n], '\n\n', 'Content:\n',  df.content[n])

Title:
 Autonomy and accountability in the regulation of the teaching profession : a South African case study 

 Content:
 This article examines the struggles of the South African government to establish school-wide evaluation policies within post-apartheid institutions. It is demonstrated that even when such evaluation policies promise teacher development and whole-school improvement, there is significant resistance to government intervention in the school environment. It is also shown that even when individual schools express a willingness to participate in such evaluation actions, they remain deeply suspicious of, and even subvert, the original goals of these policies. The explanation for such behaviour is lodged within the troubled history of the apartheid inspection system, on the one hand, and on the underestimation in policy design of the deep-rooted suspicions of state surveillance systems even under the terms of a new democracy. In conclusion, the article shows how this fierce

# Evaluate Data Quality

In [5]:
# Unique ID
df.ptr_id.is_unique

True

In [6]:
# Unique ID (Main primary key)
df.articletype_id.is_unique

True

In [7]:
# Report the number of nulls per affected column, then replace them with 0.
# NOTE(review): 0 is also written into text columns (lang, issn, doi, content),
# so it later shows up as a category value; the dropna() calls further down
# rely on this fill having happened.
null_counts = df.isnull().sum()
null_columns = null_counts[null_counts > 0].index
print('Columns with nulls:\n', null_counts[null_columns])
for column_name in null_columns:
    df[column_name] = df[column_name].fillna(0)

Columns with nulls:
 articletype_id         1
sdg_lst                1
content               23
lang              122807
issn              103329
doi               195081
dtype: int64


In [8]:
# Print the shape of the sub-frame of rows that still hold a missing value,
# then keep only the complete rows.
print(df[df.isna().any(axis=1)].shape)
df = df.dropna()

(0, 15)


In [9]:
# Check sum: ptr_id should equal articletype_id
# The mismatch mask is computed once and reused for the report, the optional
# diagnostics and the drop.
id_mismatch = df.ptr_id != df.articletype_id
print('count mismatchees:', df.loc[id_mismatch].shape)
if _debug:
    # Diagnostics only: the debug branch must not modify df.
    print(df.loc[id_mismatch])
    print('shape', df.shape)
# Remove the mismatching rows exactly once, regardless of the debug flag.
df = df.drop(index=df.index[id_mismatch])

count mismatchees: (1, 15)


In [10]:
# Drop rows where sdg_lst or articletype_id is missing.
# NOTE(review): the earlier cells already filled nulls with 0 and called
# df.dropna(), so this looks like a safety net rather than an active filter.

df = df.dropna(subset=['sdg_lst','articletype_id'])

In [11]:
# Final dataset shape
df.shape

(208806, 15)

### Clean up keywords feature to have unique phrases.

In [12]:
kw=df.keywords

In [13]:
if _debug:
    print(kw[0])

In [14]:
# Keep the text between the first '{' and the following '}', then remove every
# double quote from what is left.
kw = (
    kw.str.split('{').str[1]
      .str.split('}').str[0]
      .str.replace('"', "")
)

In [15]:
if _debug:
    print(kw[0])

In [16]:
# Convert delimited strings into lists; and keep unique strings.
list_all = kw.str.split(',')
# Iterate over the values themselves: looking rows up with a positional counter
# (list_all[i]) is a label lookup and breaks as soon as the index has gaps,
# which it can have after rows were dropped above.
# dict.fromkeys removes duplicates while keeping first-seen order, so the
# result is reproducible between runs (set() order is not).
# Entries that are not lists (e.g. NaN where no '{' was found) become [].
list_unique = [
    list(dict.fromkeys(parts)) if isinstance(parts, list) else []
    for parts in list_all
]

In [17]:
if _debug:
    n=1
    print(list_unique[n])
    print(list_all[n])
    print(len(list_unique))

In [18]:
df['unique_keywords']=list_unique
del kw, list_all, list_unique

In [19]:
df.head(1).T

Unnamed: 0,0
articletype_id,13.0
sdg_lst,"2, 14"
ptr_id,13
authors,"{""Jansen, Jonathan D.""}"
title,Autonomy and accountability in the regulation ...
content,This article examines the struggles of the Sou...
keywords,"{Autonomy,Learning,Performance,Teaching,""Gover..."
lang,en
date,2006-01-27
issn,0


### Convert sdg text into lists

In [20]:
sdg=df.sdg_lst

In [21]:
# list of strings
list_all = sdg.str.split(',')

# list of integers
# n counts the rows converted without error; it is reported in the skip message.
n = 0
list_int = []
# Iterate over the values directly: list_all[i] would be a label lookup and
# fails or hits the wrong row when the index is not 0..len-1.
for i, parts in enumerate(list_all):
    lst = []

    try:
        for v2 in parts:
            lst.append(int(v2))
        n += 1
    except (TypeError, ValueError):
        # TypeError: the entry is not iterable (e.g. NaN from a non-string value).
        # ValueError: an item is not an integer literal.
        # Items converted before the failure are kept, as before.
        print('skip: ', i, n, parts)

    list_int.append(lst)

if _debug:
    # Positional access so both prints refer to the same (first) row.
    print(list_all.iloc[0])
    print(list_int[0])

#append to df
df['sdg_ints']=list_int
del sdg, list_all, list_int

# EDA high level

In [22]:
df.columns

Index(['articletype_id', 'sdg_lst', 'ptr_id', 'authors', 'title', 'content',
       'keywords', 'lang', 'date', 'issn', 'doi', 'handle', 'institution_id',
       'institution', 'active', 'unique_keywords', 'sdg_ints'],
      dtype='object')

### Counts

In [23]:
# Counts
c_lang = df.lang.value_counts()
c_authors = df.authors.value_counts()
c_title = df.title.value_counts()

In [24]:
c_lang 

0             122807
en             39746
en_US          24287
eng            16322
jpn             2747
en_ZA            969
en_AU            844
Afrikaans        416
af               314
English          112
other            103
Dutch             38
af_ZA             31
de                17
fr                11
fra               10
it                 5
es                 4
Spanish            3
Greek              3
zho                2
spa                2
nl                 2
ja                 1
kor                1
Chinese            1
Language           1
Portuguese         1
en_GB              1
German             1
Sepedi             1
Afr                1
afr                1
zh                 1
Name: lang, dtype: int64

In [25]:
c_authors

{"Taylor, Frank E."}                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [26]:
c_title

Editorial                                                                                                                                          18
2021 roadmap for sodium-ion batteries                                                                                                              10
Search for the HH → b b ¯ b b ¯ process via vector-boson fusion production using proton-proton collisions at s = 13 TeV with the ATLAS detector     9
Higgs boson production cross-section measurements and their EFT interpretation in the 4 ℓ decay channel at s = 13 TeV with the ATLAS detector       7
Study of B s 0 → J / ψπ + π − K + K − decays                                                                                                        7
                                                                                                                                                   ..
Citron kinase - renaissance of a neglected mitotic kinase                                           


### Embeddings

In [27]:
# Optional one-off installs (left disabled).
#!pip install nltk
#!pip install gensim

# NOTE(review): pandas was already imported in the data-import cell; this
# re-import is redundant but harmless.
import pandas as pd
import numpy as np

import gensim
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

# Tokenizer built on the pattern \w+ (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')
# English stop-word list; presumably needs the NLTK 'stopwords' corpus to be
# downloaded beforehand — TODO confirm for a fresh environment.
stop_words = stopwords.words('english')

# NOTE(review): this selects a different matplotlib backend than the
# `%matplotlib inline` issued in the first cell.
%matplotlib notebook
import matplotlib.pyplot as plt

In [28]:
print(len(stop_words))

179


In [29]:
n=0
print('Title:\n', df.title[n], '\n\n', 'Content:\n',  df.content[n])

Title:
 Autonomy and accountability in the regulation of the teaching profession : a South African case study 

 Content:
 This article examines the struggles of the South African government to establish school-wide evaluation policies within post-apartheid institutions. It is demonstrated that even when such evaluation policies promise teacher development and whole-school improvement, there is significant resistance to government intervention in the school environment. It is also shown that even when individual schools express a willingness to participate in such evaluation actions, they remain deeply suspicious of, and even subvert, the original goals of these policies. The explanation for such behaviour is lodged within the troubled history of the apartheid inspection system, on the one hand, and on the underestimation in policy design of the deep-rooted suspicions of state surveillance systems even under the terms of a new democracy. In conclusion, the article shows how this fierce

In [30]:
titles = df.title.values

# Membership tests against a set are constant-time; testing against the
# stop-word list would rescan it for every single token.
stop_word_set = set(stop_words)

# Go through all the titles and tokenize them by words.
# `words` receives one list of lower-cased, non-stop-word tokens per sentence.
words = []
for doc in titles:
    # Go through sentences
    for sentence in sent_tokenize(doc):
        tokens = []
        # `stops` is reset for every sentence, so after the loop it only holds
        # the stop words of the last sentence processed.
        stops = []
        for word in tokenizer.tokenize(sentence):
            wl = word.lower()
            if wl in stop_word_set:
                stops.append(wl)
            else:
                tokens.append(wl)
        words.append(tokens)

In [31]:
# Summary of the tokenisation step.
# `words` holds one token list per sentence, so its length is a sentence count,
# and `stops` only contains the stop words of the last sentence processed; the
# labels below say so explicitly.
print("Original doc: ", df.title[0])
print("Tokenized doc: ", words[0])
print("Num tokenized sentences: ", len(words), 'stop words in last sentence:', len(stops))
print("Stop words in last sentence:", stops)

Original doc:  Autonomy and accountability in the regulation of the teaching profession : a South African case study
Tokenized doc:  ['autonomy', 'accountability', 'regulation', 'teaching', 'profession', 'south', 'african', 'case', 'study']
Num Words:  214376 stop words: 3
Stop words: ['with', 'a', 's']


### Per SDG article

In [32]:
def preprocess(Str):
    """Remove HTML remnants from a text string.

    The patterns are regular expressions, so they are applied with ``re.sub``;
    ``str.replace`` treats its argument literally and would never match them.

    Parameters
    ----------
    Str : str
        Raw text, possibly containing HTML fragments.

    Returns
    -------
    str
        The text with ``<br>`` tags, ``<a ...>...</a>`` elements (including
        their link text) and the entities ``&amp``, ``&gt``, ``&lt`` (with or
        without the trailing ``;``) removed, and non-breaking spaces replaced
        by regular spaces.
    """
    import re  # local import keeps the function usable on its own

    # Line-break tags: <br>, <br/>, <br />.
    Str = re.sub(r'<br\s*/?>', '', Str)
    # Anchor elements; non-greedy so text between two separate links survives,
    # and \b so tags such as <abbr> are not mistaken for <a>.
    Str = re.sub(r'<a\b[^>]*>.*?</a>', '', Str, flags=re.DOTALL)
    # Escaped &, > and <; the optional ';' avoids leaving a stray semicolon.
    Str = re.sub(r'&(?:amp|gt|lt);?', '', Str)
    # Non-breaking space -> regular space.
    Str = Str.replace('\xa0', ' ')
    return Str

In [33]:
n=0
#print(preprocess(df.content[n]))
print("Words:", len(df.content[n].split()))

Words: 148


In [41]:
def word_count(string):
    """Return the number of whitespace-separated words in ``string``.

    ``split()`` without an argument splits on any run of whitespace, so
    repeated spaces, tabs and newlines do not produce empty "words" and an
    empty string counts as 0. This matches the ``.split()`` calls used for the
    word-count columns elsewhere in the notebook.
    """
    return len(string.split())
    
print(word_count(df.content[0]))

148


In [51]:
# Number of whitespace-separated tokens in each content value (values are cast
# to str first, so non-text entries are counted as well).
df['content_word_count'] = df.content.astype(str).str.split().str.len()

In [54]:
# Number of whitespace-separated tokens in each title (values are cast to str first).
df['title_word_count'] = df.title.astype(str).str.split().str.len()

In [57]:
# Parse the date column; values that cannot be parsed are coerced to NaT.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Derive calendar year and month from the parsed dates.
df['year'], df['month'] = df['date'].dt.year, df['date'].dt.month

In [58]:
df.tail(1).T

Unnamed: 0,208805
articletype_id,229332.0
sdg_lst,7
ptr_id,229332
authors,"{""Ekman, Annaleigh Yahata""}"
title,Meeting Travel Needs: Becoming Reacquainted wi...
content,Our current examples of transportation needs a...
keywords,"{needs,travel,behavior,community,""urban planni..."
lang,0
date,2020-07-01 00:00:00
issn,0


# Plots

In [65]:
#!pip install ipympl -  nto this

In [63]:
n=0

In [64]:
# Advance the figure counter so this plot gets its own figure number
# (`n=+1` would assign 1 on every run instead of incrementing).
n += 1

plt.figure(n)
plt.hist(df['content_word_count'], bins=15)
plt.ylabel('Count')
plt.xlabel('Words')
plt.title("Distribution of content Word counts")
# The x axis holds word counts, so tick placement is left to matplotlib; fixed
# ticks at 1..17 would not match the 15 histogram bins or the data range.
plt.yticks(rotation=60)
# NOTE(review): dataset_1 and description are not defined in the visible cells;
# define them before enabling the savefig call below.
# plt.savefig("../../reports/figures/exploratory_data_analysis_visualisations/[%s]-[%s]-[%s].png"%(nb_name,\
#                                                                                                  dataset_1,\
#                                                                                                  description))
plt.show()

<IPython.core.display.Javascript object>