# Global Variables

In [1]:
_debug=False
nb_name = "1.0-sej-initial-data-exploration"
fn_data = "query-sdg-full_20220313.csv"

# Import Data

In [2]:
# Read source Excel
import pandas as pd
df = pd.read_csv('../../data/raw/' + fn_data, sep='|', header=0, 
                dtype = {'issn': str, 'doi': str})

In [3]:
# Source sample
df.head(2)

Unnamed: 0,articletype_id,sdg_lst,ptr_id,authors,title,content,keywords,lang,date,issn,doi,handle,institution_id,institution,active
0,13.0,"14, 2",13,"{""Jansen, Jonathan D.""}",Autonomy and accountability in the regulation ...,This article examines the struggles of the Sou...,"{Autonomy,Learning,Performance,Teaching,""Gover...",en,2006-01-27,,,http://hdl.handle.net/2263/116,1,University of Pretoria,t
1,14.0,2,14,"{""Jansen, Jonathan D.""}",Intellectuals under fire,Looks at the status of intellectuals in South ...,"{""Cultural policy"",Democracy,""Political system...",en,2006-01-27,,,http://hdl.handle.net/2263/117,1,University of Pretoria,t


In [4]:
n=0
print('Title:\n', df.title[n], '\n\n', 'Content:\n',  df.content[n])

Title:
 Autonomy and accountability in the regulation of the teaching profession : a South African case study 

 Content:
 This article examines the struggles of the South African government to establish school-wide evaluation policies within post-apartheid institutions. It is demonstrated that even when such evaluation policies promise teacher development and whole-school improvement, there is significant resistance to government intervention in the school environment. It is also shown that even when individual schools express a willingness to participate in such evaluation actions, they remain deeply suspicious of, and even subvert, the original goals of these policies. The explanation for such behaviour is lodged within the troubled history of the apartheid inspection system, on the one hand, and on the underestimation in policy design of the deep-rooted suspicions of state surveillance systems even under the terms of a new democracy. In conclusion, the article shows how this fierce

# Evaluate Data Quality

In [5]:
# Unique ID
df.ptr_id.is_unique

True

In [6]:
# Unique ID (Main primary key)
df.articletype_id.is_unique

True

In [7]:
# Check for nulls and fill 
null_columns=df.columns[df.isnull().any()]
print('Columns with nulls:\n', df[null_columns].isnull().sum())
for col in null_columns:
    df[col]=df[col].fillna(0)

Columns with nulls:
 articletype_id         1
sdg_lst                1
content               23
lang              122807
issn              103329
doi               195081
dtype: int64


In [8]:
# Check for na and drop na values
df1 = df[df.isna().any(axis=1)]
print(df1.shape)
del df1
df = df.dropna()

(0, 15)


In [9]:
# Check sum: ptr_id should equal articletype_id
print('count mismatchees:', df.loc[ (df.ptr_id != df.articletype_id)].shape)
if _debug:
    print(df.loc[ (df.ptr_id != df.articletype_id)])
    print('shape', df.shape)
    print('drop', df.drop(df[ (df.ptr_id != df.articletype_id) ].index, inplace = True))
df.drop(df[ (df.ptr_id != df.articletype_id) ].index, inplace = True)

count mismatchees: (1, 15)


In [10]:
# Final dataset shape
df.shape

(208806, 15)

### Clean up keywords feature to have unique phrases.

In [11]:
kw=df.keywords

In [12]:
if _debug:
    print(kw[0])

In [13]:
# strip left of '{'
left=kw.str.split('{').str[1]
# strip right of '}'
right=left.str.split('}').str[0]
kw=right.str.replace('"', "")

In [14]:
if _debug:
    print(kw[0])

In [15]:
# Convert delimited strings into lists; and keep unique strings. 
list_all = kw.str.split(',')
list_unique = []
for i, v in enumerate(list_all):
    list_unique.append(list(set(list_all[i])))   

In [16]:
if _debug:
    n=1
    print(list_unique[n])
    print(list_all[n])
    print(len(list_unique))

In [17]:
df['unique_keywords']=list_unique
del kw, list_all, list_unique

In [18]:
df.head(1).T

Unnamed: 0,0
articletype_id,13.0
sdg_lst,"14, 2"
ptr_id,13
authors,"{""Jansen, Jonathan D.""}"
title,Autonomy and accountability in the regulation ...
content,This article examines the struggles of the Sou...
keywords,"{Autonomy,Learning,Performance,Teaching,""Gover..."
lang,en
date,2006-01-27
issn,0


### Convert sdg text into lists

In [19]:
sdg=df.sdg_lst

In [20]:
# list of strings
list_all = sdg.str.split(',')

#ToDo better code needed here
# list of integers 
n=0
list_int = []
for i, v1 in enumerate(list_all):
    lst = []
    
    try:
        for j, v2 in enumerate(list_all[i]):
            lst.append(int(v2))
        n=+1
    except:
        print('skip: ', i, n, list_all[i])
    
    list_int.append(lst)  

if _debug:
    print(list_all[0])
    print(list_int[0])
    
#append to df
df['sdg_ints']=list_int
del sdg, list_all, list_int

# EDA high level

In [21]:
df.columns

Index(['articletype_id', 'sdg_lst', 'ptr_id', 'authors', 'title', 'content',
       'keywords', 'lang', 'date', 'issn', 'doi', 'handle', 'institution_id',
       'institution', 'active', 'unique_keywords', 'sdg_ints'],
      dtype='object')

### Counts

In [22]:
# Counts
c_lang = df.lang.value_counts()
c_authors = df.authors.value_counts()
c_title = df.title.value_counts()

In [33]:
c_lang 

0             122807
en             39746
en_US          24287
eng            16322
jpn             2747
en_ZA            969
en_AU            844
Afrikaans        416
af               314
English          112
other            103
Dutch             38
af_ZA             31
de                17
fr                11
fra               10
it                 5
es                 4
Spanish            3
Greek              3
zho                2
spa                2
nl                 2
ja                 1
kor                1
Chinese            1
Language           1
Portuguese         1
en_GB              1
German             1
Sepedi             1
Afr                1
afr                1
zh                 1
Name: lang, dtype: int64

In [34]:
c_authors

{"Taylor, Frank E."}                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [32]:
c_title

Editorial                                                                                                                                                  18
2021 roadmap for sodium-ion batteries                                                                                                                      10
Search for the HH → b b ¯ b b ¯ process via vector-boson fusion production using proton-proton collisions at s = 13 TeV with the ATLAS detector             9
Study of B s 0 → J / ψπ + π − K + K − decays                                                                                                                7
Preface                                                                                                                                                     7
                                                                                                                                                           ..
Automatic contour propagation using deformable image