### Library Imports

In [1]:
%matplotlib notebook
import pandas as pd
import matplotlib.pyplot as plt
import os
from itertools import zip_longest
import numpy as np

### Read Data File

In [4]:
# Identifier of this notebook (nothing in the visible cells reads it - presumably kept for naming artefacts).
nb_name = '1.0-jp-initial-data-exploration-report'

# File name of the raw extract to load. The `_1` suffix leaves room for
# `dataset_2`, `dataset_3`, ... should further data sets be read in.
dataset_1 = "query-sdg-full-regexp_2022-03-15.csv"

In [5]:
# Load the raw, pipe-delimited extract; the first line of the file supplies the column names.
# The relative path is resolved against the kernel's current working directory,
# so each '..' steps one folder up before descending into data/raw.
raw_path = '../../data/raw/' + dataset_1
df = pd.read_csv(raw_path, sep='|', header=0)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [8]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,articletype_id,sdg_lst,ptr_id,authors,title,content,keywords,lang,date,issn,doi,handle,institution_id,institution,active
0,13.0,"2, 14",13,"{""Jansen, Jonathan D.""}",Autonomy and accountability in the regulation ...,This article examines the struggles of the Sou...,"{Autonomy,Learning,Performance,Teaching,""Gover...",en,2006-01-27,,,http://hdl.handle.net/2263/116,1,University of Pretoria,t
1,14.0,2,14,"{""Jansen, Jonathan D.""}",Intellectuals under fire,Looks at the status of intellectuals in South ...,"{""Cultural policy"",Democracy,""Political system...",en,2006-01-27,,,http://hdl.handle.net/2263/117,1,University of Pretoria,t
2,16.0,2,16,"{""Jansen, Jonathan D.""}",Mode 2 knowledge and institutional life: takin...,This paper examines the response of a black un...,"{""Higher education"",""Information technology"",""...",en,2006-01-27,,,http://hdl.handle.net/2263/119,1,University of Pretoria,t
3,17.0,"1, 14",17,"{""Jansen, Jonathan D.""}",Political symbolism as policy craft : explaini...,The policy literature in developing countries ...,"{Apartheid,""Educational change"",""Educational p...",en,2006-01-28,,,http://hdl.handle.net/2263/130,1,University of Pretoria,t
4,18.0,14,18,"{""Jansen, Jonathan D.""}",School curriculum since apartheid : intersecti...,In the wake of South Africa's first non-racial...,"{""Curriculum development"",""Educational policy""...",en,2006-01-28,,,http://hdl.handle.net/2263/131,1,University of Pretoria,t


In [9]:
# A missing value in either id column indicates an ineffective join,
# so every row lacking one (or both) of them is discarded.
id_columns = ['sdg_lst', 'articletype_id']
df = df.dropna(subset=id_columns, how='any')

### Listed SDGs - Fleshing Out

In [10]:
# Expand the comma-separated `sdg_lst` string of every article into
#   * `no_listed_sdg`                     - how many SDGs the article lists, and
#   * `1_listed_sdg` ... `9_listed_sdg`   - one column per listed SDG, padded with 0.

listed_number = list()

# 9 zeroes: one padding slot for each `<n>_listed_sdg` column created below.
zero_list = [0,0,0,0,0,0,0,0,0]

sdg_columns = ['%s_listed_sdg'%(i) for i in range(1, len(zero_list) + 1)]

expanded_list = list()

for i in df.sdg_lst.values:

    # Turn every ',' into whitespace and split on whitespace, so that both
    # "2, 14" and "2,14" yield [2, 14]. (Deleting the comma outright would
    # fuse "2,14" into the single number 214.)
    # str() keeps the parsing working should a value have been read as a number.

    split = [int(z) for z in str(i).replace(',', ' ').split()]

    if len(split) > len(zero_list):
        raise ValueError(
            "sdg_lst %r lists %d SDGs but only %d columns are available"
            % (i, len(split), len(zero_list))
        )

    # pad the listed sdg's with zeroes up to the fixed column count
    # eg. [14,2] -> [14,2,0,0,0,0,0,0,0] -> done for feature creation

    expanded_list.append(split + zero_list[len(split):])

    listed_number.append(len(split))

# Get a column listing the number of sdg per article for distributional visualisation

df['no_listed_sdg'] = listed_number

# column per listing
#
# The helper frame must carry df's own index: assigning a DataFrame aligns on
# index labels, and after rows have been dropped from df its labels are no
# longer 0..n-1. A default RangeIndex here would attach SDGs to the wrong
# articles and leave NaN in the trailing rows.

df[sdg_columns] = pd.DataFrame(expanded_list, index=df.index, columns=sdg_columns)

### Preprocess Keyword Columns

In [12]:
def _normalise_keywords(raw_keywords):
    """Return the sorted, de-duplicated, lower-cased keywords of one raw cell.

    The raw value is a brace-wrapped, comma-separated string in which some
    entries are wrapped in double quotes. Braces and quotes are stripped, the
    remainder is split on ',' and each piece is lower-cased before duplicates
    are removed with ``np.unique``.
    """
    # NOTE(review): splitting on every ',' also splits a quoted keyword that
    # itself contains a comma - confirm the data has no such keywords.
    stripped = raw_keywords
    for unwanted in ('{', '}', '"'):
        stripped = stripped.replace(unwanted, '')

    lowered = [keyword.lower() for keyword in stripped.split(',')]

    return np.unique(np.array(lowered))


# One array of unique keywords per article, in row order.
unique_keywords = [_normalise_keywords(raw_keywords) for raw_keywords in df.keywords]

df['keywords'] = unique_keywords


### Process Language column

In [14]:
# Every raw spelling found in `lang`, grouped under the language name it stands for.
codes_by_language = {
    'English':    ['en', 'en_US', 'English', 'eng', 'en_ZA', 'en_GB', 'en_AU'],
    'Afrikaans':  ['Afrikaans', 'Afr', 'af', 'afr', 'af_ZA'],
    'German':     ['de', 'German'],
    'Dutch':      ['Dutch', 'nl'],
    'Sepedi':     ['Sepedi'],
    'French':     ['fr', 'fra'],
    'Spanish':    ['es', 'Spanish', 'spa'],
    'Chinese':    ['zh', 'Chinese', 'zho'],
    'Portuguese': ['Portuguese'],
    'Italian':    ['it'],
    'Greek':      ['Greek'],
    'Japanese':   ['ja', 'jpn'],
    'Korean':     ['kor'],
    'Other':      ['other', 'Language'],
}

# Flatten into the raw-code -> language-name lookup.
dct = {
    code: language
    for language, codes in codes_by_language.items()
    for code in codes
}

# Raw values that are not keys of the lookup end up as NaN in `language`.
df['language'] = df['lang'].map(dct)

### Process Authors Column

In [16]:
def _split_authors(raw_authors):
    """Return the author names contained in one raw `authors` cell as a list.

    The braces are stripped and the remainder is split on double quotes; the
    bare ',' separators and empty strings left between quoted names are
    discarded.
    """
    # NOTE(review): names that are not wrapped in quotes are not separated
    # from each other by this approach - confirm every name is quoted.
    without_braces = raw_authors.replace('{', '').replace('}', '')

    names = []
    for piece in without_braces.split('"'):
        if piece == ',' or piece == '':
            continue
        names.append(piece)

    return names


# One list of author names per article, in row order.
authors_list = [_split_authors(raw_authors) for raw_authors in df.authors]

df['authors'] = authors_list


### Process Date Column

In [17]:
# Parse the date strings; anything that cannot be parsed is coerced to NaT
# instead of raising.
parsed_dates = pd.to_datetime(df['date'], errors='coerce')

df['date'] = parsed_dates

# Calendar year and month of each parsed date.
df['year'] = parsed_dates.dt.year

df['month'] = parsed_dates.dt.month

### Save Final Data

In [18]:
# Preview the first five rows of the processed frame before saving it.
df.head(n=5)

Unnamed: 0,articletype_id,sdg_lst,ptr_id,authors,title,content,keywords,lang,date,issn,...,3_listed_sdg,4_listed_sdg,5_listed_sdg,6_listed_sdg,7_listed_sdg,8_listed_sdg,9_listed_sdg,language,year,month
0,13.0,"2, 14",13,"[Jansen, Jonathan D.]",Autonomy and accountability in the regulation ...,This article examines the struggles of the Sou...,"[accountability, autonomy, democracy, educatio...",en,2006-01-27,,...,0,0,0,0,0,0,0,English,2006.0,1.0
1,14.0,2,14,"[Jansen, Jonathan D.]",Intellectuals under fire,Looks at the status of intellectuals in South ...,"[cultural policy, democracy, intellectuals, po...",en,2006-01-27,,...,0,0,0,0,0,0,0,English,2006.0,1.0
2,16.0,2,16,"[Jansen, Jonathan D.]",Mode 2 knowledge and institutional life: takin...,This paper examines the response of a black un...,"[application-based knowledge, higher education...",en,2006-01-27,,...,0,0,0,0,0,0,0,English,2006.0,1.0
3,17.0,"1, 14",17,"[Jansen, Jonathan D.]",Political symbolism as policy craft : explaini...,The policy literature in developing countries ...,"[apartheid, educational change, educational po...",en,2006-01-28,,...,0,0,0,0,0,0,0,English,2006.0,1.0
4,18.0,14,18,"[Jansen, Jonathan D.]",School curriculum since apartheid : intersecti...,In the wake of South Africa's first non-racial...,"[apartheid, curriculum development, educationa...",en,2006-01-28,,...,0,0,0,0,0,0,0,English,2006.0,1.0


In [None]:
# Columns written to the processed file, in output order. The raw `lang`
# column and the `Unnamed: 0` column are not part of the selection.
output_columns = [
    'articletype_id',
    'ptr_id',
    'sdg_lst',
    'authors',
    'title',
    'content',
    'keywords',
    'date',
    'year',
    'month',
    'issn',
    'doi',
    'handle',
    'institution_id',
    'institution',
    'active',
    'no_listed_sdg',
]
output_columns += ['%s_listed_sdg' % n for n in range(1, 10)]
output_columns.append('language')

# Same file name as the raw input, prefixed with 'preprocessed_data_'.
output_path = '../../data/processed/' + 'preprocessed_data_' + dataset_1

# index=False: the row index is not written to the file.
df[output_columns].to_csv(output_path, index=False)