### Library Imports

In [1]:
%matplotlib notebook
import pandas as pd
import matplotlib.pyplot as plt
import os
from itertools import zip_longest
import numpy as np

import re
import string

### Read Data File

In [2]:
nb_name = "1.1-sej-data_preprocessing-report"

# Input data set. Use _1, _2, ... suffixes when more than one data set is read in.
dataset_1 = 'preprocessed_data_query-sdg-full-regexp_2022-03-15.csv'

# Output file name for re-processing an existing preprocessed_data file:
# the input name without its 'preprocessed_data_' prefix and '.csv' extension,
# followed by '_1-1.csv'.
save_1 = ''.join((
    dataset_1.replace('preprocessed_data_', '').replace('.csv', ''),
    '_1-1.csv',
))

In [3]:
# A relative path is resolved against the current working directory (for a
# notebook this is normally the folder the notebook was started from), so each
# '..' steps one level up towards the folder that holds the data.
df = pd.read_csv(f'../../data/processed/{dataset_1}')

  df = pd.read_csv('../../data/processed/'+dataset_1)


In [4]:
# Preview the first two rows of the loaded data.
df.head(n=2)

Unnamed: 0,articletype_id,ptr_id,sdg_lst,authors,title,content,keywords,date,year,month,...,1_listed_sdg,2_listed_sdg,3_listed_sdg,4_listed_sdg,5_listed_sdg,6_listed_sdg,7_listed_sdg,8_listed_sdg,9_listed_sdg,language
0,13.0,13,"2, 14","['Jansen, Jonathan D.']",Autonomy and accountability in the regulation ...,This article examines the struggles of the Sou...,['accountability' 'autonomy' 'democracy' 'educ...,2006-01-27,2006.0,1.0,...,2,14,0,0,0,0,0,0,0,English
1,14.0,14,2,"['Jansen, Jonathan D.']",Intellectuals under fire,Looks at the status of intellectuals in South ...,['cultural policy' 'democracy' 'intellectuals'...,2006-01-27,2006.0,1.0,...,2,0,0,0,0,0,0,0,0,English


### Add features

In [5]:
# Count the words in each title after replacing punctuation with spaces.
# The value goes through str() first because re.sub only accepts strings: a
# non-string title (for example a missing value) would otherwise raise
# TypeError. Such a value is counted from its string form, the same way the
# content column treats non-string values.

df['title_word_count'] = df.title.apply(lambda x: len(re.sub(r'[^\w\s]', ' ', str(x)).split()))

In [6]:
# Count the words in each content value.
# A value whose string form splits into at most one token is counted directly
# from str(x); only multi-token values have punctuation replaced by spaces
# before counting. Presumably the guard exists for non-string values such as
# missing data, which re.sub cannot take -- TODO confirm.

df['content_word_count'] = df.content.apply(
    lambda x: len(re.sub(r'[^\w\s]', ' ', x).split())
    if len(str(x).split()) > 1
    else len(str(x).split())
)

In [7]:
# Preview the two new word-count columns for index labels 0 through 2.

df[['title_word_count', 'content_word_count']].loc[:2]

Unnamed: 0,title_word_count,content_word_count
0,15,158
1,3,38
2,16,103


### Evaluation

In [8]:
# Summary statistics for the two word-count features.

df.loc[:, ['title_word_count', 'content_word_count']].describe()

Unnamed: 0,title_word_count,content_word_count
count,208806.0,208806.0
mean,13.489814,200.479435
std,5.430777,91.308269
min,1.0,1.0
25%,10.0,144.0
50%,13.0,196.0
75%,17.0,256.0
max,105.0,5664.0


### Filter content column

In [9]:
# Determine whether the string is pseudo-English by removing punctuation and checking that what remains is alphanumeric

def isEnglish(s):
    """Return True when ``s`` is alphanumeric once punctuation is stripped.

    This is a rough "pseudo English" filter, not a language detector:
    ``str.isalnum`` accepts letters and digits from any script, so text made
    only of non-Latin letters still passes.

    Parameters
    ----------
    s : object
        Value to test. It is converted with ``str()`` first, so non-string
        values (numbers, missing values) are judged by their string form
        instead of raising AttributeError on ``.translate``.

    Returns
    -------
    bool
        Single token (or empty string): True only if the text, with ASCII
        punctuation removed, is non-empty and entirely alphanumeric.
        Several tokens: True if every token left after punctuation removal
        is alphanumeric.
    """
    text = str(s)

    # Strip ASCII punctuation; both branches below start from this.
    no_ascii_punct = text.translate(str.maketrans('', '', string.punctuation))

    # If the text cannot be split into several tokens, assess it as is.
    # Surrounding whitespace is not trimmed, and an empty result is not
    # alphanumeric, so '.' and '' both give False.
    if len(text.split()) <= 1:
        return no_ascii_punct.isalnum()

    # Several tokens: also blank out any remaining non-word character, then
    # assess each token.
    # NOTE(review): after this substitution every remaining token consists only
    # of word characters, so the check below looks unable to fail -- confirm
    # whether multi-word text with non-ASCII symbols was meant to be rejected.
    cleaned = re.sub(r'[^\w\s]', ' ', no_ascii_punct)
    return all(word.isalnum() for word in cleaned.split())

In [10]:
# Show a sample of the unique content values with 3 or fewer words, and flag
# whether each one passes the isEnglish check.

show_lst = df.query('content_word_count<=3').content.unique()
# Column labels are given as a list: a set has no defined order and newer
# pandas releases refuse a set for `columns`.
df1 = pd.DataFrame(show_lst, columns=['show_lst'])
df1['isEng'] = df1.show_lst.apply(lambda x: isEnglish(str(x)))
df1.query('isEng == False')[:5]

Unnamed: 0,show_lst,isEng
52,.,False
56,-,False
57,Thepresynapticproteina-synuclein(aSyn)isan‘int...,False
61,戦後中国における日本人戦犯裁判の戦い〜正義とその正当性,False
63,/,False


In [11]:
# Collect the index labels of rows whose content has 3 or fewer words.

drop_indexes = df.loc[df['content_word_count'] <= 3].index
print('Dropping:', len(drop_indexes), ' rows')

# Remove those rows from df in place.

df.drop(index=drop_indexes, inplace=True)

Dropping: 3272  rows


In [12]:
# Unique content values among rows with 5 or fewer words; show the first 20.

drop_lst = df.loc[df['content_word_count'] <= 5, 'content'].unique().tolist()
drop_lst[:20]

['Unity in politics and religion.',
 'Please read abstract in article.',
 'Abstract available in article.',
 'Abstract  available in article.',
 'Refer to the document',
 'We derive approximations for the',
 '. The detection, thanks to the',
 'ATCA 21 cm H',
 'ENGLISH ABSTRACT: No abstract available',
 'No abstract is available',
 'Erratum to: JHEP08(2011)054',
 'Transcriptional regulation in vitro',
 'It was the mid 1990s [...]',
 'A description of Cambridge colleges',
 'Analysis of Cambridge College system',
 'This is a corrigendum',
 'rave stars in K2',
 'not applicable for letter',
 'Can inflammation cause depression?',
 'This is a commentary']

### Save Final Data

In [13]:
# Preview the first two rows before saving.
df.head(n=2)

Unnamed: 0,articletype_id,ptr_id,sdg_lst,authors,title,content,keywords,date,year,month,...,3_listed_sdg,4_listed_sdg,5_listed_sdg,6_listed_sdg,7_listed_sdg,8_listed_sdg,9_listed_sdg,language,title_word_count,content_word_count
0,13.0,13,"2, 14","['Jansen, Jonathan D.']",Autonomy and accountability in the regulation ...,This article examines the struggles of the Sou...,['accountability' 'autonomy' 'democracy' 'educ...,2006-01-27,2006.0,1.0,...,0,0,0,0,0,0,0,English,15,158
1,14.0,14,2,"['Jansen, Jonathan D.']",Intellectuals under fire,Looks at the status of intellectuals in South ...,['cultural policy' 'democracy' 'intellectuals'...,2006-01-27,2006.0,1.0,...,0,0,0,0,0,0,0,English,3,38


In [14]:
# Write the selected columns (including the two word-count features) to the
# processed data folder, leaving out the DataFrame index.
df.to_csv(
    '../../data/processed/'+'preprocessed_data_'+save_1,
    columns=[
        'articletype_id',
        'ptr_id',
        'sdg_lst',
        'authors',
        'title',
        'content',
        'keywords',
        'date',
        'year',
        'month',
        'issn',
        'doi',
        'handle',
        'institution_id',
        'institution',
        'active',
        'no_listed_sdg',
        '1_listed_sdg',
        '2_listed_sdg',
        '3_listed_sdg',
        '4_listed_sdg',
        '5_listed_sdg',
        '6_listed_sdg',
        '7_listed_sdg',
        '8_listed_sdg',
        '9_listed_sdg',
        'language',
        'title_word_count',
        'content_word_count',
    ],
    index=False,
)