In [1]:
import numpy as np
import pandas as pd 
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import warnings
warnings.filterwarnings('ignore')

In [2]:
names = ['doi', 'text_id', 'text', 'sdg', 'labels_negative',
         'labels_positive', 'agreement']

# Load the tab-separated data, skipping rows the parser cannot handle.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv('osdg.csv', on_bad_lines='skip',
                 sep="\t", header=None, names=names)

# Discard the first parsed row. Because header=None, every line of the file
# is read as data -- presumably this row is the file's own header line
# (TODO confirm against osdg.csv).
df = df.iloc[1:, :]

# Remove the columns listed below; 'text', 'sdg' and 'agreement' remain.
df = df.drop(columns=['doi', 'text_id', 'labels_negative',
                      'labels_positive'])


In [3]:
# Load the `en_core_web_sm` spaCy pipeline (English tokenizer, tagger, parser and NER).
# NOTE(review): `nlp` is not referenced by any of the cells visible here -- confirm it is used later.
nlp = spacy.load("en_core_web_sm")

In [4]:
def get_top_n_words(corpus, n=None, max_features=72400, ngram_range=(1, 2)):
    """Return the top `n` terms of `corpus`, ranked by summed TF-IDF weight.

    Despite the name, terms are not ranked by raw frequency: each term's
    score is the sum of its TF-IDF weights over all documents in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        The documents to analyse.
    n : int or None, default None
        Number of (term, score) pairs to return; None returns every term.
    max_features : int or None, default 72400
        Upper bound on the vocabulary size, passed to `TfidfVectorizer`.
    ngram_range : tuple of (int, int), default (1, 2)
        Minimum and maximum n-gram length, passed to `TfidfVectorizer`.

    Returns
    -------
    list of (str, numpy.float32)
        (term, score) pairs sorted by score in descending order.
    """
    vectorizer = TfidfVectorizer(sublinear_tf=False,
                                 min_df=1,
                                 max_features=max_features,
                                 stop_words='english',
                                 ngram_range=ngram_range,
                                 dtype=np.float32)
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass, instead of tokenizing the corpus twice (fit, then transform).
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # Column sums give one aggregate score per vocabulary entry; flatten the
    # 1 x n_terms result so it can be indexed by column number.
    term_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()

    words_freq = [(term, term_scores[col])
                  for term, col in vectorizer.vocabulary_.items()]
    # Stable sort: tied scores keep their vocabulary_ order.
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [5]:
# For every distinct SDG label, print the 20 top-ranked terms of its texts.
for goal in df.sdg.unique():
    # Boolean mask picks the rows belonging to the current goal.
    goal_texts = df.loc[df.sdg == goal, 'text']
    print(f'\n\nSDG goal {goal}\n')
    print(get_top_n_words(goal_texts, n=20))



SDG goal 5

[('women', 143.77724), ('gender', 99.17344), ('countries', 62.29346), ('men', 57.057583), ('work', 53.005184), ('care', 48.279434), ('social', 47.68598), ('time', 43.62181), ('rights', 43.540306), ('equality', 42.422546), ('labour', 41.29094), ('education', 39.02199), ('family', 38.18262), ('violence', 37.86296), ('female', 37.489326), ('children', 36.80494), ('employment', 34.752563), ('oecd', 34.692043), ('gender equality', 34.504005), ('economic', 34.42297)]


SDG goal 11

[('urban', 41.75173), ('development', 33.167496), ('transport', 32.47507), ('public', 30.041132), ('city', 29.177776), ('cities', 29.152542), ('local', 25.922457), ('land', 25.837141), ('housing', 25.717167), ('government', 23.247772), ('national', 22.7793), ('areas', 22.3195), ('use', 21.506516), ('countries', 21.45858), ('planning', 20.961758), ('policy', 20.146246), ('services', 19.319157), ('economic', 18.892807), ('new', 18.392326), ('road', 18.111328)]


SDG goal 3

[('health', 90.463715), ('ca