# Topic Analysis on DEC Keywords

In [1]:
# Import Packages
import pandas as pd
import numpy as np

In [2]:
# Import Data

dec_j2sr_data = pd.read_csv('Working_Data/dec_j2sr_data.csv')
print(dec_j2sr_data.shape)

(1149, 65)


In [3]:
# Most reports have nan values for the "New_Thesaurus_Terms" variable
dec_j2sr_data['New_Thesaurus_Terms'].isna().sum()

1093

In [4]:
# Descriptors_Topical, on the other hand, is rarely missing
dec_j2sr_data['Descriptors_Topical'].isna().sum()

8

In [5]:
dec_j2sr_data['Descriptors_Topical'] = dec_j2sr_data['Descriptors_Topical'].astype(str).str.replace(' ~\|~_©_~\|~', ",").str.lower()
dec_j2sr_data['Descriptors_Topical']

0       quality of care, orphans and vulnerable childr...
1       orphans and vulnerable children (ovc), health ...
2       climate change, disaster relief, vulnerable gr...
3       trade promotion, international trade, nontarif...
4       family health care, health service utilization...
                              ...                        
1144    accountability, family health care, family pla...
1145    hiv/aids, accountability, children, crimes, di...
1146    food security, poverty reduction, nutrition im...
1147    agricultural technology, crops, female empower...
1148    communes, communities, disease prevention and ...
Name: Descriptors_Topical, Length: 1149, dtype: object

In [6]:
topic_dummies = dec_j2sr_data['Descriptors_Topical'].str.get_dummies(sep=',')
print(topic_dummies.shape)
topic_dummies.head()

(1149, 1591)


Unnamed: 0,academic standards,accelerated education,access to credit,access to education,access to information,access to justice,access to resources,access to services,accidents,accountability,...,value chains,villages,violence,voters,water resources,water sanitation,water supply,watersheds,women's political leadership,youth
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
word_counts = topic_dummies.apply(pd.Series.value_counts)
word_counts

Unnamed: 0,academic standards,accelerated education,access to credit,access to education,access to information,access to justice,access to resources,access to services,accidents,accountability,...,value chains,villages,violence,voters,water resources,water sanitation,water supply,watersheds,women's political leadership,youth
0,1148,1148,1137,1124,1137,1142,1127,1063,1148,1108,...,1148,1148,1148,1148,1148,1148,1146,1148,1148,1138
1,1,1,12,25,12,7,22,86,1,41,...,1,1,1,1,1,1,3,1,1,11


In [8]:
# Calculate the words used greater than n times and export value counts to a CSV file

n = 10

top_words = word_counts.iloc[1,].sort_values(ascending=False)
top_words = top_words[top_words > n]
top_words.to_csv("Working_Data/top_words.csv")
top_words.shape

(216,)

In [9]:
# Restrict topic_dummies to a subset containing only the top n words

topic_sample = topic_dummies.loc[:,top_words.index.values]

# replace 0 values with nan, drop rows and columns with missing values for all cells, then replace nan with 0 again
topic_sample = topic_sample.replace(0, np.nan)
topic_sample = topic_sample.dropna(how='all', axis=0)
topic_sample = topic_sample.dropna(how='all', axis=1)
topic_sample = topic_sample.replace(np.nan, 0)

In [10]:
print(topic_sample.shape)
topic_sample

(1125, 216)


Unnamed: 0,governance,disease prevention and control,households,women,economic development,communities,accountability,access to services,female empowerment,hiv/aids,...,basic education,weather,economic infrastructure,languages,newspapers,embargoes,land management,youth,condoms,case management
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1144,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1145,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1147,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Find Clusters of Categories with K-Modes Unsupervised Learning

In [11]:
# https://medium.com/@davidmasse8/unsupervised-learning-for-categorical-data-dd7e497033ae
# https://www.kaggle.com/ashydv/bank-customer-clustering-k-modes-clustering
# https://pypi.org/project/kmodes/

# Use the kmodes package to calculate K-nearest neighbors categories for each word to cluster into groups
# K-modes is similar to k-means but more applicable for binary data

from kmodes.kmodes import KModes

# define the k-modes model
km = KModes(n_clusters=20, init='Huang', n_init=10, verbose=1)

# fit the clusters to the skills dataframe
clusters = km.fit_predict(topic_sample)

# get an array of cluster modes
kmodes = km.cluster_centroids_
shape = kmodes.shape

# For each cluster mode (a vector of "1" and "0")
# find and print the column headings where "1" appears.
# If no "1" appears, assign to "no-skills" cluster.

for i in range(shape[0]):
    if sum(kmodes[i,:]) == 0:
        print("\ncluster " + str(i) + ": ")
        print("no-topic cluster")
    else:
        print("\ncluster " + str(i) + ": ")
        cent = kmodes[i,:]
        for j in topic_sample.columns[np.nonzero(cent)]:
            print(j)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 208, cost: 4967.0
Run 1, iteration: 2/100, moves: 0, cost: 4967.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 169, cost: 5000.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 180, cost: 4989.0
Run 3, iteration: 2/100, moves: 0, cost: 4989.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 192, cost: 5181.0
Run 4, iteration: 2/100, moves: 77, cost: 5145.0
Run 4, iteration: 3/100, moves: 45, cost: 5145.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 249, cost: 5048.0
Run 5, iteration: 2/100, moves: 13, cost: 5046.0
Run 5, iteration: 3/100, moves: 0, cost: 5046.0
Init: initializing centroids
Init: initializing clusters
St

In [123]:
#topic_sample.loc[:,'clusters'] = clusters

In [124]:
#topic_sample.to_csv("topic_sample.csv")