In [1]:
import pandas as pd
from pandas import DataFrame
import collections
from collections import Counter
import re
import csv
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import RegexpTokenizer
%matplotlib inline
file_path = "/Users/spb/desktop/2019-05-01-2021-05-29-Ethiopia copy.csv"
df = pd.read_csv(file_path, sep= ',', header=0)

'''
This section is straight from David's preprocessing 
'''
def remove_num(list):
    pattern = '[0-9]'
    list = [re.sub(pattern, '', i) for i in list]
    return list
df["notes"] = remove_num(df["notes"])

tokenizer = RegexpTokenizer(r'\w+')
df["notes"] = df["notes"].apply(lambda x: tokenizer.tokenize(x.lower()))

lemmatizer = WordNetLemmatizer()
def word_lemmatizer(text):
    lem_text = [lemmatizer.lemmatize(i) for i in text]
    return lem_text
df["notes"] = df["notes"].apply(lambda x: word_lemmatizer(x))

def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words
df["notes"] = df["notes"].apply(lambda x: remove_stopwords(x))

def remove_month(text):
    dates = ['january', 'feburary', 'march','april','may','june','july','august',
             'september','october','november','december']
    words =[w for w in text if w not in dates]
    return words
df["notes"] = df["notes"].apply(lambda x: remove_month(x))


'''
This is the way I made separate corpuses for each event_type
'''
protests_df = df.loc[df['event_type'] == "Protests"]
df1 = protests_df['notes']
pro_tolist = df1.to_numpy().tolist()
pro_txt = " ".join(str(x) for x in pro_tolist)

battles_df = df.loc[df['event_type'] == "Battles"]
df1 = battles_df['notes']
bat_tolist = df1.to_numpy().tolist()
bat_txt = " ".join(str(x) for x in bat_tolist)

exremvio_df = df.loc[df['event_type'] == "Explosions/Remote violence"]
df1 = exremvio_df['notes']
exremvio_tolist = df1.to_numpy().tolist()
exremvio_txt = " ".join(str(x) for x in exremvio_tolist)

riots_df = df.loc[df['event_type'] == "Riots"]
df1 = riots_df['notes']
riots_tolist = df1.to_numpy().tolist()
riots_txt = " ".join(str(x) for x in riots_tolist)

stratdev_df = df.loc[df['event_type'] == "Strategic developments"]
df1 = stratdev_df['notes']
stratdev_tolist = df1.to_numpy().tolist()
stratdev_txt = " ".join(str(x) for x in stratdev_tolist)

vaciv_df = df.loc[df['event_type'] == "Violence against civilians"]
df1 = vaciv_df['notes']
vaciv_tolist = df1.to_numpy().tolist()
vaciv_txt = " ".join(str(x) for x in vaciv_tolist)


'''
Placed all the text into a list format for my matrices
'''
corpus_total = [pro_txt, bat_txt, exremvio_txt, riots_txt, stratdev_txt, vaciv_txt]


'''
vectorized data
'''
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus_total)

vectorizer1 = TfidfVectorizer()
Y = vectorizer1.fit_transform(corpus_total)

terms = vectorizer1.get_feature_names()
total_vocab = vectorizer1.vocabulary_

'''
created different matrices
tfidf matrix:
'''
dfmatrix1 = pd.DataFrame(Y.toarray().transpose(),
                   index=terms)
'''
term_freq. matrix:
'''
dfmatrix2 = pd.DataFrame(X.toarray().transpose(),
                   index=terms)
'''
set column names for these matrices based on the order in which I placed text in the list (corpus_total)
'pro_doc' = Protests
'bat_doc' = Battles
'erv_doc' = Explosions/remote viol.
'riot_doc' = Riots
'strdev_doc' = strategic development
'vac_doc' = violence against civs

'''
dfmatrix1.columns = ['pro_doc','bat_doc','erv_doc', 'riot_doc', 'strdev_doc', 'vac_doc']
dfmatrix2.columns = ['pro_doc','bat_doc','erv_doc', 'riot_doc', 'strdev_doc', 'vac_doc']

print('tf-idf_score_matrix')
print(dfmatrix1)
print('term_freq(tf)_matrix')
print(dfmatrix2)

'''
I used these files to ultimately map my cluster #s to event_types
'''
# dfmatrix1.to_csv("/Users/spb/desktop/tfidf_takealook.csv")
# dfmatrix2.to_csv("/Users/spb/desktop/doc_term_takealook.csv")


'''
I pre-set in # of clusters to 6 and fit the model to my corpus_total
Created cluster centriods that would be used to find words that best fit the particular cluster
'''
true_k = 6
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=3, random_state=10).fit(Y)
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

'''
Created a visual showing top tf-idf words(showing 10 for example) for each cluster. They are listed highest 
to lowest score in each cluster segment 
'''
def word_clustering(k_num, centriods, features):
    for i in range(k_num):
        print('Cluster %d:' % i),
        for ind in centriods[i, :10]:
            print(' %s' % features[ind])
word_clustering(true_k, order_centroids, terms)

'''
Here I attempt to load in the dataset to get a label prediction from the kmeans model setup, result is an array
of label predictions 0 to 5
'''
notes_input = df["notes"].tolist()
res = [' '.join(ele) for ele in notes_input]
char_remove = [re.compile(r'\W*\b\w{1,2}\b').sub("", i) for i in res]
test = vectorizer1.transform(char_remove)
prediction = model.predict(test)

'''
After that I take the same data set predications were ran on and I covert the event types to #s 0 to 5 
By labeling this here I am able to see what the model chose for each line in the entire document 
I was only able to map the cluster #s to the event type by printing the clusters words (pervious) and 
searching for thier scores in a exported cvs file
'''
df_raw = pd.read_csv(file_path, sep= ',', header=0)
df1_raw = df_raw.dropna(axis=0, how='any', thresh=None, subset=['notes'], inplace=False)
df2_raw = df1_raw.event_type.replace({"Violence against civilians": 2, "Battles": 0, 'Protests': 5, 
                             'Strategic developments': 3, 'Explosions/Remote violence': 4, 'Riots': 1})
orginal_array = df2_raw.to_numpy()
pred_array_cleaned = prediction

'''
This is my rudimentary way of counting all the event types of the actual file and then counting all the 
predictions of that file

I subtracted actaul from prediction count numbers. Sadly it has only proven to be 65% to 70% accurate depending 
on size of the intial document ingested. Using roughly two years worth of data had the highest accuracy 
compared to 20 years or even 6 months.
'''
elements_count = collections.Counter(pred_array_cleaned)
elements_count1 = collections.Counter(orginal_array)
print('\n')
print('prediction data:')
for key, value in elements_count.items():
    print(f"{key}: {value }")
print('\n')
print('orginal data:')
for key1, value1 in elements_count1.items():
    print(f"{key1}: {value1}")


tf-idf_score_matrix
           pro_doc   bat_doc   erv_doc  riot_doc  strdev_doc   vac_doc
aba       0.000000  0.000963  0.000000  0.000000    0.000000  0.003324
ababa     0.026436  0.000652  0.055470  0.022091    0.067777  0.014397
abadu     0.000000  0.000000  0.000000  0.005530    0.000000  0.000000
abala     0.006976  0.000000  0.000000  0.004534    0.000000  0.000000
abatimbo  0.000000  0.000587  0.000000  0.000000    0.000000  0.000000
...            ...       ...       ...       ...         ...       ...
zigem     0.000000  0.000587  0.000000  0.000000    0.000000  0.000000
ziway     0.003927  0.000000  0.000000  0.003828    0.000000  0.002807
zonal     0.000000  0.000000  0.000000  0.000000    0.030537  0.000000
zone      0.042801  0.016030  0.069337  0.027000    0.162665  0.149371
zufan     0.000000  0.000294  0.000000  0.000000    0.000000  0.000000

[2693 rows x 6 columns]
term_freq(tf)_matrix
          pro_doc  bat_doc  erv_doc  riot_doc  strdev_doc  vac_doc
aba            