In [None]:
import pandas as pd
import re
import string
import nltk
import wordninja
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
print('imported')

In [None]:
# Load the hand-coding validation workbook into a DataFrame.
# NOTE: read_excel has no `encoding` parameter (Excel files carry their own
# encoding); passing one raises TypeError on current pandas, so it is omitted.
df = pd.read_excel(r'C:\Users\garet\Documents\Python Scripts\OneDrive\Dissertation\Method\Coding\Validation sheet for hand coding.xlsx')
print('File loaded as Excel Workbook')

df.shape

In [None]:
# Discard the columns listed below; the rest of this notebook only reads 'Hlead'.
unused_columns = ['Title', 'Publication', 'Headline', 'Short title', 'Short article', 'st', 'sa', 'Unnamed: 5', 'Unnamed: 6']
df = df.drop(columns=unused_columns)

In [None]:
# Count missing texts BEFORE the string conversion: astype(str) would turn NaN
# into the literal string 'nan', after which isnull() can no longer see it.
# The count is printed because a bare expression is only displayed when it is
# the last line of a cell.
missing_hlead = df['Hlead'].isnull().sum()
print('Missing Hlead values: %d' % missing_hlead)

# Replace missing values with an empty string so that no spurious 'nan' token
# reaches the tokenizer/vectorizer, then make sure every entry is a str.
df['Hlead'] = df['Hlead'].fillna('').astype(str)

# Preprocessing 

In [None]:
# Lower-case first so the patterns below only need their lower-case form.
hlead = df['Hlead'].str.lower()

# Strip digits. The pattern is a raw string and regex=True is explicit: newer
# pandas releases default to regex=False, which would search for the literal
# text '\d+' and leave every digit in place.
hlead = hlead.str.replace(r'\d+', '', regex=True)

# Distinctive markers: removed wherever they occur (plain substring match).
for marker in ('block-time', 'published-time', 'summary'):
    hlead = hlead.str.replace(marker, '', regex=False)

# Short time-stamp tokens: removed only as whole words. A plain substring
# match would also eat the middle of ordinary words ("development" ->
# "develoent", "obstacle" -> "oacle") and corrupt the later stems.
# Because digits are already gone, "3.45pm" has become ".pm", which still
# matches on the word boundary.
hlead = hlead.str.replace(r'\b(?:gmt|bst|aest|pm)\b', '', regex=True)

df['Hlead'] = hlead

# NOTE(review): wordninja.split() is intentionally not called here. Calling it
# in a loop without assigning its return value has no effect on df['Hlead'];
# it only costs run time. If run-together words really need re-segmenting,
# the result must be written back to the column - confirm first that this is
# wanted, since it would change which tokens reach the keyword count.

def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` deleted.

    Nothing is inserted in place of a removed character, so the characters on
    either side end up adjacent (e.g. a hyphenated word becomes one word).
    Characters outside ``string.punctuation`` are kept unchanged.
    """
    kept = []
    for char in text:
        if char in string.punctuation:
            continue
        kept.append(char)
    return ''.join(kept)

# Delete punctuation from every text.
hlead_no_punct = df['Hlead'].apply(remove_punctuation)

# Split each text into a list of tokens, a token being a run of \w characters.
tokenizer = RegexpTokenizer(r'\w+')
df['Hlead'] = hlead_no_punct.apply(tokenizer.tokenize)

In [None]:
#Stopword removal and stemming
def remove_stops(text, stop_words=None):
    """Return the tokens of ``text`` that are not stopwords.

    Parameters
    ----------
    text : iterable of str
        The tokens of one document.
    stop_words : collection of str, optional
        Words to discard. When omitted, NLTK's English stopword list is used.

    Returns
    -------
    list of str
        The tokens of ``text`` that are not in ``stop_words``, in their
        original order.
    """
    if stop_words is None:
        # Fetch the list once per call and keep it in a set. Evaluating
        # stopwords.words('english') inside the comprehension repeats the
        # fetch, and a linear scan of the list, for every single token.
        stop_words = set(stopwords.words('english'))
    return [w for w in text if w not in stop_words]

# Filter the stopwords out of every token list.
df['Hlead'] = df['Hlead'].apply(remove_stops)
print('done')

# Single Porter stemmer instance, looked up by word_stemmer at call time.
stemmer = PorterStemmer()

def word_stemmer(text):
    """Stem every token in ``text`` and return the stems as one string.

    Each token is passed to the module-level ``stemmer``'s ``stem`` method;
    the resulting stems are joined with single spaces, in the original order.
    """
    stems = map(stemmer.stem, text)
    return ' '.join(stems)

# Replace each token list with its stemmed, space-joined string, then show the column.
df['Hlead'] = df['Hlead'].apply(word_stemmer)
print(df['Hlead'])

# Frequency count with scikit-learn

In [None]:
#df.to_excel(r'C:\Users\garet\Documents\Python Scripts\OneDrive\Dissertation\Method\Coding\Dictionary\Dictionary count validation', index=False)
#print('''Saving complete. Year is now processed into two files (both in excel format): 
      #(1) preprocessed and (2) pre_processed and stemmed''')

In [None]:
# Bag-of-words vectoriser with scikit-learn's default settings.
vectorizer = CountVectorizer()

# Learn the vocabulary from the stemmed texts and count each term per text;
# the sparse result is then expanded into a dense document-term matrix.
sparse_counts = vectorizer.fit_transform(df['Hlead'])
dtm = sparse_counts.todense()

In [None]:
dtm.shape # (rows, columns) of the document-term matrix: one row per text in df['Hlead'], one column per vocabulary term.

In [None]:
# Vocabulary terms in column order. get_feature_names() no longer exists in
# recent scikit-learn releases; get_feature_names_out() replaces it. Fall back
# to the old method on installations that predate the new one, and convert to
# a plain list so `labels` has the same type either way.
if hasattr(vectorizer, 'get_feature_names_out'):
    labels = list(vectorizer.get_feature_names_out())
else:
    labels = list(vectorizer.get_feature_names())

# Mapping from each vocabulary term to its column index in the matrix.
index_dict = vectorizer.vocabulary_

keywords = ['abat', 'abm', 'abmt', 'abughraib', 'afghanistan', 'aggress', 'aid', 'airland', 'airpow', 'alert', 'allianc', 'alqaeda', 'alshabaab', 'ammunit', 'amphibi', 'anarchi', 'anthrax', 'antipersonnel', 'apt', 'arab', 'arm', 'armi', 'armedforc', 'armscontrol', 'armsrac', 'armstrad', 'assad', 'assassin', 'assault', 'asylum', 'asylumseek', 'atom', 'attack', 'author', 'baader', 'balanc', 'ballist', 'ballisticmissil', 'barrag', 'battl', 'binladen', 'bipolar', 'blitzkrieg', 'block', 'bloc', 'bokoharam', 'bomb', 'bombard', 'breivik', 'bullet', 'capabl', 'capitul', 'casualti', 'catastroph', 'ceasefir', 'central', 'intellig', 'agenc', 'charliehebdo', 'chechnya', 'cia', 'cluster', 'coerc', 'coercion', 'coerciv', 'collaps', 'coloni', 'combat', 'comp', 'comb', 'compel', 'compromis', 'concess', 'concili', 'conflict', 'congo', 'contain', 'control', 'cooper', 'counteract', 'counterinsurg', 'counterterror', 'counterterrorist', 'countervail', 'coup', 'crime', 'crimin', 'crisi', 'critic', 'cyberspher', 'cyberwar', 'daesh', 'damag', 'danger', 'decis', 'defenc', 'defend', 'defens', 'demilitaris', 'demilitiaris', 'demilitar', 'demobilis', 'destabilis', 'destabil', 'destruct', 'détent', 'destructionist', 'deter', 'deterr', 'dictat', 'dilemma', 'disarm', 'disarma', 'disast', 'diseas', 'disintegr', 'disobedi', 'disput', 'dissent', 'dissid', 'divis', 'domin', 'embargo', 'emerg', 'enemi', 'escal', 'evil', 'existenti', 'expeditionari', 'exploit', 'explos', 'extraordinari', 'faction', 'failedst', 'fascism', 'fascist', 'fear', 'fight', 'fought', 'firearm', 'fighter', 'firefight', 'fln', 'forbid', 'forbad', 'forbidden', 'forc', 'freedom', 'friction', 'fundament', 'fundamentalist', 'gang', 'ga', 'gass', 'gaz', 'genocid', 'guerrilla', 'guevara', 'gulf', 'hack', 'hacker', 'hama', 'hard', 'hazard', 'hebdo', 'hegemon', 'hegemoni', 'hezbollah', 'hijack', 'hiroshima', 'hiv', 'homeland', 'hostag', 'hussein', 'iaea', 'icc', 'ident', 'illeg', 'illegalis', 'illicit', 'immin', 'imminin', 'imperi', 
'imperialist', 'incumb', 'infiltr', 'inhuman', 'insecur', 'insurg', 'interdepend', 'interdict', 'interpol', 'intervent', 'invad', 'invas', 'ira', 'iran', 'iraq', 'isil', 'isi', 'israel', 'isra', 'jihadi', 'jihad', 'jihadist', 'kidnap', 'kill', 'korea', 'kosovo', 'kurd', 'kurdistan', 'kuwait', 'landmin', 'law', 'lebanon', 'liber', 'libya', 'malacca', 'mercaneri', 'migrant', 'migrat', 'militarili', 'militar', 'militari', 'militarist', 'militaryindustrialcomplex', 'militia', 'militiaman', 'militiamen', 'misogyni', 'missil', 'munit', 'munition', 'nagasaki', 'narcot', 'nation', 'nationalist', 'nato', 'nazi', 'nazism', 'netwar', 'nonprolifer', 'nuclear', 'offens', 'oil', 'osc', 'overwhelm', 'pakistan', 'palestin', 'paramilitari', 'partisan', 'phillipin', 'plo', 'polic', 'poverti', 'power', 'preempt', 'preemption', 'prevent', 'prolifer', 'protect', 'punit', 'racism', 'racist', 'radic', 'radicalis', 'rape', 'refuge', 'rescu', 'resist', 'resolut', 'resourc', 'respons', 'retali', 'reveng', 'revolut', 'ricin', 'rival', 'rogu', 'rwanda', 'sadam', 'safe', 'salt', 'salw', 'sanction', 'sarin', 'saudism', 'scarc', 'scarcer', 'scarcest', 'scarciti', 'seapow', 'secur', 'securitis', 'securit', 'separatist', 'separat', 'septemb', '11', 'shock', 'shortag', 'somalia', 'sovereign', 'sovereignti', 'stabilis', 'strategi', 'strateg', 'strike', 'suicid', 'superpow', 'surg', 'surveil', 'syria', 'tactic', 'tactician', 'target', 'terror', 'terrorist', 'threat', 'threaten', 'tortur', 'traffic', 'traffick', 'transnat', 'uae', 'upris', 'valu', 'vietnam', 'violenc', 'violent', 'war', 'warcrim', 'warcrimin', 'warlord', 'weapon', 'wmd', 'wound'] #your list of keywords connoting securitization will go in here. 

# Translate each keyword into its column index in the document-term matrix.
# A keyword that never occurs in the texts has no entry in the vocabulary; it
# is reported so it is visible which dictionary terms are absent.
key_word_index = []
for keyword in keywords:
    try:
        column = index_dict[keyword]
    except KeyError:
        # Only a missing vocabulary entry is expected here. Any other error
        # (or a keyboard interrupt) should surface rather than be reported as
        # "can't find".
        print("Can't find %s" % keyword)
    else:
        key_word_index.append(column)

print('done')

In [None]:
# For every document (one row of dtm), add up the counts held in the keyword
# columns only. The result is one total per document: the number of keyword
# occurrences in that document.
securitizing_words = [
    dtm[row, key_word_index].sum() for row in range(dtm.shape[0])
]

# Tally how many documents share each total. value_counts() returns a Series
# indexed by the total, ordered from the most to the least common.
counts = pd.Series(securitizing_words).value_counts()
print(counts)

In [None]:
# Save the tally of per-document keyword totals as an Excel workbook.
output_path = r'C:\Users\garet\Documents\Python Scripts\OneDrive\Dissertation\Method\Coding\Dictionary\Dictionary count for validation.xlsx'
counts.to_excel(output_path)