In [1]:
import pandas as pd
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter
from operator import itemgetter
nlp = spacy.load('en_core_web_sm')

In [40]:
import numpy as np

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

In [2]:
data = pd.read_csv("800-53 Mapping Mar 9.csv")

In [3]:
# Create control text dataframe for each pillar
identity_df = data.loc[data['Identity '] == 1][["Control Text"]] # remove the space after 'Identity' for new data
device_df = data.loc[data['Device'] == 1][["Control Text"]] 
net_env_df = data.loc[data['Network/Environment'] == 1][["Control Text"]]
app_workload_df = data.loc[data['Application Workload'] == 1][["Control Text"]]
data_df = data.loc[data['Data'] == 1][["Control Text"]]

In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Claire\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Claire\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Claire\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [45]:
def tokenize(column):
    tokens = nltk.word_tokenize(column)
    return [w for w in tokens if w.isalpha()] 

def remove_stopwords(tokenized_column):
    stops = set(stopwords.words("english"))
    return [word for word in tokenized_column if not word in stops]

# def lemmatizer(tokenized_column):
#     wordnet_lemmatizer = WordNetLemmatizer()
#     return [wordnet_lemmatizer.lemmatize(word) for word in tokenized_column]

def rejoin_words(tokenized_column): 
    return (" ".join(tokenized_column))

In [48]:
# Identity Data
identity_df['Control Text']= identity_df['Control Text'].apply(lambda x: x.lower())
identity_df['tokenized'] = identity_df.apply(lambda x: tokenize(x['Control Text']), axis=1)
identity_df['stopwords_removed'] = identity_df.apply(lambda x: remove_stopwords(x['tokenized']), axis=1)
# identity_df['msg_lemmatized'] = identity_df['stopwords_removed'].apply(lambda x: lemmatizer(x))
identity_df['rejoined'] = identity_df.apply(lambda x: rejoin_words(x['stopwords_removed']), axis=1)

with open('identity_text.txt', 'w') as f:
    f.write(identity_df['rejoined'].str.cat())

identity_text = open('identity_text.txt', encoding="utf8").read()


In [49]:
# Device Data
device_df['Control Text']= device_df['Control Text'].apply(lambda x: x.lower())
device_df['tokenized'] = device_df.apply(lambda x: tokenize(x['Control Text']), axis=1)
device_df['stopwords_removed'] = device_df.apply(lambda x: remove_stopwords(x['tokenized']), axis=1)
# device_df['msg_lemmatized'] = device_df['stopwords_removed'].apply(lambda x: lemmatizer(x))
device_df['rejoined'] = device_df.apply(lambda x: rejoin_words(x['stopwords_removed']), axis=1)

with open('device_text.txt', 'w') as f:
    f.write(device_df['rejoined'].str.cat())

device_text = open('device_text.txt', encoding="utf8").read()


In [51]:
# Network / Environment Data
net_env_df['Control Text']= net_env_df['Control Text'].apply(lambda x: x.lower())
net_env_df['tokenized'] = net_env_df.apply(lambda x: tokenize(x['Control Text']), axis=1)
net_env_df['stopwords_removed'] = net_env_df.apply(lambda x: remove_stopwords(x['tokenized']), axis=1)
# net_env_df['msg_lemmatized'] = net_env_df['stopwords_removed'].apply(lambda x: lemmatizer(x))
net_env_df['rejoined'] = net_env_df.apply(lambda x: rejoin_words(x['stopwords_removed']), axis=1)

with open('net_env_text.txt', 'w') as f:
    f.write(net_env_df['rejoined'].str.cat())

net_env_text = open('net_env_text.txt', encoding="utf8").read()


In [52]:
# Application Workload
app_workload_df['Control Text']= app_workload_df['Control Text'].apply(lambda x: x.lower())
app_workload_df['tokenized'] = app_workload_df.apply(lambda x: tokenize(x['Control Text']), axis=1)
app_workload_df['stopwords_removed'] = app_workload_df.apply(lambda x: remove_stopwords(x['tokenized']), axis=1)
# app_workload_df['msg_lemmatized'] = app_workload_df['stopwords_removed'].apply(lambda x: lemmatizer(x))
app_workload_df['rejoined'] = app_workload_df.apply(lambda x: rejoin_words(x['stopwords_removed']), axis=1)

with open('app_workload_text.txt', 'w') as f:
    f.write(app_workload_df['rejoined'].str.cat())

app_workload_text = open('app_workload_text.txt', encoding="utf8").read()


In [53]:
# Data
data_df = data_df.drop(data_df.index[28])
data_df['Control Text']= data_df['Control Text'].apply(lambda x: x.lower())
data_df['tokenized'] = data_df.apply(lambda x: tokenize(x['Control Text']), axis=1)
data_df['stopwords_removed'] = data_df.apply(lambda x: remove_stopwords(x['tokenized']), axis=1)
data_df['rejoined'] = data_df.apply(lambda x: rejoin_words(x['stopwords_removed']), axis=1)

with open('data_text.txt', 'w') as f:
    f.write(data_df['rejoined'].str.cat())

data_text = open('data_text.txt', encoding="utf8").read()


In [54]:
vectorizer = TfidfVectorizer()
corpus = [identity_text, device_text, net_env_text, app_workload_text, data_text]
X = vectorizer.fit_transform(corpus)

In [56]:
def top_tfidf_feats(row, features, top_n=25):
    '''
    Get top n TF-IDF values in row and return them with their corresponding feature names
    '''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    '''
    Top TF-IDF features in specific document (matrix row)
    '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

In [57]:
# Identity
feature_names = vectorizer.get_feature_names()
top_feats_in_doc(X, feature_names, 0)

Unnamed: 0,feature,tfidf
0,assignment,0.643319
1,access,0.250705
2,system,0.217593
3,information,0.17502
4,individuals,0.137178
5,security,0.132448
6,policy,0.127718
7,accounts,0.119668
8,personnel,0.118257
9,account,0.104118


In [58]:
# Device
feature_names = vectorizer.get_feature_names()
top_feats_in_doc(X, feature_names, 1)

Unnamed: 0,feature,tfidf
0,assignment,0.628649
1,system,0.350134
2,access,0.23077
3,devices,0.183024
4,information,0.135279
5,changes,0.127321
6,mobile,0.123025
7,remote,0.122309
8,policy,0.119364
9,components,0.119364


In [59]:
# Network/Environment
feature_names = vectorizer.get_feature_names()
top_feats_in_doc(X, feature_names, 2)

Unnamed: 0,feature,tfidf
0,assignment,0.654676
1,system,0.40963
2,information,0.1975
3,components,0.149954
4,systems,0.146296
5,following,0.135324
6,security,0.109722
7,access,0.09875
8,selection,0.095093
9,control,0.091435


In [60]:
# Application Workload
feature_names = vectorizer.get_feature_names()
top_feats_in_doc(X, feature_names, 3)

Unnamed: 0,feature,tfidf
0,assignment,0.577752
1,system,0.499412
2,components,0.166471
3,access,0.156678
4,changes,0.156678
5,security,0.146886
6,information,0.146886
7,policy,0.137094
8,subjects,0.117509
9,automated,0.107716


In [61]:
# Data
feature_names = vectorizer.get_feature_names()
top_feats_in_doc(X, feature_names, 4)

Unnamed: 0,feature,tfidf
0,assignment,0.495132
1,information,0.470979
2,security,0.326062
3,system,0.175108
4,access,0.156993
5,privacy,0.144917
6,attributes,0.138878
7,domains,0.135784
8,different,0.114225
9,filtering,0.114046
