In [1]:
# Dataset :- CIA Records Search Tool Metadata - https://data.world/cia-crest-files/cia-crest-archive-metadata
# Data file :-  crest_lite_4.csv
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# This notebook converts the collection class labels into numeric codes and then splits the data into two files:
# one with the textual features and one with the numerical/categorical features.

In [2]:
# Load the CREST metadata CSV into the working frame.
# The path uses forward slashes so it resolves to the datasets folder on
# every OS; a backslash-separated path only does so on Windows.
df = pd.read_csv('datasets/crest-after-step0.csv')

In [3]:
# Preview the first four rows of the loaded frame.
df.head(4)

Unnamed: 0,title,collection,document_number,release_decision,document_page_count,sequence_number,publication_date,content_type
0,BRIEFING TO COMPTROLLER'S OFFICE ON CLAS DIREC...,General_CIA_Records,CIA-RDP88G01332R001301470016-9,RIPPUB,3.0,16,"October 31, 1986",MEMO
1,UNAUTHORIZED DISCLOSURES OF CLASSIFIED INFORMA...,General_CIA_Records,CIA-RDP94B00280R001200040002-0,RIPPUB,10.0,2,"June 22, 1983",MEMO
2,"WARNOW SHIPYARD, WARNEMUENDE POLSKA ZEGLUGA MO...",General_CIA_Records,CIA-RDP80-00810A002500690001-1,RIPPUB,4.0,1,"November 3, 1953",REPORT
3,SOVIET MILITARY SHIPMENTS SOVIET MILITARY SHIP...,General_CIA_Records,CIA-RDP82-00457R008500360004-9,RIPPUB,2.0,4,"August 13, 1951",REPORT


In [4]:
# Encode every collection name as an integer id. factorize() numbers the
# distinct names in order of first appearance; element [0] of its result is
# the per-row array of ids.
df['collection_labels'] = pd.factorize(df['collection'])[0]

# Show how many rows received each id.
df['collection_labels'].value_counts()

4    15000
3    15000
2    15000
1    15000
0    15000
Name: collection_labels, dtype: int64

In [5]:
# Build a small lookup table: one row per distinct (collection name, id)
# pair, ordered by the integer id.
collection_labels_df = (
    df.loc[:, ['collection', 'collection_labels']]
    .drop_duplicates()
    .sort_values(by='collection_labels')
)
collection_labels_df

Unnamed: 0,collection,collection_labels
0,General_CIA_Records,0
15000,NGA_Records,1
30000,Scientific_Abstracts,2
45000,Consolidated_Translations,3
60000,Misc,4


In [9]:
# Save (pickle) the collection-labels lookup table for future use.
# The `with` block guarantees the file handle is closed even if
# pickle.dump raises, and the forward-slash path resolves to the
# pickle-intermediate-data folder on every OS, not only on Windows.
with open("pickle-intermediate-data/collection_labels_df.pickle", "wb") as pickle_save:
    pickle.dump(collection_labels_df, pickle_save)


In [6]:
# Convert the lookup table to a plain dict. With the (default) 'dict'
# orientation the result is nested as {column name: {row index: value}}.
label_dict = collection_labels_df.to_dict(orient='dict')

In [7]:
# Print the raw mapping to inspect its structure.
print(label_dict)

{'collection': {0: 'General_CIA_Records', 15000: 'NGA_Records', 30000: 'Scientific_Abstracts', 45000: 'Consolidated_Translations', 60000: 'Misc'}, 'collection_labels': {0: 0, 15000: 1, 30000: 2, 45000: 3, 60000: 4}}


In [6]:
# Dimensions of df as (rows, columns).
df.shape

(75000, 9)

In [7]:
# List the column names currently present in df.
df.columns

Index(['title', 'collection', 'document_number', 'release_decision',
       'document_page_count', 'sequence_number', 'publication_date',
       'content_type', 'collection_labels'],
      dtype='object')

In [8]:
# Inspect the last seven rows of df.
df.tail(7)

Unnamed: 0,title,collection,document_number,release_decision,document_page_count,sequence_number,publication_date,content_type,collection_labels
74993,SESSION INFORMATION. TASK/TARGET NO: 92-53-L. ...,Misc,CIA-RDP96-00789R002301250001-8,RIFPUB,4.0,1,"May 6, 1992",NOTES,4
74994,203653 - 203753 220124- 220223 222673- 222772 ...,Misc,CIA-RDP78-05867A000200480001-5,RIPPUB,100.0,1,"October 4, 1974",CAPCARD,4
74995,SESSION INFORMATION TARGET 93-211-P VIEWER 079...,Misc,CIA-RDP96-00789R002500190013-1,RIPPUB,3.0,13,"October 26, 1993",SUMMARY,4
74996,TRANSCRIPT REMOTE VIEWING SESSION 842 TRANSCRI...,Misc,CIA-RDP96-00788R000700150001-9,RIFPUB,22.0,1,"January 5, 1982",REPORT,4
74997,FEEDBACK ON STAR GATE PROJECTS 93-210 AND 93-2...,Misc,CIA-RDP96-00789R002500050009-1,RIPPUB,4.0,9,"December 3, 1993",LETTER,4
74998,INFORMATION ITEMS I. INTRODUCTION 1. WE BELIEV...,Misc,LOC-HAK-550-2-66-6,RIPLIM,3.0,66,"October 31, 1974",CABLE,4
74999,GROUND PHOTO CAPTION CARD 1239723 - 1239772 GR...,Misc,CIA-RDP78-05867A000621530010-2,RIPPUB,44.0,10,"October 4, 1976",CAPCARD,4


In [9]:
# Split the frame into two views that both carry the collection name and its
# integer label: one holding the free-text title, the other holding the
# remaining metadata columns.
data_text = df[['title', 'collection', 'collection_labels']]

data_numeric = df[['collection', 'document_number', 'release_decision', 'document_page_count', 'sequence_number',
                   'publication_date', 'content_type', 'collection_labels']]

# Write each view to its own CSV without the row index. Forward slashes keep
# the paths pointing into the datasets folder on every OS; a
# backslash-separated path only does so on Windows.
# Save the text data file with the labels.
data_text.to_csv('datasets/crest-data-text.csv', index=False)
# Save the numeric data file with the labels.
data_numeric.to_csv('datasets/crest-data-numeric.csv', index=False)

In [14]:
# Explore the data: report the extremes of the document page count.
# The message lists the maximum first, then the minimum.
page_counts = df['document_page_count']
print("Number of pages in the document varies from ", page_counts.max(),
      " to ", page_counts.min(), sep="")


Number of pages in the document varies from 1136.0 to 1.0


In [18]:
# Count, per collection, the rows whose content_type is exactly 'MEMO'.
# A single .loc selection picks the matching rows and the column together.
df.loc[df['content_type'] == 'MEMO', 'collection'].value_counts()

General_CIA_Records    3109
Misc                   1438
NGA_Records             324
Name: collection, dtype: int64

In [20]:
# Count the rows for each distinct content_type value, most frequent first.
df['content_type'].value_counts()

ABSTRACTS              15000
SCIENTIFIC ABSTRACT    12956
REPORT                 11951
CAPCARD                 6709
MEMO                    4871
CABLE                   4340
MF                      2475
OPEN SOURCE             2224
SCIENCEAB               1617
MISC                    1513
LETTER                  1500
REQ                     1274
FORM                    1038
NSPR                     835
IR                       772
MFR                      707
SUMMARY                  680
BRIEF                    623
HW                       616
NOTES                    504
SCIENTIFIC ABSTR         372
CONT                     215
MIN                      213
REGULATION               204
LIST                     197
IM                       181
PERRPT                   164
OPEN                     151
PHOTO                    150
STATEMENT                 98
PAPER                     86
RP                        85
BULL                      70
SS                        63
MAGAZINE      