In [1]:
# in this file we are goona work on the text data, extract features from text and build models

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer


In [2]:
df = pd.read_csv(r'datasets\crest-data-text.csv')

In [3]:
df.shape

(75000, 3)

In [4]:
df.columns

Index(['title', 'collection', 'collection_labels'], dtype='object')

In [5]:
df.tail(20)

Unnamed: 0,title,collection,collection_labels
74980,TTB/PNE TALKS CORRECTION OF ERRATA IN HUNGARIA...,Misc,4
74981,GROUND PHOTO CAPTION CARD 1249104 - 1249153 GR...,Misc,4
74982,GROUND PHOTO CAPTION CARD 1332733 - 1332782 GR...,Misc,4
74983,CY SULZBERGER'S VISA REQUEST POSSIBLE CIA COUR...,Misc,4
74984,GROUND PHOTO CAPTION CARD 1193034 - 1192989 (U...,Misc,4
74985,PRENSA LATINA POLLS PEKING 'OBSERVERS' ON NIXO...,Misc,4
74986,"CABLE TO THE WHITE HOUSE, HENRY A. KISSINGER F...",Misc,4
74987,"PROJECT NUMBER: 8802, SESSION NUMBER: 2, VIEWE...",Misc,4
74988,Q&A SESSION WITH SECRETARY OF DEFENSE CASPAR W...,Misc,4
74989,GROUND PHOTO CAPTION CARD 1389170 - 1389216 GR...,Misc,4


In [7]:
# split the titles as training data set
#train = df[['title']]
#train.columns

Index(['title'], dtype='object')

In [6]:
# 1. Feature extraction : number of words
# basic idea is to extract number words in each title 
# intuition behind with this technique is - some collection/categories need longer title than others

# make a new feature  word_count and calculate the words by splitting the strin on white space
df['title_word_count'] = df['title'].apply( lambda x : len(str(x).split(" ") ) )

df.head(5)

Unnamed: 0,title,collection,collection_labels,title_word_count
0,BRIEFING TO COMPTROLLER'S OFFICE ON CLAS DIREC...,General_CIA_Records,0,51
1,UNAUTHORIZED DISCLOSURES OF CLASSIFIED INFORMA...,General_CIA_Records,0,28
2,"WARNOW SHIPYARD, WARNEMUENDE POLSKA ZEGLUGA MO...",General_CIA_Records,0,42
3,SOVIET MILITARY SHIPMENTS SOVIET MILITARY SHIP...,General_CIA_Records,0,25
4,VITAL MATERIALS DEPOSITS VITAL MATERIALS PROGR...,General_CIA_Records,0,24


In [14]:
# 2. Feature : Average word length (Sum categories might need longer words than others)
# formula : Sum (length of all the words in the tweet or doc ) / (total length of the tweet or doc)

def avg_word_length(sentence):
    words = sentence.split()
    return ( sum( len(word) for word in words ) / len(words))

df['avg_word_len'] = df['title'].apply( lambda x: avg_word_length(x) )

# print
df.head(5)

Unnamed: 0,title,collection,collection_labels,title_word_count,avg_word_len
0,BRIEFING TO COMPTROLLER'S OFFICE ON CLASDIRECT...,General_CIA_Records,0,46,6.434783
1,UNAUTHORIZED DISCLOSURES OF CLASSIFIED INFORMA...,General_CIA_Records,0,23,8.782609
2,"WARNOW SHIPYARD, WARNEMUENDEPOLSKA ZEGLUGA MOR...",General_CIA_Records,0,37,7.513514
3,SOVIET MILITARY SHIPMENTSSOVIET MILITARY SHIPM...,General_CIA_Records,0,20,8.2
4,VITAL MATERIALS DEPOSITSVITAL MATERIALS PROGRA...,General_CIA_Records,0,19,8.263158


In [None]:
# let's start cleaning the data 

In [15]:
# 1. Lower casing  - change all the words to lower case to avoid duplication. Because "Python" and "python" considered 2 words
# we will split the title into words and then convert those words into lower case and then join

df['title'] = df['title'].apply( lambda x : " ".join( x.lower() for x in x.split() ) )
df['title'].head(5)

0    briefing to comptroller's office on clasdirect...
1    unauthorized disclosures of classified informa...
2    warnow shipyard, warnemuendepolska zegluga mor...
3    soviet military shipmentssoviet military shipm...
4    vital materials depositsvital materials progra...
Name: title, dtype: object

In [16]:
# 2. Remove punctuations and special charactaers
df['title'] = df['title'].str.replace('[^\w\s]','')
# the [^\w\s] means remove everything, keep only words(w) and spaces(s)
# this step should be done after feature extraction like hashtags, user tagged
df['title'].head()

0    briefing to comptrollers office on clasdirecto...
1    unauthorized disclosures of classified informa...
2    warnow shipyard warnemuendepolska zegluga mors...
3    soviet military shipmentssoviet military shipm...
4    vital materials depositsvital materials progra...
Name: title, dtype: object

In [21]:
# 3 . Removal of stop words - the, a , and etc. These are most commonly occuring words, and may created irrevelent 
# baises to our model

#import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

# we are gonna split the title into words and then filter the stop words and join them back
df['title'] = df['title'].apply( lambda x : " ".join( x for x in x.split() if x not in stop) )
df['title'].head(5)

0    briefing comptrollers office clasdirectorate a...
1    unauthorized disclosures classified informatio...
2    warnow shipyard warnemuendepolska zegluga mors...
3    military shipmentssoviet military shipmentssov...
4    vital materials depositsvital materials progra...
Name: title, dtype: object

In [19]:
# 4. Frequent word removal from the text; text which are not stopwords
# first we will take whole tweets data and split into words and then calculate their frequency
#  join words only with strings so, there needs to be some string to join other string
all_words = ' '.join( df['title'] ).split()
freq = pd.Series(all_words).value_counts()[:20]
freq

abstract       64740
fr             33157
photo          22458
caption        22263
card           22122
development    17031
back           16336
scientific     15989
report         15645
v              14087
session        11647
viewer          9601
project         8846
mission         8480
soviet          7316
research        7280
ii              6967
ussr            6915
zhurnal         6910
g               6403
dtype: int64

In [20]:
# 4a)the freq words might add some bias to our model and not contribute to our model 
# remove those common words which are  ouccring more than 10,000 times using "freq"
df['title'] = df['title'].apply( lambda x: " ".join( x for x in x.split() if x not in freq ) )
df.head(5)

Unnamed: 0,title,collection,collection_labels,title_word_count,avg_word_len
0,briefing comptrollers office clasdirectorate a...,General_CIA_Records,0,46,6.434783
1,unauthorized disclosures classified informatio...,General_CIA_Records,0,23,8.782609
2,warnow shipyard warnemuendepolska zegluga mors...,General_CIA_Records,0,37,7.513514
3,military shipmentssoviet military shipmentssov...,General_CIA_Records,0,20,8.2
4,vital materials depositsvital materials progra...,General_CIA_Records,0,19,8.263158


In [None]:
# 4a, remove words which are less than 2 letters, might be type while converting from OCR
# -- no need, because those words could be abbreviations and have special coded meaning
#df['title'] = df['title'].apply( lambda x: " ".join( x for x in x.split() if len(x)>2 ) )

In [22]:
# 4c) remove untitled word, becuase it was given when title was not present properly
df['title'] = df['title'].apply( lambda x: " ".join( x for x in x.split() if x not in ["UNTITLED"] ) )
df.head(5)


Unnamed: 0,title,collection,collection_labels,title_word_count,avg_word_len
0,briefing comptrollers office clasdirectorate a...,General_CIA_Records,0,46,6.434783
1,unauthorized disclosures classified informatio...,General_CIA_Records,0,23,8.782609
2,warnow shipyard warnemuendepolska zegluga mors...,General_CIA_Records,0,37,7.513514
3,military shipmentssoviet military shipmentssov...,General_CIA_Records,0,20,8.2
4,vital materials depositsvital materials progra...,General_CIA_Records,0,19,8.263158


In [25]:
# 5. Remove Rare words, which will not contribute to our model
all_words = ' '.join( df['title'] ).split()
rarely = pd.Series(all_words).value_counts()[-200000:]
rarely.sort_values

<bound method Series.sort_values of road                       282
travel                     281
five                       281
1951                       281
description                281
rybnoe                     281
associated                 281
enlarger                   281
nauch                      280
kommunist                  280
ns                         280
section                    280
print                      280
kozlov                     280
metals                     280
info                       279
republic                   279
handling                   278
saryshagan                 278
kudryavtsev                278
airborne                   278
1116                       278
made                       278
camp                       277
odessa                     276
district                   276
advisory                   275
pp                         275
fm                         275
ussrnew                    275
                          ... 
efr

In [27]:
# the abover words might not contribute to our model
# remove those rare words 
df['title'] = df['title'].apply( lambda x: " ".join( x for x in x.split() if x not in rarely ) )
df['title'].head(5)

0    briefing office administration fy executive hu...
1                 classified committee request richard
2    shipyard present polish school information 2 g...
3              military military military supply depot
4    materials materials materials system materials...
Name: title, dtype: object

In [29]:
# Check again for  rare words
all_words = ' '.join( df['title'] ).split()
rarely = pd.Series(all_words).value_counts()
rarely.sort_values

<bound method Series.sort_values of streak                      6382
economic                    6355
system                      6290
3                           6235
referativnyy                6216
intelligence                6144
plan                        6135
na                          6029
military                    6020
n                           5909
sun                         5840
l                           5759
space                       5720
russian                     5707
nuclear                     5633
b                           5477
first                       5448
identifier                  5421
information                 5401
rv                          5332
engineering                 5311
hungarian                   5294
phase                       5162
present                     5159
tests                       5068
radioactive                 5059
025                         5043
fiveyear                    5028
north                       5021
english

In [30]:
# 6. Spelling correction  -- not  necessary just need to remove the typos (rare words, which are accidently introduced)

# most common issue to deal with; spelling mistake, typos, shortcuts, abbreviatiosn (very common in when typing)
# spelling correction also help remove duplication that created by spelling mistakes like python, pythn, pythom etc.

# ** TextBlob is a Python library for processing textual data.
# Can be used for common NLP tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification,
# translation, and more.

# Spelling correction takes a lot of time. Also, we cannot always expect it to be accurate. 
# words are often used in their abbreviated form. For instance, ‘your’ is used as ‘ur’. We should treat this before the spelling
# correction step, otherwise these words might be transformed into any other word.

from textblob import TextBlob

# let's try spelling correction on 5 titles
# TextBlob takes one title and checks for each word for correction and corrects spellings and returns TextBlob; we can convert 
# into a string
#type(TextBlob("ysfunctional selfish drags kiss dysfu").correct())


# !!!!!!!!!!!!!!!!!!!!!!!!  Warning: This step is gonna take too much time maybe a few hours
#train['title'].apply( lambda x :  str( TextBlob(x).correct() ) )


In [31]:
# 7 . Lemmatization ; its preferred over stemming because if finds the root word
import nltk
nltk.download('wordnet')
from textblob import Word 



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Adarsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [32]:
# perform lemmatization on the title
df['title'] = df['title'].apply( lambda x : " ".join( [Word(word).lemmatize() for word in x.split()  ]) )
df['title'].head(5)

0    briefing office administration fy executive hu...
1                 classified committee request richard
2    shipyard present polish school information 2 g...
3              military military military supply depot
4    material material material system material mat...
Name: title, dtype: object

In [34]:
# we have two features now, let's have look
df.head(5)

Unnamed: 0,title,collection,collection_labels,title_word_count,avg_word_len
0,briefing office administration fy executive hu...,General_CIA_Records,0,46,6.434783
1,classified committee request richard,General_CIA_Records,0,23,8.782609
2,shipyard present polish school information 2 g...,General_CIA_Records,0,37,7.513514
3,military military military supply depot,General_CIA_Records,0,20,8.2
4,material material material system material mat...,General_CIA_Records,0,19,8.263158


In [36]:
# better save the file to avoid time consuming re-work on spelling correction and lemmatization
df.to_csv(r'datasets\1a-crest-after-text-cleaning.csv', 
                    index=False)

In [37]:
# Now our data cleaning has almost been done. It's time to extract more features :
# 1. n-grams
# 2. tf-idf
# 3. bag of words

df = pd.read_csv(r'datasets\1a-crest-after-text-cleaning.csv')

In [58]:
df.head(5)

Unnamed: 0,title,collection,collection_labels,title_word_count,avg_word_len
0,briefing office administration fy executive hu...,General_CIA_Records,0,46,6.434783
1,classified committee request richard,General_CIA_Records,0,23,8.782609
2,shipyard present polish school information 2 g...,General_CIA_Records,0,37,7.513514
3,military military military supply depot,General_CIA_Records,0,20,8.2
4,material material material system material mat...,General_CIA_Records,0,19,8.263158


In [56]:
# calculating tf-idf using scikitlearn
# sublinear_df  - is set to True to use a logarithmic form for frequency.
# min_df - is the minimum numbers of documents a word must be present in to be kept.
# norm - is set to l2, to ensure all our feature vectors have a euclidian norm of 1.
# ngram_range -  is set to (1, 2) to indicate that we want to consider both unigrams and bigrams.
# stop_words - is set to "english" to remove all common pronouns ("a", "the", ...) to reduce the number of noisy features.

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=50, norm='l2', encoding='latin-1', ngram_range=(1, 3))

features = tfidf.fit_transform(df.title.values.astype('str')).toarray()
labels = df.collection_labels

In [42]:
# let's find some interesting corelated unigrams and bigrams and trigrams
from sklearn.feature_selection import chi2
import numpy as np


# create a dataframe to lookup which label value belongs to which collection?
#category_id_df = df[['Product', 'category_id']].drop_duplicates().sort_values('category_id')
collection_labels_df = df[['collection', 'collection_labels']].drop_duplicates().sort_values('collection_labels')


category_to_id = dict(collection_labels_df.values)
id_to_category = dict(collection_labels_df[['collection', 'collection_labels']].values)

N = 5
for collection, collection_labels in sorted(category_to_id.items()):
    
    
    features_chi2 = chi2(features, labels == collection_labels)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    trigrams = [v for v in feature_names if len(v.split(' ')) == 3]
    
    print("# '{}':".format(collection))
    print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))
    print("  . Most correlated trigrams:\n       . {}".format('\n       . '.join(trigrams[-N:])))

# 'Consolidated_Translations':
  . Most correlated unigrams:
       . dok
       . meteorologiya
       . nauk
       . author
       . european
  . Most correlated bigrams:
       . zhurnalfrreferativ zhurfiz
       . khimiyafravramenko toreferativnyi
       . zhurfiz toreferativnyy
       . nauk sssr
       . dok nauk
  . Most correlated trigrams:
       . zhurfiz toreferativnyy zhurnalkhimrussia
       . dok nauk sssr
       . khimiyafravramenko toreferativnyi zhurnalfrreferativ
       . toreferativnyi zhurnalfrreferativ zhurfiz
       . zhurnalfrreferativ zhurfiz toreferativnyy
# 'General_CIA_Records':
  . Most correlated unigrams:
       . intelligence
       . william
       . weekly
       . security
       . cia
  . Most correlated bigrams:
       . william casey
       . director central
       . william webster
       . central intelligence
       . week ending
  . Most correlated trigrams:
       . concerning work site
       . week ending october
       . voucher purchase s

In [67]:
# further data exploration
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(df.title.values.astype('str'))

data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = df.index


Unnamed: 0,01,025,079photographs,10,1064555,1064604ground,1097550,11,1121391ground,1125198,...,yev,yeva,yu,zhur,zhurfiz,zhurnalfrreferativ,zhurnalkhimatics,zhurnalkhimfryefremenko,zhurnalkhimrussia,zscientific
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
data_dtm.columns

Index(['01', '025', '079photographs', '10', '1064555', '1064604ground',
       '1097550', '11', '1121391ground', '1125198',
       ...
       'yev', 'yeva', 'yu', 'zhur', 'zhurfiz', 'zhurnalfrreferativ',
       'zhurnalkhimatics', 'zhurnalkhimfryefremenko', 'zhurnalkhimrussia',
       'zscientific'],
      dtype='object', length=766)