In [1]:
import pandas as pd
import matplotlib
import re
from tqdm import tqdm
import numpy as np
import scipy.sparse as sp

In [2]:
train = pd.read_csv("dataset/train.csv")
train

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL
...,...,...,...
19574,id17718,"I could have fancied, while I looked at it, th...",EAP
19575,id08973,The lids clenched themselves together as if in...,EAP
19576,id05267,"Mais il faut agir that is to say, a Frenchman ...",EAP
19577,id17513,"For an item of news like this, it strikes us i...",EAP


In [3]:
train.describe()

Unnamed: 0,id,text,author
count,19579,19579,19579
unique,19579,19579,3
top,id26305,"This process, however, afforded me no means of...",EAP
freq,1,1,7900


In [4]:
unique_words = set()

for i in range(train.shape[0]):
    text = train["text"][i]
    words = re.findall("[a-z0-9]+", text.lower())
#     print(words[:5])
    unique_words.update(words)

In [5]:
len(unique_words)

25078

In [6]:
unique_words

{'perceiving',
 'predisposing',
 'rudiment',
 'twist',
 'typographical',
 'impromptu',
 'aaem',
 'strolling',
 'consults',
 'fantastic',
 'celebrity',
 'intensity',
 'fascinated',
 'ceased',
 'signed',
 'denis',
 'egypt',
 't',
 'donner',
 'garrulousness',
 'lore',
 'inscrutable',
 'vd',
 'redeem',
 'cup',
 'commented',
 'talons',
 'thronging',
 'consultations',
 'prospect',
 'disregarding',
 'genoese',
 'dewy',
 'fractured',
 'experimentally',
 'degrading',
 'gulph',
 'juice',
 'alchemy',
 'blue',
 'dove',
 'deliverer',
 'am',
 'bags',
 'fanatic',
 'encyclopaedia',
 'agraffas',
 'astonishing',
 'genealogy',
 'definitiveness',
 'bellows',
 'main',
 'baptism',
 'escondida',
 'accurate',
 'saracen',
 'torrents',
 'inexplicably',
 'steals',
 'desperadoes',
 'herb',
 'potency',
 'schroeter',
 'javelin',
 'occurring',
 'threatens',
 'southern',
 'chinlessness',
 'mal',
 'caked',
 'yelping',
 'hogsheads',
 'cartridge',
 'dilation',
 'vanish',
 'engineers',
 'incessant',
 'islet',
 'shrewd',


In [7]:
# Removing Stopwords

relevant_words = unique_words
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

for stopword in stopwords:
    try:
        relevant_words.remove(stopword)
    except KeyError:
        pass

In [8]:
len(relevant_words)

24951

In [9]:
columns = sorted(list(relevant_words))
columns

['aaem',
 'ab',
 'aback',
 'abaft',
 'abandon',
 'abandoned',
 'abandoning',
 'abandonment',
 'abaout',
 'abased',
 'abasement',
 'abashed',
 'abashment',
 'abate',
 'abated',
 'abatement',
 'abating',
 'abb',
 'abbey',
 'abbeys',
 'abbreviation',
 'abdicated',
 'abdication',
 'abdications',
 'abdomen',
 'abdul',
 'abernethy',
 'aberrancy',
 'aberrant',
 'aberration',
 'aberrations',
 'abeyance',
 'abhor',
 'abhorred',
 'abhorrence',
 'abhorrent',
 'abide',
 'abigail',
 'abijah',
 'abilities',
 'ability',
 'abject',
 'abjure',
 'ablaze',
 'able',
 'ably',
 'abnormal',
 'abnormalities',
 'abnormality',
 'abnormally',
 'aboard',
 'abode',
 'abodes',
 'abolished',
 'abominable',
 'abomination',
 'abominations',
 'aboriginal',
 'abortion',
 'abortions',
 'abortive',
 'abounded',
 'aboundingly',
 'abounds',
 'abra',
 'abreast',
 'abroad',
 'abrupt',
 'abruptly',
 'abruptness',
 'absconded',
 'absence',
 'absences',
 'absense',
 'absent',
 'absolute',
 'absolutely',
 'absolved',
 'absorb',
 

In [10]:
wordIndex = {}
for idx,word in enumerate(columns):
    wordIndex[word] = idx

In [11]:
# lil_matrix is better in changing sparse structure than csr_matrix.
dimX = train.shape[0]
dimY = len(columns)
featureMatrix = sp.lil_matrix((dimX,dimY),dtype=int)
featureMatrix.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [12]:
#  Mapping for each column : j => [i1,i2,i3...] i.e given a fixed j ,for all i such that featureMatrix[i,j] = 1 
trueRows = {}

for word in columns:
    trueRows[wordIndex[word]] = []

for i in range(train.shape[0]):
    text = train["text"][i]
    words = re.findall("[a-z0-9]+", text.lower())
    for word in words:
        if word in stopwords:
            continue
        trueRows[wordIndex[word]].append(i)

In [13]:
for col,rowlist in tqdm(trueRows.items()):
    featureMatrix[rowlist,col] = 1

100%|██████████| 24951/24951 [00:01<00:00, 15501.83it/s]


In [14]:
# Once lil_matrix has served its purpose , convert it to csr_matrix for faster arithmetic, vectorization
# and being able to save it

featureMatrix = featureMatrix.tocsr()

In [15]:
featureMatrix.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [16]:
len(featureMatrix.toarray()[0].tolist())

24951

In [17]:
# first training example word indices

text = train["text"][0]
words = re.findall("[a-z0-9]+", text.lower())
indices = [
    wordIndex[word] for word in words if word not in stopwords
]
indices = sorted(indices)
indices

[453,
 1286,
 1636,
 3702,
 6102,
 6862,
 8189,
 10730,
 13337,
 13640,
 13875,
 16003,
 16564,
 17069,
 18513,
 19478,
 19625,
 23225,
 24168,
 24416,
 24641]

In [18]:
# first training example word indices check with above
np.where(featureMatrix.toarray()[0]==1) == np.array(indices)

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True]])

In [19]:
featureMatrix.shape

(19579, 24951)

In [20]:
sp.save_npz(file = "feature_matrix.npz", matrix = featureMatrix)