In [1]:
import numpy as np
import pandas as pd
import string
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from sklearn.datasets import fetch_20newsgroups
X_train, y_train = fetch_20newsgroups(subset='train', return_X_y=True)
X_test, y_test = fetch_20newsgroups(subset='test', return_X_y=True)


In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
tokenizer = RegexpTokenizer(r'\b\w{3,}\b')
stop_words = list(set(stopwords.words("english")))
punctuation = list(string.punctuation)

stop_words += punctuation
stop_words += ['__', '___']

In [5]:
def preprocess(string):
    
    processed_str = re.sub(r"\S+@\S+", '', string) #Remove Email
    processed_str = re.sub(r"\S+.co\S+", '', processed_str) #Remove Website
    processed_str = re.sub(r"\S+.ed\S+", '', processed_str) #Remove website
    processed_str = re.sub(r"\S+.in\S+", '', processed_str) #Remove website
    processed_str = re.sub(r"\S+.edu\S+", '', processed_str) #Remove website
    processed_str = re.sub(r"[0-9]+", '', processed_str) #Remove numbers
    return processed_str

In [6]:
X_train = list(map(preprocess, X_train))
X_test  = list(map(preprocess, X_test))

In [7]:
tfidf_vectorizer = TfidfVectorizer(lowercase=True, 
                        stop_words=stop_words, 
                        tokenizer=tokenizer.tokenize, 
                        max_df=0.2,
                        min_df=0.02
                       )

train_sparse = tfidf_vectorizer.fit_transform(X_train)
train_df = pd.DataFrame(train_sparse.toarray(), 
                        columns=tfidf_vectorizer.get_feature_names())
train_df.head()



Unnamed: 0,able,accept,access,across,act,action,actually,add,address,advance,...,written,wrong,wrote,yeah,year,years,yes,yet,york,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.116068,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.35112,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.091827,0.0,0.0,0.0,0.08466,0.0,0.0,0.099914,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166543,0.0,...,0.0,0.0,0.146199,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.155993,0.155555,0.0,0.0


In [8]:
train_df.shape

(11314, 727)

In [9]:
from sklearn.decomposition import TruncatedSVD
lsa_model = TruncatedSVD(n_components=20, n_iter=100, random_state=42) #Reducing dimersion to 20

In [10]:
#Performing SVD decomposition on the TF-IDF matrix
tfidf_lsa = lsa_model.fit_transform(train_df)
S = lsa_model.singular_values_
V_T = lsa_model.components_.T

In [11]:
V_T.shape

(727, 20)

In [12]:
tfidf_lsa

array([[ 0.17972472,  0.03122431, -0.0791767 , ..., -0.21201794,
         0.08919038, -0.03944861],
       [ 0.16654364,  0.09024264, -0.03261853, ...,  0.00136429,
        -0.05589712, -0.02454946],
       [ 0.31459888,  0.06313662, -0.03134122, ..., -0.00734951,
        -0.0363778 ,  0.06233438],
       ...,
       [ 0.22499107,  0.08187131,  0.01345797, ..., -0.05637219,
         0.01262503, -0.05570488],
       [ 0.21207439,  0.05893502,  0.02056072, ..., -0.02133107,
        -0.00325343, -0.04632079],
       [ 0.15187908,  0.04816709, -0.01286988, ...,  0.04248345,
         0.18839341, -0.13397436]])

In [13]:
tfidf_lsa.shape #Data dimensionality reduced from 727 to 20

(11314, 20)

In [14]:
#use any word of your choice
query = 'work' 
query_vector = []
for col in train_df.columns:
  if(query == col):
    query_vector.append(1)
    continue
  query_vector.append(0)

In [15]:
len(query_vector)

727

In [16]:
 rel = V_T.transpose()*query_vector
 np.count_nonzero(rel)

20

In [17]:
A = train_df.to_numpy()
rel2 = A*query_vector
np.count_nonzero(rel2)

1533