In [None]:
import pandas as pd
import numpy as np

In [None]:
import nltk
from nltk.corpus import stopwords
import re
import string

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Newsgroups to load: two computing groups, two sports groups and one on atheism.
categories = [
    "comp.graphics",
    "comp.os.ms-windows.misc",
    "rec.sport.baseball",
    "rec.sport.hockey",
    "alt.atheism",
]

# Training split only, shuffled; headers, footers and quoted replies are
# requested to be stripped from every post.
dataset = fetch_20newsgroups(
    subset="train",
    categories=categories,
    shuffle=True,
    remove=("headers", "footers", "quotes"),
)

In [None]:
# One row per post; the raw message text lives in the "corpus" column.
df = pd.DataFrame({"corpus": dataset.data})
df.head()

Unnamed: 0,corpus
0,I just moved from Borland C++ 3.0 to Visual C+...
1,\n: \tNice cop out bill.\n\nI'm sure you're ri...
2,I'm attempting to transfer files from my home ...
3,\nThen not murdering would have no moral signi...
4,\nI could give much the same testimonial about...


In [None]:
def _english_stopwords() -> frozenset:
    """Return NLTK's English stopwords as a frozenset, loading them only once.

    The corpus is read lazily on the first call, so the stopword data only has
    to be available by the time text is actually preprocessed, and the result
    is cached on the function object for all later calls.
    """
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def preprocess_text(text: str, remove_stopwords: bool) -> str:
    """Normalise a raw document into lower-case, letters-only text.

    Steps, in order:
      1. drop every ``http...`` link,
      2. replace each run of non-letter characters with a single space,
      3. optionally tokenize and drop English stopwords,
      4. lower-case and strip leading/trailing whitespace.

    Args:
        text: The raw document.
        remove_stopwords: When True, tokens whose lower-cased form is an NLTK
            English stopword are removed.

    Returns:
        The cleaned text, with tokens separated by single spaces when
        stopword removal is enabled.
    """
    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove special chars and numbers
    text = re.sub("[^A-Za-z]+", " ", text)
    # remove stopwords
    if remove_stopwords:
        # Set membership is O(1); the previous version rebuilt the stopword
        # list for every single token.
        stop_words = _english_stopwords()
        tokens = nltk.word_tokenize(text)
        text = " ".join(w for w in tokens if w.lower() not in stop_words)
    # return text in lower case and stripped of whitespaces
    return text.lower().strip()

In [None]:
# Fetch the NLTK resources needed for preprocessing: the stopword list and the
# tokenizer data used by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer data from "punkt_tab" instead of "punkt".
nltk.download('punkt_tab')

# Clean every post (stopwords removed) and keep the result next to the raw text.
# Extra keyword arguments given to Series.apply are forwarded to the function.
df['cleaned'] = df['corpus'].apply(preprocess_text, remove_stopwords=True)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Build the Word2Vec training corpus: one token list per cleaned document.
#
# The cleaned text contains only letters and spaces, so sentence splitting on
# the concatenated corpus cannot find any sentence boundary and would return
# the whole corpus as one enormous "sentence". gensim's Word2Vec documents a
# 10,000-token limit per training text, so such a corpus would be trained on
# only in part. Each cleaned document is therefore used as one training text;
# documents that became empty after cleaning are skipped.
all_sentences = [doc for doc in df['cleaned'] if doc]

# Full corpus as a single string, kept for inspection. Documents are joined
# with a space so the last word of one post is not fused with the first word
# of the next.
article_text = " ".join(all_sentences)

all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

**Bag of Words (BOW)**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the cleaned posts and encode every post as a row of
# raw term counts. The result is a sparse matrix (documents x vocabulary terms).
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(df['cleaned'])

# Summarizing the Encoded Texts
print("Encoded Document is:")
# Densify only the first few rows for display: converting the whole matrix to a
# dense array allocates documents x vocabulary cells just to print a preview.
X_train_counts[:5].toarray()

Encoded Document is:


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# vocabulary_ maps each learned term to its column index in the count matrix.
# Report its size and a small sample rather than printing every entry.
print("Vocabulary size: ", len(vectorizer.vocabulary_))
print("Vocabulary sample: ", dict(list(vectorizer.vocabulary_.items())[:10]))

# Column index of the term "moved" (None when the term is not in the vocabulary).
print(vectorizer.vocabulary_.get('moved'))

18924


**TfidfVectorizer**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer with default settings on the cleaned posts; X holds
# one weighted row per post.
tf = TfidfVectorizer()
X = tf.fit_transform(df["cleaned"])

# Echo the matrix summary (shape, dtype, number of stored values).
X

<2852x34245 sparse matrix of type '<class 'numpy.float64'>'
	with 172191 stored elements in Compressed Sparse Row format>

In [None]:
# Preview the first few TF-IDF rows only; densifying the full sparse matrix
# allocates documents x vocabulary floats just to display a truncated repr.
X[:5].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

**Word2Vec CBOW**

In [None]:
from gensim.models import Word2Vec

In [None]:
# gensim 4 renamed the Word2Vec constructor argument `iter` to `epochs` and no
# longer accepts the old spelling, so use whichever keyword the installed
# version understands.
import gensim

_epochs_kwarg = "epochs" if int(gensim.__version__.split(".")[0]) >= 4 else "iter"

# `sg` is left at its default here; the skip-gram variant is requested
# explicitly with sg=1 where it is needed.
model = Word2Vec(all_words,
                 min_count=3,   # Ignore words that appear less than this
                 workers=2,     # Number of processors (parallelisation)
                 window=5,      # Context window for words during training
                 **{_epochs_kwarg: 30})  # Number of training passes over the corpus

In [None]:
# gensim >= 4 exposes the learned vocabulary as `wv.key_to_index`; older
# releases expose it as `wv.vocab`. Support both so the cell runs on either.
vocabulary = getattr(model.wv, "key_to_index", None)
if vocabulary is None:
    vocabulary = model.wv.vocab

# Show the vocabulary size and a short sample instead of dumping every word.
print(len(vocabulary), "words in the Word2Vec vocabulary")
print(list(vocabulary)[:20])



In [None]:
# Words closest to "mail" in the learned embedding, with their similarity scores.
# NOTE(review): the recorded scores are all ~0.9999, even for unrelated words,
# which suggests the vectors are barely differentiated - verify the training corpus.
model.wv.most_similar('mail')

[('thanks', 0.9998859167098999),
 ('anyone', 0.9998844265937805),
 ('much', 0.9998840689659119),
 ('world', 0.9998773336410522),
 ('look', 0.9998770952224731),
 ('say', 0.9998764991760254),
 ('education', 0.9998764395713806),
 ('could', 0.9998745918273926),
 ('high', 0.9998743534088135),
 ('please', 0.9998729228973389)]

In [None]:
# Words closest to "moral" in the learned embedding, with their similarity scores.
model.wv.most_similar('moral')

[('religion', 0.9999416470527649),
 ('many', 0.9999408721923828),
 ('point', 0.9999348521232605),
 ('religious', 0.9999338388442993),
 ('atheist', 0.9999337196350098),
 ('christian', 0.9999336004257202),
 ('things', 0.9999336004257202),
 ('example', 0.9999324083328247),
 ('something', 0.9999322295188904),
 ('way', 0.9999316334724426)]

In [None]:
# Similarity score between the vectors of two individual words.
model.wv.similarity('mail', 'please')

0.999873

In [None]:
# gensim 4 renamed the Word2Vec constructor argument `iter` to `epochs` and no
# longer accepts the old spelling, so use whichever keyword the installed
# version understands.
import gensim

_epochs_kwarg = "epochs" if int(gensim.__version__.split(".")[0]) >= 4 else "iter"

# Same hyper-parameters as the previous model, but with sg=1 to train the
# skip-gram architecture. Note that this rebinds `model`.
model = Word2Vec(all_words,
                 min_count=3,   # Ignore words that appear less than this
                 workers=2,     # Number of processors (parallelisation)
                 window=5,      # Context window for words during training
                 sg=1,          # 1 = skip-gram
                 **{_epochs_kwarg: 30})  # Number of training passes over the corpus

In [None]:
# Same "moral" query as before; in top-to-bottom execution order `model` now
# refers to the model trained with sg=1, so the neighbours can be compared.
model.wv.most_similar('moral')

[('universe', 0.9967254400253296),
 ('feel', 0.9965368509292603),
 ('gods', 0.9958205223083496),
 ('ask', 0.9957951307296753),
 ('different', 0.9954997897148132),
 ('beliefs', 0.9953424334526062),
 ('meaning', 0.99485182762146),
 ('certainly', 0.9947781562805176),
 ('philosophical', 0.9947673678398132),
 ('behaviour', 0.9947668313980103)]

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Two-sentence toy corpus to make the bag-of-words encoding easy to read by eye.
text = [
    'Here is the first letter here.',
    'This document is the second letter.']

coun_vect = CountVectorizer()
count_matrix = coun_vect.fit_transform(text)
count_array = count_matrix.toarray()

# Use a dedicated name instead of `df`: rebinding `df` here would replace the
# newsgroup DataFrame, and earlier cells that read df['corpus'] / df['cleaned']
# would then fail when re-run.
toy_counts_df = pd.DataFrame(data=count_array, columns=coun_vect.get_feature_names_out())
toy_counts_df.head()

Unnamed: 0,document,first,here,is,letter,second,the,this
0,0,1,2,1,1,0,1,0
1,1,0,0,1,1,1,1,1


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Two-sentence toy corpus for inspecting TF-IDF weights by hand.
corpus = [
    "Here is the first letter.",
    "This document is the second letter.",
]

# NOTE(review): this rebinds `vectorizer`, which earlier in the notebook refers
# to the CountVectorizer fitted on the newsgroup posts.
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(corpus)

# Learned terms, in column order.
print(vectorizer.get_feature_names_out())

['document' 'first' 'here' 'is' 'letter' 'second' 'the' 'this']


In [None]:
# Display the sparse TF-IDF matrix summary (shape, dtype, stored elements).
x

<2x8 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [None]:
# Dense, labelled view of the toy TF-IDF matrix: one column per learned term.
# NOTE(review): this rebinds `df`, which earlier in the notebook holds the
# newsgroup posts.
df = pd.DataFrame(
    x.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
df.head()

Unnamed: 0,document,first,here,is,letter,second,the,this
0,0.0,0.533098,0.533098,0.379303,0.379303,0.0,0.379303,0.0
1,0.470426,0.0,0.0,0.334712,0.334712,0.470426,0.334712,0.470426
