In [46]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
INPUT_FOLDER = "isw_to_text"
INPUT_DATA_FILE = "isw_text.csv"

OUTPUT_FOLDER = "ready_text"
OUTPUT_DATA_FILE = "ready_text.csv"
df=pd.read_csv(f"{INPUT_FOLDER}/{INPUT_DATA_FILE}", sep=";")

In [3]:
def remove_one_letter_word(data):
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [4]:
def convert_lower_case(data):
    return np.char.lower(data)

In [27]:
def remove_stop_words(data):
    stop_words = set(stopwords.words('english')) 
    stop_stop_words = {"no", "not"}
    stop_words = stop_words - stop_stop_words

    words = word_tokenize(str(data))

    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [6]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+—./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
        data = np.char.replace(data, ', ', ' ')
    return data


In [7]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [8]:
def stemming(data):
    stemmer= PorterStemmer()

    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

def lemmatizing(data):
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + lemmatizer.lemmatize(w)
    return new_text

In [9]:
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        if w.isdigit():
            if int(w)<100000000000:
                w = num2words(w)
            else:
                w = ' '
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "−", " ")

    return new_text

In [18]:
def remove_url_string(data):
    words = word_tokenize(str(data))

    new_text = ""
    for w in words:
        w = re.sub(r'^https?:\/\/.*[\r\n]*', '', str(w), flags=re.MULTILINE)
        w = re.sub(r'^http?:\/\/.*[\r\n]*', '', str(w), flags=re.MULTILINE)

        new_text = new_text + " " + w

    return new_text

In [28]:
def preprocess(data, word_root_algo="lemm"):
    data = remove_one_letter_word(data)
    data = remove_url_string(data)
    data = convert_lower_case(data) # remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data) 
    data = stemming(data) 
    data = remove_punctuation(data)
    data = convert_numbers(data)
    if word_root_algo == "lemm":
        data = lemmatizing(data) 
    else:
        data = stemming(data)   
    data = remove_punctuation(data)
    data = remove_stop_words(data) 
    return data

In [30]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
df['lemm'] = df["main_html_v8"].apply(lambda x: preprocess(x,"lemm"))
df['stemm'] = df["main_html_v8"].apply(lambda x: preprocess(x,"stemm"))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...


In [31]:
df

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v2,main_html_v8,lemm,stemm
0,2022-02-24,2022-02-24.html,Russia-Ukraine Warning Update: Initial Russian...,Russia-Ukraine Warning Update: Initial Russian...,/backgrounder/russia-ukraine-warning-update-in...,"\nMason Clark, George Barros, and Kateryna S...",Russian President Vladimir Putin began a large...,Russian President Vladimir Putin began a large...,russian presid vladimir putin began large-sca...,russian presid vladimir putin began large-sc ...
1,2022-02-25,2022-02-25.html,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"Mason Clark, George Barros, and Kateryna Step...",Russian forces entered major Ukrainian cities—...,Russian forces entered major Ukrainian cities—...,russian forc enter major ukrainian city inclu...,russian forc enter major ukrainian citi inclu...
2,2022-02-26,2022-02-26.html,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"Mason Clark, George Barros, and Katya Stepane...",\nRussian forces’ main axes of advance in the ...,Russian forces’ main axes of advance in the la...,russian forc main axe advanc last twenty-four...,russian forc main axe advanc last twenty-four...
3,2022-02-27,2022-02-27.html,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"\nMason Clark, George Barros, and Kateryna St...",\nThe Russian military has likely recognized t...,The Russian military has likely recognized tha...,russian militari like recogn initi expect lim...,russian militari like recogn initi expect lim...
4,2022-02-28,2022-02-28.html,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...",/backgrounder/russian-offensive-campaign-asses...,"\nMason Clark, George Barros, and Kateryna S...",The Russian military is reorganizing its milit...,The Russian military is reorganizing its milit...,russian militari reorgan militari effort atte...,russian militari reorgan militari effort atte...
...,...,...,...,...,...,...,...,...,...,...
325,2023-01-21,2023-01-21.html,"Russian Offensive Campaign Assessment, January...","Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"\nKarolina Hird, Grace Mappes, Angela Howard...",.\nThe Ukrainian defense of Bakhmut is likely ...,.The Ukrainian defense of Bakhmut is likely a ...,ukrainian defens bakhmut like strateg sound e...,ukrainian defen bakhmut like strateg sound ef...
326,2023-01-22,2023-01-22.html,"Russian Offensive Campaign Assessment, Januar...","Russian Offensive Campaign Assessment, Januar...",/backgrounder/russian-offensive-campaign-asses...,"\nRussian Offensive Campaign Assessment, Janu...",.\nISW is publishing an abbreviated campaign u...,.ISW is publishing an abbreviated campaign upd...,isw publish abbrevi campaign updat today janu...,isw publish abbrevi campaign updat today janu...
327,2023-01-23,2023-01-23.html,"Russian Offensive Campaign Assessment, January...","Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"\nRussian Offensive Campaign Assessment, Janu...",.\nUkrainian intelligence assessed that Russia...,.Ukrainian intelligence assessed that Russian ...,ukrainian intellig ass russian forc prepar of...,ukrainian intellig assess russian forc prepar...
328,2023-01-24,2023-01-24.html,"Russian Offensive Campaign Assessment, January...","Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"\nKarolina Hird, Riley Bailey, Grace Mappes,...",.\nA coalition of NATO member states reportedl...,.A coalition of NATO member states reportedly ...,coalit nato member state reportedli send ukra...,coalit nato member state reportedli send ukra...


In [32]:
df.to_csv(f"{OUTPUT_FOLDER}/{OUTPUT_DATA_FILE}", sep=";",index = False)

In [37]:
df = pd.read_csv(f"{OUTPUT_FOLDER}/{OUTPUT_DATA_FILE}", sep=";")

In [38]:
df

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v2,main_html_v8,lemm,stemm
0,2022-02-24,2022-02-24.html,Russia-Ukraine Warning Update: Initial Russian...,Russia-Ukraine Warning Update: Initial Russian...,/backgrounder/russia-ukraine-warning-update-in...,"\nMason Clark, George Barros, and Kateryna S...",Russian President Vladimir Putin began a large...,Russian President Vladimir Putin began a large...,russian presid vladimir putin began large-sca...,russian presid vladimir putin began large-sc ...
1,2022-02-25,2022-02-25.html,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"Mason Clark, George Barros, and Kateryna Step...",Russian forces entered major Ukrainian cities—...,Russian forces entered major Ukrainian cities—...,russian forc enter major ukrainian city inclu...,russian forc enter major ukrainian citi inclu...
2,2022-02-26,2022-02-26.html,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"Mason Clark, George Barros, and Katya Stepane...",\nRussian forces’ main axes of advance in the ...,Russian forces’ main axes of advance in the la...,russian forc main axe advanc last twenty-four...,russian forc main axe advanc last twenty-four...
3,2022-02-27,2022-02-27.html,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"\nMason Clark, George Barros, and Kateryna St...",\nThe Russian military has likely recognized t...,The Russian military has likely recognized tha...,russian militari like recogn initi expect lim...,russian militari like recogn initi expect lim...
4,2022-02-28,2022-02-28.html,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...",/backgrounder/russian-offensive-campaign-asses...,"\nMason Clark, George Barros, and Kateryna S...",The Russian military is reorganizing its milit...,The Russian military is reorganizing its milit...,russian militari reorgan militari effort atte...,russian militari reorgan militari effort atte...
...,...,...,...,...,...,...,...,...,...,...
325,2023-01-21,2023-01-21.html,"Russian Offensive Campaign Assessment, January...","Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"\nKarolina Hird, Grace Mappes, Angela Howard...",.\nThe Ukrainian defense of Bakhmut is likely ...,.The Ukrainian defense of Bakhmut is likely a ...,ukrainian defens bakhmut like strateg sound e...,ukrainian defen bakhmut like strateg sound ef...
326,2023-01-22,2023-01-22.html,"Russian Offensive Campaign Assessment, Januar...","Russian Offensive Campaign Assessment, Januar...",/backgrounder/russian-offensive-campaign-asses...,"\nRussian Offensive Campaign Assessment, Janu...",.\nISW is publishing an abbreviated campaign u...,.ISW is publishing an abbreviated campaign upd...,isw publish abbrevi campaign updat today janu...,isw publish abbrevi campaign updat today janu...
327,2023-01-23,2023-01-23.html,"Russian Offensive Campaign Assessment, January...","Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"\nRussian Offensive Campaign Assessment, Janu...",.\nUkrainian intelligence assessed that Russia...,.Ukrainian intelligence assessed that Russian ...,ukrainian intellig ass russian forc prepar of...,ukrainian intellig assess russian forc prepar...
328,2023-01-24,2023-01-24.html,"Russian Offensive Campaign Assessment, January...","Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"\nKarolina Hird, Riley Bailey, Grace Mappes,...",.\nA coalition of NATO member states reportedl...,.A coalition of NATO member states reportedly ...,coalit nato member state reportedli send ukra...,coalit nato member state reportedli send ukra...


In [41]:
docs = df["lemm"].values.tolist()

In [43]:
cv = CountVectorizer(max_df=0.98,min_df=2)
word_count_vector = cv.fit_transform(docs)

word_count_vector.shape

(330, 7384)

In [44]:
with open("model/count_vectorizer_v1.pkl", "wb") as handle:
    pickle.dump(cv, handle)

In [48]:
tfidf_transformer=TfidfTransformer(smooth_idf=True, use_idf=True,)
tfidf_transformer.fit(word_count_vector)

In [50]:
with open("model/tfidf_transformer_v1.pkl", "wb") as handle:
    pickle.dump(tfidf_transformer, handle)

In [52]:
df_idf = pd.DataFrame(tfidf_transformer.idf_, index = cv.get_feature_names_out(), columns=["idf_weights"])
#  sort ascending
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
war,1.021375
staff,1.021375
luhansk,1.021375
attack,1.021375
area,1.021375
...,...
grinder,5.703506
grip,5.703506
grodno,5.703506
servicemen,5.703506


In [53]:
tf_idf_vector = tfidf_transformer.transform(word_count_vector)

In [55]:
tfidf = pickle.load (open("model/tfidf_transformer_v1.pkl", "rb"))
cv = pickle.load(open("model/count_vectorizer_v1.pkl", "rb"))

In [56]:
# you cam only needs to do this once, this is mapping of index to
feature_names=cv.get_feature_names_out()

In [57]:
feature_names

array(['000', '03', '10', ..., 'zyuganov', 'коридор', 'сухопутный'],
      dtype=object)

In [63]:
def sort_coo(coo_matrix):
    tuples = zip (coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):


    # use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        # keen track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zio(feature_vals, score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    return results

def conver_doc_to_vector (doc):
    feature_names = cv.get_feature_names_out()
    top_n = 100
    tf_idf_vector=tfidf.transform(cv.transform([doc]))
    
    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    keywords=extract_topn_from_vector(feature_names,sorted_items, top_n)
    
    return keywords

In [64]:
df['keywords'] = df['lemm'].apply(lambda x: conver_doc_to_vector(x))

In [65]:
df

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v2,main_html_v8,lemm,stemm,keywords
0,2022-02-24,2022-02-24.html,Russia-Ukraine Warning Update: Initial Russian...,Russia-Ukraine Warning Update: Initial Russian...,/backgrounder/russia-ukraine-warning-update-in...,"\nMason Clark, George Barros, and Kateryna S...",Russian President Vladimir Putin began a large...,Russian President Vladimir Putin began a large...,russian presid vladimir putin began large-sca...,russian presid vladimir putin began large-sc ...,"{'pm': 0.378, 'airport': 0.258, 'kyiv': 0.239,..."
1,2022-02-25,2022-02-25.html,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"Mason Clark, George Barros, and Kateryna Step...",Russian forces entered major Ukrainian cities—...,Russian forces entered major Ukrainian cities—...,russian forc enter major ukrainian city inclu...,russian forc enter major ukrainian citi inclu...,"{'februari': 0.345, 'pm': 0.328, 'zero': 0.322..."
2,2022-02-26,2022-02-26.html,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"Mason Clark, George Barros, and Katya Stepane...",\nRussian forces’ main axes of advance in the ...,Russian forces’ main axes of advance in the la...,russian forc main axe advanc last twenty-four...,russian forc main axe advanc last twenty-four...,"{'februari': 0.442, 'kyiv': 0.371, 'twenty': 0..."
3,2022-02-27,2022-02-27.html,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"\nMason Clark, George Barros, and Kateryna St...",\nThe Russian military has likely recognized t...,The Russian military has likely recognized tha...,russian militari like recogn initi expect lim...,russian militari like recogn initi expect lim...,"{'februari': 0.545, 'seven': 0.292, 'twenty': ..."
4,2022-02-28,2022-02-28.html,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...",/backgrounder/russian-offensive-campaign-asses...,"\nMason Clark, George Barros, and Kateryna S...",The Russian military is reorganizing its milit...,The Russian military is reorganizing its milit...,russian militari reorgan militari effort atte...,russian militari reorgan militari effort atte...,"{'februari': 0.578, 'eight': 0.299, 'twenty': ..."
...,...,...,...,...,...,...,...,...,...,...,...
325,2023-01-21,2023-01-21.html,"Russian Offensive Campaign Assessment, January...","Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"\nKarolina Hird, Grace Mappes, Angela Howard...",.\nThe Ukrainian defense of Bakhmut is likely ...,.The Ukrainian defense of Bakhmut is likely a ...,ukrainian defens bakhmut like strateg sound e...,ukrainian defen bakhmut like strateg sound ef...,"{'januari': 0.508, 'teplinski': 0.225, 'milblo..."
326,2023-01-22,2023-01-22.html,"Russian Offensive Campaign Assessment, Januar...","Russian Offensive Campaign Assessment, Januar...",/backgrounder/russian-offensive-campaign-asses...,"\nRussian Offensive Campaign Assessment, Janu...",.\nISW is publishing an abbreviated campaign u...,.ISW is publishing an abbreviated campaign upd...,isw publish abbrevi campaign updat today janu...,isw publish abbrevi campaign updat today janu...,"{'prigozhin': 0.608, 'putin': 0.331, 'januari'..."
327,2023-01-23,2023-01-23.html,"Russian Offensive Campaign Assessment, January...","Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"\nRussian Offensive Campaign Assessment, Janu...",.\nUkrainian intelligence assessed that Russia...,.Ukrainian intelligence assessed that Russian ...,ukrainian intellig ass russian forc prepar of...,ukrainian intellig assess russian forc prepar...,"{'januari': 0.68, 'twenty': 0.279, 'thre': 0.2..."
328,2023-01-24,2023-01-24.html,"Russian Offensive Campaign Assessment, January...","Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"\nKarolina Hird, Riley Bailey, Grace Mappes,...",.\nA coalition of NATO member states reportedl...,.A coalition of NATO member states reportedly ...,coalit nato member state reportedli send ukra...,coalit nato member state reportedli send ukra...,"{'januari': 0.54, 'twenty': 0.194, 'bakhmut': ..."


In [66]:
OUTPUT_DATA_FILE2 = "vectorized.csv"
df.to_csv(f"{OUTPUT_FOLDER}/{OUTPUT_DATA_FILE2}", sep=";",index = False)