In [1]:
import pandas as pd
import re
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alexander\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Alternative dataset (disabled):
# df_idf = pd.read_json("data/stackoverflow-data-idf.json", lines=True)
# lines=True tells pandas to parse the file as JSON Lines (one JSON object per line).
df_idf = pd.read_json("data/jsons/database1.json", lines=True)

## Text normalization

In [3]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alexander\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Single lemmatizer instance, reused by pre_process for every token.
word_net_lemmatizer = WordNetLemmatizer()

In [7]:
# TODO: make this text normalization more sophisticated
def pre_process(text):
    """Normalize a raw text into a space-separated string of lemmas.

    The text is lowercased, tag-like ``<...>`` spans are replaced, runs of
    digits and non-word characters are collapsed into single spaces, and the
    remaining tokens are lemmatized with the module-level
    ``word_net_lemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lowered = text.lower()

    # Replace anything that looks like a markup tag with a placeholder; the
    # placeholder consists of non-word characters only, so the next
    # substitution squashes it into a plain space.
    without_tags = re.sub("</?.*?>", " <> ", lowered)

    # Collapse every run of digits / non-word characters into one space.
    cleaned = re.sub("(\\d|\\W)+", " ", without_tags)

    lemmas = [
        word_net_lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(cleaned)
    ]
    return " ".join(lemmas)

In [8]:
# Build the normalized 'text' column from the message body.
# Alternative input (disabled): df_idf['title'] + df_idf['body']
df_idf['text'] = df_idf['body'].apply(pre_process)

In [9]:
# Peek at one normalized document (row label 2).
df_idf.loc[2, 'text']

'need a videographer to remove the backstage photo shoot dance video and maybe a vlog would love to have the camera removed'

## Stopwords

In [10]:
def get_stopwords(stop_words_file):
    """Load a stop-word list from a text file containing one word per line.

    Args:
        stop_words_file: Path to a UTF-8 encoded text file.

    Returns:
        A frozenset with the whitespace-stripped, non-empty lines of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(stop_words_file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of materializing every line first.
        stripped = (line.strip() for line in f)
        # Skip blank / whitespace-only lines so the empty string never
        # becomes a "stop word".
        return frozenset(word for word in stripped if word)

In [13]:
# Alternative: load a custom stop-word list from a file (disabled).
# stopwords = get_stopwords("data/stopwords/stopwords.txt")

# Use NLTK's English stop-word corpus; the corpus is downloaded before use.
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# Print the whole set for inspection.
print(stop_words)

{'y', 'how', 'your', 'down', 'into', 'above', 'wasn', "that'll", "don't", 'with', 'will', 'shan', 'yourselves', 'any', 'wouldn', "won't", 'just', 'being', 'an', 'hasn', "didn't", 'off', "aren't", 'o', 'then', 'am', 'below', 'more', 'ourselves', 'that', 'be', 'isn', 'nor', 'were', 'such', 'itself', 'for', 'these', 'been', "doesn't", "wouldn't", "shouldn't", 'his', 'in', 'mustn', 'themselves', 'my', 'its', 'here', 'than', 'at', 'after', 'over', 'himself', 'most', 'what', 'of', "hadn't", 'now', 'doing', 'weren', 'but', 'under', 'hers', 'not', 'is', 'our', "wasn't", 'until', 'once', 'about', 'as', 'ours', 'm', 'herself', 'and', 'all', 'out', "you're", 'some', 'other', 'can', 'no', 'couldn', "mightn't", 'this', "couldn't", 'him', 'we', 'they', 'there', 'if', 's', 'her', 'didn', 'a', 'both', 'he', 'you', 'so', "mustn't", 'she', 'did', 'hadn', 'the', 'on', 'shouldn', 'are', 'aren', "you'll", 've', 'needn', "it's", 'won', 'i', 'does', 'where', 'up', 'theirs', 'should', 'or', 'll', 'to', 'befor

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alexander\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

docs = df_idf['text'].tolist()
# Bag-of-words vocabulary: ignore terms that occur in more than 85% of the
# documents, drop English stop words and keep at most 10,000 terms.
# stop_words is passed as a list because scikit-learn >= 1.2 validates this
# parameter and rejects a set; sorting keeps the argument deterministic.
cv = CountVectorizer(max_df=0.85, stop_words=sorted(stop_words), max_features=10000)

# Sparse document-term count matrix (one row per document).
word_count_vector = cv.fit_transform(docs)

In [15]:
word_count_vector.shape

(9000, 7138)

In [16]:
# First ten vocabulary terms, in the dict's insertion order.
list(cv.vocabulary_)[:10]

['wedding',
 'photographer',
 'need',
 'february',
 'hour',
 'charge',
 'bride',
 'check',
 'south',
 'west']

In [17]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn IDF weights from the corpus-wide term counts. smooth_idf=True adds one
# to the document frequencies, which prevents divisions by zero.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [19]:
def sort_COO(COO_matrix):
    """Return the (column, value) pairs of a COO matrix, largest value first.

    Args:
        COO_matrix: Object exposing parallel ``col`` and ``data`` sequences.

    Returns:
        A list of ``(col, value)`` tuples sorted in descending order by value,
        with ties broken by descending column index.
    """
    pairs = list(zip(COO_matrix.col, COO_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def topn(sorted_tuples, feature_names, topn):
    """Map the first ``topn`` (index, score) pairs to ``{feature: score}``.

    Args:
        sorted_tuples: Sequence of ``(feature_index, score)`` pairs, already
            ordered by the caller.
        feature_names: Sequence indexed by feature index.
        topn: Maximum number of leading pairs to keep.

    Returns:
        A dict from feature name to score, in the order of ``sorted_tuples``.
    """
    return {
        feature_names[col]: score
        for col, score in sorted_tuples[:topn]
    }

In [20]:
docs[0]

'wedding photographer need a photographer for a wedding february from to or hour charge the bride check the south west of moscow'

In [21]:
# TF-IDF vector of the first document: count its terms with the fitted
# vectorizer, then reweight the counts with the fitted transformer.
tfidf_vector_0 = tfidf_transformer.transform(cv.transform([docs[0]]))

In [22]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# Rank the document's non-zero TF-IDF entries and keep the ten strongest terms.
sorted_tuples = sort_COO(tfidf_vector_0.tocoo())
keywords = topn(sorted_tuples, feature_names, 10)

In [23]:
# Show each extracted keyword next to its TF-IDF score.
for term, score in keywords.items():
    print(term, score)

west 0.408918139910497
wedding 0.39445350703255283
charge 0.382671737159973
south 0.36147329378014714
check 0.31765348550026157
february 0.3009251614760709
bride 0.2997670593120834
moscow 0.2146019947571679
photographer 0.20567555262653536
hour 0.15589967180316164


In [24]:
# TF-IDF vector of the second document, built the same way as tfidf_vector_0.
tfidf_vector_1 = tfidf_transformer.transform(cv.transform([docs[1]]))

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the TF-IDF vectors of the first two documents;
# the result is a 1x1 array because each input holds a single row.
cossim = cosine_similarity(tfidf_vector_0, tfidf_vector_1)

In [26]:
cossim

array([[0.]])