In [2]:
import pandas as pd
import re

In [3]:
# Load the question data; lines=True makes pandas parse the file as one JSON
# record per line instead of a single JSON document.
df_idf = pd.read_json("data/stackoverflow-data-idf.json", lines=True)

In [45]:
# TODO: should be more sophisticated
def pre_process(text):
    """Normalize a raw document into lowercase, space-separated word tokens.

    Steps, in order:
      1. lowercase the whole string;
      2. replace anything shaped like ``<...>`` / ``</...>`` with a
         placeholder surrounded by spaces, so neighbouring words stay apart;
      3. collapse every run of digits and/or non-word characters
         (placeholders and whitespace included) into a single space.

    Underscores count as word characters and therefore survive.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string. Leading/trailing separators are reduced to one
        space each rather than stripped.
    """
    lowered = text.lower()

    # Non-greedy match, so each tag is replaced on its own.
    without_tags = re.sub("</?.*?>", " <> ", lowered)

    # The placeholder inserted above is itself non-word, so it vanishes here.
    return re.sub("(\\d|\\W)+", " ", without_tags)

In [5]:
# Build one text field per question from its title and body.
# The explicit " " separator keeps the last word of the title from fusing with
# the first word of the body when the body does not start with markup or
# punctuation; pre_process collapses any surplus whitespace into one space, so
# rows that already had a boundary are unaffected.
# The column is assigned once, already cleaned, so it never holds raw text.
df_idf['text'] = (df_idf['title'] + " " + df_idf['body']).apply(pre_process)

In [7]:
# Spot-check one cleaned document (entry 2 of the 'text' column).
df_idf['text'][2]

'gradle command line i m trying to run a shell script with gradle i currently have something like this def test project tasks create test exec commandline bash c bash c my file dir script sh the problem is that i cannot run this script because i have spaces in my dir name i have tried everything e g commandline bash c bash c my file dir script sh tokenize commandline bash c bash c my file dir script sh commandline bash c new stringbuilder append bash append c my file dir script sh commandline bash c bash c my file dir script sh file dir file c my file dir script sh commandline bash c bash dir getabsolutepath im using windows bit and if i use a path without spaces the script runs perfectly therefore the only issue as i can see is how gradle handles spaces '

In [25]:
def get_stopwords(stop_words_file):
    """Read a stop-word list from a UTF-8 text file.

    Each line of the file contributes one entry, with surrounding whitespace
    (including the newline) stripped.

    Args:
        stop_words_file: path of the text file to read.

    Returns:
        A frozenset of the stripped lines.
    """
    with open(stop_words_file, 'r', encoding='utf-8') as handle:
        # Iterating the handle yields the same lines readlines() would.
        return frozenset(line.strip() for line in handle)
    
# Load the stop-word list once; it is handed to CountVectorizer further down.
stopwords = get_stopwords("data/stopwords/stopwords.txt")

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

# One pre-processed string per question.
docs = df_idf['text'].tolist()

# max_df=0.85 drops terms that occur in more than 85% of the documents;
# max_features=10000 caps the vocabulary size.
# stop_words is passed as a list: scikit-learn >= 1.2 validates constructor
# arguments and accepts only 'english', a list, or None here, so the frozenset
# returned by get_stopwords would be rejected. Sorting keeps the order stable.
# NOTE(review): the warning recorded below this cell says the stop words are
# inconsistent with the tokenization -- presumably entries with apostrophes
# are split into pieces that are not themselves in the list; confirm and
# normalize the stop-word file if so.
cv = CountVectorizer(max_df=0.85, stop_words=sorted(stopwords), max_features=10000)

# Learn the vocabulary and build the document-term count matrix in one pass.
word_count_vector = cv.fit_transform(docs)

  'stop_words.' % sorted(inconsistent))


In [30]:
# (documents, vocabulary terms); the second dimension is capped by max_features.
word_count_vector.shape

(20000, 10000)

In [32]:
# Peek at ten terms of the learned vocabulary (iterating the mapping yields its keys).
list(cv.vocabulary_)[:10]

['serializing',
 'private',
 'struct',
 'public',
 'class',
 'contains',
 'properties',
 'string',
 'serialize',
 'attempt']

In [35]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the IDF weights from the document-term counts. Both options are spelled
# out explicitly: use_idf turns IDF reweighting on, smooth_idf smooths the
# document frequencies used to compute it.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [36]:
# Learned inverse-document-frequency weight for each vocabulary term.
tfidf_transformer.idf_

array([7.37717703, 9.80492526, 9.51724319, ..., 9.51724319, 8.19548735,
       8.33858819])

In [59]:
# TF-IDF vector of the first document: count its terms with the fitted
# vectorizer, then reweight the counts with the fitted transformer.
tfidf_vector_0 = tfidf_transformer.transform(cv.transform([docs[0]]))

In [44]:
# `feature_names` is otherwise only assigned in a later cell, so displaying it
# here raised NameError on a fresh kernel (Restart & Run All). Define it in
# place. get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back on older releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()
feature_names

['__',
 '__asm',
 '__b',
 '__call__',
 '__construct',
 '__cxx',
 '__dict__',
 '__dirname',
 '__file__',
 '__gnu_cxx',
 '__init__',
 '__main__',
 '__name__',
 '__str__',
 '_a',
 '_alt',
 '_blank',
 '_cb_',
 '_cbc_sha',
 '_channel',
 '_children',
 '_click',
 '_conf',
 '_context',
 '_cookie',
 '_data',
 '_db',
 '_e',
 '_emit',
 '_encode',
 '_files',
 '_get',
 '_id',
 '_img',
 '_index',
 '_instance',
 '_irqhandler',
 '_links',
 '_load',
 '_m',
 '_m_',
 '_max',
 '_missing',
 '_name',
 '_post',
 '_product',
 '_request',
 '_self',
 '_server',
 '_session',
 '_size',
 '_source',
 '_t',
 '_tab',
 '_target',
 '_txt',
 '_type',
 '_unicode_ci',
 '_user',
 '_v',
 '_viewmodel',
 '_work',
 '_wrapper',
 '_x',
 '_zn',
 '_znst',
 'aa',
 'aaa',
 'aaf',
 'aar',
 'ab',
 'abaqus',
 'abbr',
 'abbrev',
 'abc',
 'abcd',
 'abi',
 'ability',
 'able',
 'abort',
 'aborted',
 'abs',
 'absences',
 'absent',
 'absolute',
 'absolutely',
 'absquery',
 'abstract',
 'abstractapplicationcontext',
 'abstractautowirecapableb

In [54]:
def sort_COO(COO_matrix):
    """Rank the stored entries of a COO-style matrix by value, highest first.

    Args:
        COO_matrix: object exposing parallel ``col`` and ``data`` sequences
            (column index and value of each stored entry).

    Returns:
        A list of ``(col, value)`` tuples ordered by descending value; ties
        are broken by descending column index.
    """
    pairs = list(zip(COO_matrix.col, COO_matrix.data))
    # Sort key is (value, column) so equal values fall back to the column index.
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def topn(sorted_tuples, feature_names, topn):
    """Map the leading ``topn`` ranked entries to their feature names.

    Args:
        sorted_tuples: sequence of ``(feature_index, value)`` pairs, already
            in the desired order.
        feature_names: sequence indexable by feature index.
        topn: number of leading pairs to keep.

    Returns:
        A dict of feature name -> value, in the order of ``sorted_tuples``.
    """
    leading = sorted_tuples[:topn]
    return {feature_names[col]: score for col, score in leading}

In [53]:
# Display the pre-processed text of the first document.
docs[0]

'serializing a private struct can it be done i have a public class that contains a private struct the struct contains properties mostly string that i want to serialize when i attempt to serialize the struct and stream it to disk using xmlserializer i get an error saying only public types can be serialized i don t need and don t want this struct to be public is there a way i can serialize it and keep it private '

In [55]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back on old releases.
# .tolist() yields plain Python strings, matching what the old method returned.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()

# Rank the first document's stored TF-IDF entries and keep the ten strongest.
sorted_tuples = sort_COO(tfidf_vector_0.tocoo())
keywords = topn(sorted_tuples, feature_names, 10)

In [56]:
# Print each extracted keyword next to its TF-IDF score, in ranked order.
for term, score in keywords.items():
    print(term, score)

struct 0.6374968550012153
serialize 0.4424515787148237
private 0.26735989514215797
public 0.21354142007773283
contains 0.1897373423120556
xmlserializer 0.18560668932572483
serializing 0.18422279974733077
serialized 0.16646999213729438
disk 0.14643414879288635
don 0.14463781389819688


In [60]:
# TF-IDF vector of the second document, built the same way as tfidf_vector_0.
tfidf_vector_1 = tfidf_transformer.transform(cv.transform([docs[1]]))

In [61]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the TF-IDF vectors of the first two documents.
cossim = cosine_similarity(tfidf_vector_0, tfidf_vector_1)

In [62]:
# Display the similarity result (a 1x1 array: one row vector against one row vector).
cossim

array([[0.01485378]])