# Imports

In [315]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, jaccard_similarity_score
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline


# Split dataset to train and test

In [316]:
categories = ['sci.space', 'comp.graphics', 'talk.politics.misc', 'rec.sport.hockey', 'comp.sys.mac.hardware'] 
remove = ('headers', 'footers', 'quotes')
# twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories = categories, remove = remove )
# twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories = categories, remove = remove )
train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories = categories, remove = remove )
test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories = categories, remove = remove )



# Train Profiles

## Create single train documents object with docs and targets

In [317]:
documents = tuple(zip(train.data, train.target))

In [318]:
# for doc in documents:
#     print(doc[1])

## Vectorise train data to: split docs by words, remove stop words.

In [319]:
vect = CountVectorizer(stop_words = 'english')
vect.fit(train.data)
# train_data = vect.transform(train.data)
# test_data = vect.transform(test.data)
# vectores.append((train_data, test_data))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

## Create general words list

In [341]:
words_list = vect.get_feature_names()
print(words_list[:5000])


['00', '000', '0000', '00000', '000000', '000005102000', '000062david42', '000100255pixel', '00041032', '0004136', '0004246', '0004422', '00044513', '0004847546', '0005', '0007', '00090711', '000k', '000usd', '0012', '001200201pixel', '0018', '00196', '0020', '0022', '0028', '0029', '0033', '0034', '0038', '0049', '005', '006', '0065', '0078', '0094', '0098', '00_', '00index', '00pm', '01', '0100', '01075', '011', '013', '013846', '013939', '014', '01752', '0179', '01801', '01821', '01826', '0184', '01852', '01854', '01890', '018b', '0199', '01a', '02', '020', '0200', '020359', '020637', '021', '02115', '02138', '02139', '02154', '02178', '0223', '0235', '023b', '0245', '0283', '03', '030', '0300', '0300ff', '03051', '0330', '034', '034101', '0358', '036', '037', '038', '04', '040', '0400', '040286', '041', '0410', '04110', '041493003715', '042', '0430', '0434', '0435', '045', '045651', '0458', '0483', '0486', '0488', '04g', '05', '050', '0500', '051', '0511', '053', '05402', '05446', 

## For each word in train set calculete number of documents

In [321]:
%%time

words_list_count = {}
for doc in documents: 
    for word in words_list:
        if word in doc[0]:
            if word in words_list_count:
                if doc[1] in words_list_count[word]:
                    words_list_count[word][doc[1]] += 1
                else:
                    words_list_count[word][doc[1]] = 1
            else:
                words_list_count[word] = {}
                words_list_count[word][doc[1]] = 1
                
            

CPU times: user 1min 17s, sys: 324 ms, total: 1min 17s
Wall time: 1min 18s


In [322]:
# example of calulation result
words_list_count['mac']

{0: 38, 1: 108, 2: 11, 3: 21, 4: 14}

## Calculate total number of docs for each class

In [323]:
total_docs_count = {}
for doc in documents:
    if doc[1] in total_docs_count:
        total_docs_count[doc[1]] += 1
    else:
        total_docs_count[doc[1]] = 1

In [324]:
total_docs_count.items()

dict_items([(3, 593), (2, 600), (4, 465), (1, 578), (0, 584)])

## Build Jaccard Profiles 

In [325]:
%%time

def build_profiles(profile_length, classes, words_list_count, total_docs_count):
    classes = classes
    profiles = {}
    for p_class in classes: 
        k_profile = {}
        for word in words_list_count: 
            if p_class in words_list_count[word]:
                a = words_list_count[word][p_class]
                word_classes = words_list_count[word]
                del word_classes[p_class]
                b = 0
                for item in word_classes:
                    b += word_classes[item]
                c = total_docs_count[p_class] - a
                word_jac = a/(a+b+c)
                if word not in k_profile:
                    k_profile[word] = word_jac
                else:
                    print('This word already exists in the profile')
        k_profile = sorted(k_profile.items(), key=lambda item: item[1], reverse=True)
        k_profile = k_profile[:profile_length]
        profiles[p_class] = dict(k_profile)
    return profiles
    

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 10 µs


In [326]:
%%time
profiles = build_profiles(profile_length=100, classes=[0, 1, 2, 3, 4], words_list_count=words_list_count, total_docs_count=total_docs_count)

CPU times: user 125 ms, sys: 4.75 ms, total: 130 ms
Wall time: 132 ms


In [327]:
profiles.keys()

dict_keys([0, 1, 2, 3, 4])

In [340]:
profiles[0]

{'ab': 0.15694282380396732,
 'ac': 0.15973630831643001,
 'age': 0.17193308550185873,
 'ail': 0.15836653386454183,
 'al': 0.1693548387096774,
 'ap': 0.20590207914151576,
 'ar': 0.1858108108108108,
 'ati': 0.1654533421575116,
 'av': 0.1620879120879121,
 'ca': 0.1775793650793651,
 'ce': 0.17344439000489956,
 'ch': 0.16516516516516516,
 'cs': 0.19047619047619047,
 'ct': 0.17546948356807512,
 'ea': 0.15928038613426942,
 'ec': 0.1812926957435628,
 'ed': 0.1848381601362862,
 'ee': 0.15364982133741706,
 'el': 0.1643835616438356,
 'en': 0.17343946376204442,
 'er': 0.20030876109610188,
 'ere': 0.19207484865162355,
 'es': 0.2028380634390651,
 'et': 0.16594307766521948,
 'fi': 0.1761786600496278,
 'file': 0.20249221183800623,
 'fo': 0.2048015678588927,
 'form': 0.17420814479638008,
 'format': 0.1590594744121715,
 'ge': 0.17162966973470492,
 'gr': 0.21428571428571427,
 'gram': 0.1830238726790451,
 'graph': 0.1864406779661017,
 'graphi': 0.18048780487804877,
 'graphic': 0.18107667210440456,
 'graphi

# Test Profiles

## Vectorize each document

In [329]:
test_docs_targets = dict(zip(test.data, test.target))


In [330]:
test.data[0]

"\nIt's not quite what you were asking, but a few years ago I helped some EE\nremote sensing people run some experiments on the microwave emmissivity of\nice; they used the sky for a background calibration source.  They said that\nfrom Earth's surface the sky looks like a 60K blackbody.\n"

In [331]:
tfidf = TfidfVectorizer(stop_words = 'english')
tfidf.fit(test.data)
doc_tfidf = tfidf.transform(test.data)
tfidf_list = dict(zip(tfidf.get_feature_names(), np.ravel(doc_tfidf.sum(axis=0))))



    
    

## Predict class for each doc

In [332]:
%%time

test_prdiction = []
for doc in test.data:
    w_doc = [0,0]
    for profile, jaccard_data in profiles.items(): 
        p_w = 0
        for word, tfidff_score in tfidf_list.items():
            if word in doc:
                if word in jaccard_data:
                    p_w += tfidff_score * jaccard_data[word]
        if p_w > w_doc[1]:
            w_doc[1] = p_w
            w_doc[0] = profile
    test_prdiction.append(w_doc)

CPU times: user 3min 29s, sys: 450 ms, total: 3min 30s
Wall time: 3min 30s


In [333]:
test_prdiction

[[4, 42.002530770956696],
 [4, 65.749771843825911],
 [4, 25.032166612923724],
 [4, 67.25355434501148],
 [4, 44.02419818713512],
 [4, 58.075024391474308],
 [4, 49.847562329447015],
 [4, 66.652333059716383],
 [4, 47.602364340565487],
 [4, 17.930370646852957],
 [4, 55.830669455022196],
 [4, 32.684041571347777],
 [4, 11.785727556159426],
 [4, 46.69899068451415],
 [4, 0.39163383505680455],
 [4, 35.620216783351594],
 [4, 49.599221443858077],
 [4, 63.548097653286128],
 [4, 64.112211942131651],
 [4, 23.415458343707218],
 [4, 55.194750578086236],
 [4, 57.446277515448074],
 [4, 64.629084874707701],
 [4, 64.022351065548079],
 [4, 59.500624510370919],
 [4, 61.172946164789195],
 [4, 27.610080757927211],
 [4, 39.596614114802328],
 [4, 59.946096510152685],
 [4, 34.619800441918528],
 [4, 28.487771412221168],
 [4, 63.84279665143746],
 [4, 45.422531338087893],
 [4, 68.06053753490265],
 [4, 67.68267237367759],
 [4, 63.968688629846902],
 [4, 17.083961488729081],
 [4, 60.360538930808282],
 [4, 53.745576935