# Imports

In [37]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline


# Split dataset to train and test

In [38]:
categories = ['sci.space', 'comp.graphics', 'talk.politics.misc', 'rec.sport.hockey', 'comp.sys.mac.hardware'] 
remove = ('headers', 'footers', 'quotes')
# twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories = categories, remove = remove )
# twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories = categories, remove = remove )
train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories = categories, remove = remove )
test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories = categories, remove = remove )



# Train Profiles

## Create single train documents object with docs and targets

In [39]:
documents = tuple(zip(train.data, train.target))

In [40]:
# for doc in documents:
#     print(doc[1])

## Vectorise train data to: split docs by words, remove stop words.

In [41]:
vect = CountVectorizer(stop_words = 'english')
vect.fit(train.data)
# train_data = vect.transform(train.data)
# test_data = vect.transform(test.data)
# vectores.append((train_data, test_data))

CountVectorizer(stop_words='english')

## Create general words list

In [42]:
words_list = vect.get_feature_names()
print(words_list[:5000])


['00', '000', '0000', '00000', '000000', '000005102000', '000062david42', '000100255pixel', '00041032', '0004136', '0004246', '0004422', '00044513', '0004847546', '0005', '0007', '00090711', '000k', '000usd', '0012', '001200201pixel', '0018', '00196', '0020', '0022', '0028', '0029', '0033', '0034', '0038', '0049', '005', '006', '0065', '0078', '0094', '0098', '00_', '00index', '00pm', '01', '0100', '01075', '011', '013', '013846', '013939', '014', '01752', '0179', '01801', '01821', '01826', '0184', '01852', '01854', '01890', '018b', '0199', '01a', '02', '020', '0200', '020359', '020637', '021', '02115', '02138', '02139', '02154', '02178', '0223', '0235', '023b', '0245', '0283', '03', '030', '0300', '0300ff', '03051', '0330', '034', '034101', '0358', '036', '037', '038', '04', '040', '0400', '040286', '041', '0410', '04110', '041493003715', '042', '0430', '0434', '0435', '045', '045651', '0458', '0483', '0486', '0488', '04g', '05', '050', '0500', '051', '0511', '053', '05402', '05446', 

## For each word in train set calculete number of documents

In [43]:
%%time

words_list_count = {}
for doc in documents: 
    for word in words_list:
        if word in doc[0]:
            if word in words_list_count:
                if doc[1] in words_list_count[word]:
                    words_list_count[word][doc[1]] += 1
                else:
                    words_list_count[word][doc[1]] = 1
            else:
                words_list_count[word] = {}
                words_list_count[word][doc[1]] = 1
                
            

Wall time: 2min 36s


In [44]:
# example of calulation result
words_list_count['ability']

{3: 33, 2: 15, 4: 19, 1: 18, 0: 15}

## Calculate total number of docs for each class

In [45]:
total_docs_count = {}
for doc in documents:
    if doc[1] in total_docs_count:
        total_docs_count[doc[1]] += 1
    else:
        total_docs_count[doc[1]] = 1

In [46]:
total_docs_count.items()

dict_items([(3, 593), (2, 600), (4, 465), (1, 578), (0, 584)])

## Build Jaccard Profiles 

In [47]:
%%time

def build_profiles(profile_length, classes, words_list_count, total_docs_count):
    classes = classes
    profiles = {}
    for p_class in classes: 
        k_profile = {}
        for word in words_list_count: 
            if p_class in words_list_count[word]:
                a = words_list_count[word][p_class]
                word_classes = words_list_count[word]
                del word_classes[p_class]
                b = 0
                for item in word_classes:
                    b += word_classes[item]
                c = total_docs_count[p_class] - a
                word_jac = a/(a+b+c)
                if word == 'ability':
                    print (word, '-', word_jac, '-', a,'-',b,'-',c, '--',p_class)
                if word not in k_profile:
                    k_profile[word] = word_jac
                else:
                    print('This word already exists in the profile')
        k_profile = sorted(k_profile.items(), key=lambda item: item[1], reverse=True)
        k_profile = k_profile[:profile_length]
        profiles[p_class] = dict(k_profile)
    return profiles
    

Wall time: 0 ns


In [48]:
%%time
profiles = build_profiles(profile_length=100, classes=[0, 1, 2, 3, 4], words_list_count=words_list_count, total_docs_count=total_docs_count)

ability - 0.02242152466367713 - 15 - 85 - 569 -- 0
ability - 0.027906976744186046 - 18 - 67 - 560 -- 1
ability - 0.023006134969325152 - 15 - 52 - 585 -- 2
ability - 0.05392156862745098 - 33 - 19 - 560 -- 3
ability - 0.04086021505376344 - 19 - 0 - 446 -- 4
Wall time: 111 ms


In [49]:
profiles.keys()

dict_keys([0, 1, 2, 3, 4])

In [50]:
profiles[1]

{'er': 0.2506047411707789,
 'th': 0.2502351834430856,
 'ne': 0.24740010946907498,
 'ha': 0.24370594159113795,
 'le': 0.2431854623196152,
 'ac': 0.2427307206068268,
 've': 0.2368839427662957,
 'wit': 0.23657587548638132,
 'wi': 0.23452768729641693,
 'es': 0.23259412068076327,
 'ith': 0.2315160567587752,
 'nd': 0.23045054375971,
 'te': 0.2299687825182102,
 'ou': 0.22953289804118535,
 'se': 0.22905620360551432,
 'ri': 0.2289639381797367,
 'fo': 0.22812311406155703,
 'pp': 0.22746419545071608,
 'ar': 0.227012987012987,
 'ch': 0.22660098522167488,
 'ca': 0.2263575350823673,
 'om': 0.22575057736720555,
 'ard': 0.22431668237511782,
 'ed': 0.22425509670674335,
 'nt': 0.2236126224156692,
 'al': 0.2233211868818324,
 'rd': 0.2222222222222222,
 'hi': 0.2217438105489774,
 'ma': 0.2216117216117216,
 'em': 0.22103148024112526,
 'pr': 0.22083879423328964,
 'ng': 0.22068612391193038,
 'st': 0.2206405693950178,
 'ot': 0.2201514269073966,
 'ing': 0.22006302521008403,
 'hat': 0.2188235294117647,
 'ut': 0.

# Test Profiles

## Vectorize each document

In [51]:
test_docs_targets = dict(zip(test.data, test.target))


In [52]:
test.data[0]

"\nIt's not quite what you were asking, but a few years ago I helped some EE\nremote sensing people run some experiments on the microwave emmissivity of\nice; they used the sky for a background calibration source.  They said that\nfrom Earth's surface the sky looks like a 60K blackbody.\n"

In [53]:
tfidf = TfidfVectorizer(stop_words = 'english')
tfidf.fit(test.data)
doc_tfidf = tfidf.transform(test.data)
tfidf_list = dict(zip(tfidf.get_feature_names(), np.ravel(doc_tfidf.sum(axis=0))))



    
    

## Predict class for each doc

%%time

test_prdiction = []
for doc in test.data:
    w_doc = [0,0]
    for profile, jaccard_data in profiles.items(): 
        p_w = 0
        for word, tfidff_score in tfidf_list.items():
            if word in doc:
                if word in jaccard_data:
                    p_w += tfidff_score * jaccard_data[word]
        if p_w > w_doc[1]:
            w_doc[1] = p_w
            w_doc[0] = profile
    test_prdiction.append(w_doc)

test_prdiction