In [3]:
from sklearn.datasets import fetch_20newsgroups
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from nltk.corpus import names
from nltk.stem import PorterStemmer
import numpy as np
import nltk
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from numpy import asarray
from numpy import savetxt

nltk.download('names')
ps = PorterStemmer()
all_names = names.words()
WNL = WordNetLemmatizer()

path_train= "/Users/antonis/Downloads/20news-bydate/20news-bydate-train"
path_test= "/Users/antonis/Downloads/20news-bydate/20news-bydate-test"


#gathering all the articles from 20news-bydate-train in a single dataframe
def gather(path): 
    df = pd.DataFrame() #this will be the final dataframe
    for file in os.listdir(path):
        tag = file
        for doc in os.listdir(path+'/'+file):
            docpath = path+'/'+file+'/'+doc
            f = open(docpath, "r",encoding='cp1252')
            content = f.read()
            temp = pd.DataFrame( #creating a temporary dataframe to store the current content and tag
                {
                    'content':content,
                    'tag':tag
                },index=[0]
            )
            df = pd.concat([df, temp]) #merge the temp dataframe with the final one



    df.content =df.content.replace(to_replace='From:(.*\n)', value='', regex=True) ##remove from to email
    df.content =df.content.replace(to_replace='lines:(.*\n)', value='', regex=True)
    df.content =df.content.replace(to_replace='Subject:(.*\n)', value='', regex=True)#remove subject
    df.content =df.content.replace(to_replace='[!"#$%&\'()*+,/:;<=>?@[\\]^_`{|}~]', value=' ', regex=True) #remove punctuation
    df.content =df.content.replace(to_replace='-', value=' ', regex=True)
    df.content =df.content.replace(to_replace='\s+', value=' ', regex=True)    #remove new line
    df.content =df.content.replace(to_replace='  ', value='', regex=True)                #remove double white space
    df.content =df.content.apply(lambda x:x.strip())  # ltrim and ltrim of whitespace

    df['content']=[entry.lower() for entry in df['content']] #to lowercase
    return df

df_news_train = gather(path_train)



def clean(data):
    cleaned = defaultdict(list)
    count = 0
    for group in data:
        for words in group.split():
            if words.isalpha():
                words = ps.stem(words) #creating stems
                cleaned[count].append(words.lower())#to lowercase
        cleaned[count] = ' '.join(cleaned[count])
        count +=1
    print(cleaned)
    return(list(cleaned.values()))

x_train = clean(df_news_train['content'])


tf = TfidfVectorizer(stop_words='english', max_features=8000,use_idf=True) #Initializing the vectorizer.Setting 8000 features
tfidf = tf.fit_transform(x_train)#fit_transform learns vocabulary and idf, returns document-term matrix.

vocab = tf.vocabulary_



[nltk_data] Downloading package names to /Users/antonis/nltk_data...
[nltk_data]   Package names is already up-to-date!
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [4]:
from sklearn.feature_extraction.text import CountVectorizer

countvectorizer = CountVectorizer(analyzer= 'word', stop_words='english',vocabulary=vocab)

count_wm = countvectorizer.fit_transform(x_train)

#using Tfidftransformer and count vectorizer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(count_wm)


# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=countvectorizer.get_feature_names(),columns=["idf_weights"]) 
 
# sort ascending 
df_idf.sort_values(by=['idf_weights'])

# count matrix 
count_vector=countvectorizer.transform(x_train) 
 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

feature_names = countvectorizer.get_feature_names() 
 
#get tfidf vector for first document 
first_document_vector=tf_idf_vector[10342] 
 
#print the scores 
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df.sort_values(by=['tfidf'],ascending=False)

#using Tfidfvectorizer

# get the first vector out (for the first document) 
first_vector_tfidfvectorizer=tfidf[0] 

# place tf-idf values in a pandas data frame
df_tfidf = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tf.get_feature_names(), columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)

def get_key(val):
    for key, value in tf.vocabulary_.items():
         if val == value:
             return key
 
    return "key doesn't exist"

In [5]:
#PART 2 ... TEST

#perform the same process for the test data
def clean_test(data):
    cleaned = defaultdict(list)
    count = 0
    for words in data.split():
        if words.isalpha() and words not in all_names:
            words = ps.stem(words)
            cleaned[count].append(WNL.lemmatize(words.lower()))
    cleaned[count] = ' '.join(cleaned[count])
    count +=1
    return(list(cleaned.values()))

df_news_test = gather(path_test)

df_news_test['content'] = df_news_test['content'].apply(clean_test)

# x_test = clean(df_news_test['content'])


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn import metrics
from sklearn.metrics import jaccard_score
import numpy
import sys



def similarity(document,tfidf,metric):
    testvector = tf.transform(document)#create vector for each document that needs to be classified
    if metric=='cosine':
        distances = cosine_similarity(testvector, tfidf)
    elif metric=='euclidean':
        distances = euclidean_distances(testvector, tfidf)
    else:
        sys.exit()
    prediction = np.argmax(distances, 1)#returns the index of the biggest value
    predicted = df_news_train['tag'].iloc[prediction]#locate the tag in the data
    return predicted

#df=df_news_test
def classify(df,metric):
    classified = pd.DataFrame()
    doc_count = len(df.index)
    correct = 0
    for row in df.itertuples():
        predicted = similarity(row[1],tfidf,metric)
        temp = pd.DataFrame(
        {
            'document':row[1],
            'class':predicted
        },index=[0])
        classified = pd.concat([classified, temp])
        actual = row[2]
        if actual == predicted[0]:
            correct=correct + 1
    accuracy = correct/doc_count
    print(correct)
    percentage = "{:.0%}".format(accuracy)
    print(percentage)
    return classified

classified = classify(df_news_test,'cosine')


[9395]
[510]
[150]
[201]
[476]
[7918]
[445]
[9395]
[236]
[428]
[356]
[535]
[62]
[8915]
[2696]
[11076]
[2082]
[713]
[11076]
[478]
[248]
[11076]
[535]
[374]
[287]
[420]
[408]
[93]
[248]
[163]
[60]
[4520]
[460]
[135]
[5473]
[250]
[251]
[537]
[173]
[11076]
[248]
[7065]
[163]
[447]
[236]
[2082]
[4555]
[213]
[2004]
[11076]
[236]
[212]
[1088]
[213]
[11076]
[365]
[517]
[11076]
[5414]
[11076]
[713]
[63]
[198]
[180]
[4924]
[199]
[58]
[529]
[230]
[510]
[2010]
[713]
[11224]
[4314]
[156]
[11076]
[325]
[7318]
[307]
[212]
[236]
[204]
[472]
[135]
[8809]
[517]
[268]
[199]
[269]
[7433]
[428]
[85]
[530]
[185]
[51]
[5517]
[2227]
[114]
[442]
[5773]
[253]
[9274]
[729]
[132]
[7046]
[66]
[10047]
[7046]
[70]
[10972]
[10054]
[537]
[3891]
[374]
[507]
[517]
[66]
[445]
[66]
[459]
[51]
[63]
[70]
[238]
[369]
[10249]
[128]
[502]
[231]
[2001]
[713]
[380]
[11076]
[4314]
[250]
[75]
[751]
[212]
[5571]
[60]
[236]
[11076]
[269]
[367]
[478]
[517]
[212]
[374]
[66]
[472]
[77]
[451]
[253]
[11076]
[128]
[496]
[10144]
[461]
[912

[2158]
[1954]
[11005]
[2092]
[1856]
[11058]
[2182]
[1885]
[2135]
[1960]
[1820]
[10075]
[2210]
[5143]
[1750]
[555]
[1820]
[9953]
[2082]
[1911]
[2193]
[11017]
[11080]
[9953]
[1971]
[1801]
[2058]
[1842]
[2163]
[7322]
[1842]
[9953]
[10740]
[2001]
[1750]
[6471]
[11139]
[2091]
[339]
[10182]
[1942]
[9935]
[11187]
[11080]
[1774]
[2119]
[4171]
[2001]
[11181]
[1814]
[1990]
[1757]
[2158]
[9932]
[1850]
[2091]
[11187]
[1920]
[10740]
[1904]
[1745]
[2001]
[2029]
[2010]
[1853]
[9963]
[2392]
[2076]
[1809]
[2021]
[2195]
[9891]
[11081]
[7410]
[1856]
[11211]
[4726]
[1887]
[2001]
[2186]
[1875]
[1801]
[4193]
[5255]
[2001]
[1745]
[2033]
[11021]
[1711]
[1782]
[2135]
[2095]
[2090]
[11202]
[2102]
[2074]
[2084]
[2076]
[2169]
[11164]
[10188]
[1942]
[11049]
[1842]
[2195]
[2133]
[460]
[1860]
[9949]
[7464]
[3315]
[2125]
[2189]
[1747]
[2001]
[2095]
[1942]
[2172]
[9963]
[2076]
[1736]
[2011]
[2001]
[2183]
[1960]
[2010]
[6664]
[2030]
[4279]
[9371]
[2172]
[9935]
[9891]
[11111]
[2001]
[5502]
[2092]
[1803]
[9481]
[1739]
[2

[3561]
[4000]
[3813]
[8548]
[3587]
[3984]
[4000]
[3437]
[3652]
[3915]
[3557]
[2699]
[3766]
[3408]
[411]
[3685]
[3587]
[3910]
[3502]
[4000]
[10370]
[3582]
[3968]
[8075]
[3616]
[3690]
[3522]
[3796]
[3676]
[3882]
[3579]
[8606]
[3885]
[6553]
[3467]
[3922]
[3533]
[4001]
[3802]
[3907]
[10486]
[3447]
[3838]
[3536]
[3910]
[2411]
[3702]
[4924]
[6287]
[3881]
[3882]
[3863]
[3882]
[3634]
[3553]
[3907]
[3438]
[3408]
[3678]
[3661]
[5530]
[3589]
[3984]
[3589]
[3844]
[3582]
[3629]
[2508]
[3764]
[7160]
[3622]
[3582]
[3796]
[3557]
[3663]
[3968]
[6689]
[3408]
[1597]
[3762]
[3573]
[3453]
[3932]
[3978]
[2600]
[3549]
[3582]
[3447]
[4917]
[3481]
[3430]
[3934]
[3553]
[3811]
[3531]
[3439]
[3923]
[3420]
[3418]
[3453]
[3450]
[3905]
[3984]
[2702]
[3844]
[3730]
[3880]
[3516]
[3769]
[3849]
[3435]
[3516]
[3663]
[3815]
[3731]
[3849]
[3521]
[3844]
[2550]
[3421]
[3582]
[3787]
[3838]
[3488]
[3584]
[3575]
[3860]
[3582]
[3676]
[3897]
[3772]
[3984]
[3552]
[3537]
[7903]
[3563]
[3755]
[6212]
[7898]
[3531]
[3454]
[3690]
[67]


[9106]
[9114]
[9121]
[5547]
[5584]
[1050]
[5492]
[5267]
[1791]
[8866]
[8750]
[5211]
[9121]
[6887]
[5645]
[5552]
[5194]
[2358]
[10581]
[5542]
[5444]
[5547]
[4856]
[5526]
[2001]
[5556]
[5622]
[5290]
[11285]
[8618]
[5356]
[10409]
[10985]
[5217]
[8730]
[5383]
[9114]
[3447]
[5578]
[5286]
[3448]
[5382]
[9114]
[8707]
[8284]
[5641]
[10439]
[5295]
[9020]
[5276]
[5276]
[8844]
[5286]
[5650]
[5344]
[4667]
[5295]
[5243]
[10967]
[9114]
[3675]
[9149]
[5449]
[8920]
[2358]
[8692]
[4475]
[5356]
[5286]
[5452]
[5228]
[5505]
[524]
[5290]
[5356]
[806]
[9114]
[8750]
[8707]
[2358]
[9114]
[10990]
[5295]
[5383]
[8957]
[5650]
[8863]
[5578]
[5289]
[11080]
[5286]
[5356]
[9106]
[5650]
[5295]
[11138]
[11243]
[5314]
[5547]
[4583]
[5295]
[8618]
[5248]
[4306]
[9114]
[5295]
[5324]
[5362]
[8707]
[5324]
[1590]
[5422]
[5655]
[8707]
[8707]
[9106]
[5622]
[5342]
[5500]
[5364]
[5650]
[4314]
[5596]
[8707]
[5559]
[5356]
[5584]
[8707]
[5573]
[5526]
[5329]
[4654]
[5362]
[3369]
[1001]
[8528]
[5650]
[5276]
[5317]
[5235]
[5586]
[5428

[6943]
[7338]
[6959]
[6296]
[6605]
[7552]
[7179]
[7330]
[7103]
[7332]
[6630]
[7418]
[7159]
[6317]
[7317]
[6521]
[6521]
[6922]
[1347]
[3863]
[6854]
[7046]
[8024]
[8264]
[1196]
[7239]
[10610]
[6324]
[7146]
[6897]
[7597]
[7337]
[6675]
[7239]
[6958]
[819]
[4292]
[1196]
[1385]
[7153]
[7133]
[1576]
[7084]
[9306]
[9256]
[4725]
[6573]
[1589]
[6993]
[6428]
[6296]
[11202]
[6897]
[6618]
[7389]
[7362]
[7039]
[7412]
[9058]
[9313]
[7199]
[7235]
[8264]
[2942]
[7015]
[7367]
[6521]
[3056]
[3388]
[7257]
[1711]
[7317]
[6675]
[7689]
[1628]
[7176]
[105]
[6959]
[6939]
[3175]
[9295]
[2780]
[7248]
[3313]
[7804]
[6892]
[7648]
[6959]
[11049]
[10180]
[7431]
[8264]
[6330]
[7003]
[8381]
[6593]
[6324]
[7431]
[7910]
[6938]
[6289]
[6557]
[6984]
[7379]
[6958]
[6509]
[9792]
[3261]
[10453]
[6959]
[7430]
[6896]
[4520]
[6915]
[6897]
[7143]
[7191]
[7053]
[6897]
[993]
[7412]
[7337]
[1690]
[7274]
[7330]
[7074]
[6444]
[7358]
[7396]
[3293]
[9381]
[6296]
[7097]
[7035]
[993]
[6868]
[247]
[9765]
[1511]
[7187]
[7337]
[7302]
[6934]

[8934]
[8993]
[266]
[370]
[8860]
[9050]
[8649]
[9086]
[8772]
[11180]
[8881]
[370]
[9077]
[4845]
[8920]
[1739]
[4314]
[8648]
[9050]
[11169]
[11056]
[5428]
[8840]
[8645]
[8812]
[67]
[8807]
[9063]
[8802]
[8845]
[8920]
[9042]
[8639]
[11174]
[9092]
[8748]
[9106]
[8855]
[8647]
[9804]
[5493]
[8812]
[8883]
[8934]
[9100]
[8623]
[8750]
[9880]
[8493]
[8771]
[6231]
[9114]
[8963]
[9043]
[9114]
[2257]
[3759]
[8958]
[8772]
[5413]
[8707]
[8944]
[7825]
[8635]
[8954]
[8798]
[9117]
[9089]
[8180]
[8814]
[8378]
[4314]
[8707]
[8920]
[8617]
[2303]
[8870]
[11110]
[8998]
[8968]
[8870]
[11169]
[4924]
[8618]
[8633]
[5308]
[8706]
[8867]
[9006]
[9129]
[8676]
[9106]
[8934]
[8627]
[8927]
[8858]
[5571]
[8985]
[11243]
[8633]
[8831]
[8869]
[8707]
[9011]
[8725]
[9059]
[9100]
[8750]
[5290]
[8985]
[5333]
[8692]
[8956]
[8961]
[9031]
[11180]
[10554]
[4067]
[9149]
[3387]
[9149]
[8842]
[8869]
[9029]
[8859]
[9114]
[8645]
[8829]
[9144]
[8870]
[8627]
[11097]
[8860]
[8920]
[10687]
[8885]
[1791]
[8733]
[564]
[9098]
[7666]
[8629]
[

In [41]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import pairwise_distances
from sklearn import metrics
from sklearn.metrics import jaccard_score
import sklearn

import numpy



def euclidean_similarity_test(document,tfidf):
    testvector = tf.transform(document)
    distances = sklearn.metrics.pairwise.polynomial_kernel(testvector, tfidf)
    prediction = np.argmax(distances, 1)
    predicted = df_news_train['tag'].iloc[prediction]
    return predicted

#df=df_news_test
def classify(df):
    classified = pd.DataFrame()
    doc_count = len(df.index)
    correct = 0
    for row in df.itertuples():
        predicted = euclidean_similarity_test(row[1],tfidf)
        temp = pd.DataFrame(
        {
            'document':row[1],
            'class':predicted
        },index=[0])
        classified = pd.concat([classified, temp])
        actual = row[2]
        if actual == predicted[0]:
            correct=correct + 1
    accuracy = correct/doc_count
    percentage = "{:.0%}".format(accuracy)
    print(percentage)
    return classified

# classified_docs = classify(df_news_test)   
    
# def jaccard_similarity_test(document,tfidf):

testvector = tf.transform(df_news_test['content'].iloc[560])

distance = sklearn.metrics.pairwise.rb_kernel(testvector,tfidf)
print(distance)
prediction = np.argmax(distance, 1)
print(prediction)
            

AttributeError: module 'sklearn.metrics.pairwise' has no attribute 'rb_kernel'

In [None]:
import dicttoxml

In [9]:
print(tfidf.shape)

(11314, 8000)
