In [31]:
from sklearn.datasets import fetch_20newsgroups
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from nltk.corpus import names
from nltk.stem import PorterStemmer
import numpy as np
import nltk
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from numpy import asarray
from numpy import savetxt

nltk.download('names')
ps = PorterStemmer()
all_names = names.words()
WNL = WordNetLemmatizer()

path_train= "/home/antonis/Downloads/20news-bydate/20news-bydate-train"
path_test= "/home/antonis/Downloads/20news-bydate/20news-bydate-test"



def gather(path):
    df = pd.DataFrame()
    for file in os.listdir(path):
        tag = file
        for doc in os.listdir(path+'/'+file):
            docpath = path+'/'+file+'/'+doc
            f = open(docpath, "r",encoding='cp1252')
            content = f.read()
            temp = pd.DataFrame(
                {
                    'content':content,
                    'tag':tag
                },index=[0]
            )
            df = pd.concat([df, temp])



    df.content =df.content.replace(to_replace='From:(.*\n)', value='', regex=True) ##remove from to email
    df.content =df.content.replace(to_replace='lines:(.*\n)', value='', regex=True)
    df.content =df.content.replace(to_replace='Subject:(.*\n)', value='', regex=True)
    df.content =df.content.replace(to_replace='[!"#$%&\'()*+,/:;<=>?@[\\]^_`{|}~]', value=' ', regex=True) #remove punctuation except
    df.content =df.content.replace(to_replace='-', value=' ', regex=True)
    df.content =df.content.replace(to_replace='\s+', value=' ', regex=True)    #remove new line
    df.content =df.content.replace(to_replace='  ', value='', regex=True)                #remove double white space
    df.content =df.content.apply(lambda x:x.strip())  # Ltrim and Rtrim of whitespace

    df['content']=[entry.lower() for entry in df['content']]
    return df

df_news_train = gather(path_train)



def clean(data):
    cleaned = defaultdict(list)
    count = 0
    for group in data:
        for words in group.split():
            if words.isalpha() and words not in all_names:
                words = ps.stem(words)
                cleaned[count].append(WNL.lemmatize(words.lower()))
        cleaned[count] = ' '.join(cleaned[count])
        count +=1
    return(list(cleaned.values()))

x_train = clean(df_news_train['content'])


tf = TfidfVectorizer(stop_words='english', max_features=8000,use_idf=True)
tfidf = tf.fit_transform(x_train)

vocab = tf.vocabulary_



[nltk_data] Downloading package names to /home/antonis/nltk_data...
[nltk_data]   Package names is already up-to-date!


In [32]:
from sklearn.feature_extraction.text import CountVectorizer

countvectorizer = CountVectorizer(analyzer= 'word', stop_words='english',vocabulary=vocab)

count_wm = countvectorizer.fit_transform(x_train)

#using Tfidftransformer and count vectorizer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(count_wm)


# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=countvectorizer.get_feature_names(),columns=["idf_weights"]) 
 
# sort ascending 
df_idf.sort_values(by=['idf_weights'])

# count matrix 
count_vector=countvectorizer.transform(x_train) 
 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

feature_names = countvectorizer.get_feature_names() 
 
#get tfidf vector for first document 
first_document_vector=tf_idf_vector[10342] 
 
#print the scores 
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df.sort_values(by=['tfidf'],ascending=False)

#using Tfidfvectorizer

# get the first vector out (for the first document) 
first_vector_tfidfvectorizer=tfidf[0] 

# place tf-idf values in a pandas data frame
df_tfidf = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tf.get_feature_names(), columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)

def get_key(val):
    for key, value in tf.vocabulary_.items():
         if val == value:
             return key
 
    return "key doesn't exist"

In [33]:
#PART 2 ... TEST


def clean_test(data):
    cleaned = defaultdict(list)
    count = 0
    for words in data.split():
        if words.isalpha() and words not in all_names:
            words = ps.stem(words)
            cleaned[count].append(WNL.lemmatize(words.lower()))
    cleaned[count] = ' '.join(cleaned[count])
    count +=1
    return(list(cleaned.values()))

df_news_test = gather(path_test)

df_news_test['content'] = df_news_test['content'].apply(clean_test)

# x_test = clean(df_news_test['content'])



In [50]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import metrics



def cosine_similarity_test(document,tfidf):
    testvector = tf.transform(document)
    distances = cosine_similarity(testvector, tfidf)
    prediction = np.argmax(distances, 1)
    predicted = df_news_train['tag'].iloc[prediction]
    return predicted

#df=df_news_test
def classify(df):
    classified = pd.DataFrame()
    doc_count = len(df.index)
    correct = 0
    for row in df.itertuples():
        predicted = cosine_similarity_test(row[1],tfidf)
        temp = pd.DataFrame(
        {
            'document':row[1],
            'class':predicted
        },index=[0])
        df = pd.concat([df, temp])
        actual = row[2]
        if actual == predicted[0]:
            correct=correct + 1
    accuracy = correct/doc_count
    percentage = "{:.0%}".format(accuracy)
    print(percentage)
    return classified

classified_docs = classify(df_news_test)   
    


            
    


64%


In [43]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import metrics

def classify(document,tfidf):
    testvector = tf.transform(document)
    distances = cosine_similarity(testvector, tfidf)
    prediction = np.argmax(distances, 1)
    predicted = df_news_train['tag'].iloc[prediction]
    return predicted

correct = 0
for row in df_news_test.itertuples():
    predicted = classify(row[1],tfidf)
    actual = row[2]
    if actual == predicted:
        correct = correct + 1

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [39]:
from sklearn.metrics.pairwise import cosine_similarity
for row in df_news_test.itertuples():
    testvector = tf.transform(row[1])
    distances = cosine_similarity(testvector, tfidf)

KeyboardInterrupt: 