In [95]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from sklearn import naive_bayes
from sklearn.metrics import confusion_matrix , classification_report
import nltk
from nltk.corpus import stopwords
from collections import Counter
import operator

In [24]:
# Root folder of the 20 Newsgroups dump. The trailing '/' matters: class folder
# names are appended to this string directly further down.
base_dir = "/home/ash/MLCN/TextClassificationData/20_newsgroups/"

# Every entry found directly under base_dir is treated as one class.
# NOTE(review): os.listdir returns entries in arbitrary order, so the position
# of a class in this list is not guaranteed to be the same on another machine.
classes = os.listdir(base_dir)

# Despite the "Total Documents" label, the number printed is len(classes),
# i.e. how many entries (classes) were found under base_dir.
print("Classes Name: ", classes, end="\n\n")
print("Total Documents: ", len(classes))

Classes Name:  ['rec.sport.hockey', 'sci.space', 'soc.religion.christian', 'talk.politics.mideast', 'sci.electronics', 'comp.graphics', 'rec.autos', 'rec.sport.baseball', 'sci.med', 'sci.crypt', 'talk.religion.misc', 'misc.forsale', 'comp.sys.mac.hardware', 'talk.politics.misc', 'talk.politics.guns', 'comp.sys.ibm.pc.hardware', 'rec.motorcycles', 'comp.os.ms-windows.misc', 'comp.windows.x', 'alt.atheism']

Total Documents:  20


# Calculating the Tf-Idf scores over the selected classes:

In [36]:
def preprocess_files(file):
    """Read an open text file and normalise it to lowercase, letters-only text.

    Parameters
    ----------
    file : file-like object
        Any object whose ``read()`` returns ``str``. It is read to the end;
        closing it stays the caller's responsibility.

    Returns
    -------
    str
        The lowercased contents in which every character other than an ASCII
        letter (digits and punctuation included) has been replaced by a space
        and every run of whitespace collapsed to a single space. Leading and
        trailing spaces are not stripped.
    """
    text = file.read().lower()

    # Keep letters only; digits, punctuation and newlines all become spaces.
    text = re.sub(r'[^A-Za-z]', ' ', text)

    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    text = re.sub(r'\s+', ' ', text)

    return text

In [21]:
# Fetch only the stop-word corpus, the one NLTK resource used by the cells
# shown here (add further resource names if later cells need them). A bare
# nltk.download() opens the interactive downloader and waits for the user,
# which stalls an unattended "Restart & Run All".
nltk.download("stopwords")

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [125]:
# Experiment-size knobs. The suggestion for a first run is articles = 50,
# classes = 2, features = 75; the values currently set below are smaller still.
max_articles_of_each_class = 5  # articles read from each class folder
no_of_classes = 2  # how many entries of `classes` are processed
no_of_features = 10  # how many top-scoring words are kept as features

In [126]:

# Corpus-wide statistics gathered over the selected classes:
#   tf  : word -> total number of occurrences across all articles read
#   idf : word -> number of articles that contain the word (a plain document
#         count; no inversion or logarithm is applied at this stage)
tf = Counter()
idf = Counter()
articles_added = {}      # class name -> file names of the articles read
selected_classes = []    # class names, in the order they were processed

# Slicing (rather than indexing through range()) keeps the loops from raising
# IndexError when fewer classes or articles exist than were requested.
for current_class in classes[:no_of_classes]:
    selected_classes.append(current_class)
    articles_added[current_class] = []

    class_dir = os.path.join(base_dir, current_class)
    all_articles = os.listdir(class_dir)

    for article_name in all_articles[:max_articles_of_each_class]:
        current_file = os.path.join(class_dir, article_name)
        articles_added[current_class].append(article_name)

        # The context manager closes the file even if preprocessing raises.
        with open(current_file, encoding="ISO-8859-1") as article_file:
            text = preprocess_files(article_file)
        text_words = text.split()

        # Every occurrence counts towards the term frequency ...
        tf.update(text_words)
        # ... but each article contributes at most 1 per word to idf.
        idf.update(set(text_words))

In [127]:
# Score every word as (total occurrences) / (number of articles containing it).
# NOTE(review): this is the average count per containing article, not the
# textbook tf * log(N / df) weighting, even though the variable is named tf_idf.
tf_idf = {word: total / idf[word] for word, total in tf.items()}

print(type(tf_idf))
print(tf_idf.keys())

<class 'dict'>
dict_keys(['newsgroups', 'rec', 'sport', 'hockey', 'path', 'cantaloupe', 'srv', 'cs', 'cmu', 'edu', 'crabapple', 'fs', 'ece', 'europa', 'eng', 'gtefsd', 'com', 'howland', 'reston', 'ans', 'net', 'wupost', 'eclnews', 'cec', 'jca', 'from', 'wustl', 'joseph', 'charles', 'achkar', 'subject', 'grant', 'fuhr', 'leads', 'sabres', 'message', 'id', 'apr', 'wuecl', 'sender', 'usenet', 'news', 'administrator', 'nntp', 'posting', 'host', 'organization', 'washington', 'university', 'st', 'louis', 'mo', 'date', 'wed', 'gmt', 'lines', 'buffalo', 'is', 'up', 'the', 'series', 'with', 'boston', 'and', 'reason', 'playoff', 'hungry', 'he', 's', 'proving', 'once', 'again', 'why', 'they', 'call', 'him', 'money', 'goaltender', 'might', 'not', 'be', 'one', 'of', 'best', 'goaltenders', 'in', 'league', 'anymore', 'statistically', 'at', 'least', 'but', 'that', 'can', 'make', 'big', 'save', 'right', 'time', 'leafs', 'should', 'have', 'kept', 'probably', 'would', 'had', 'a', 'chance', 'against', 'po

In [128]:
# StopWords
# Load NLTK's English stop-word list and print it followed by its length.

st = stopwords.words("english")
print(st,len(st))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Remove StopWords from our dictionary

In [129]:
# Before filtering: vocabulary size, and whether the sample stop word 'he'
# is present.
print(len(tf_idf))

print('he' in tf_idf)

# Drop every English stop word from the score table (modified in place);
# stop words that never occurred in the corpus are simply skipped.
for stop_word in stopwords.words("english"):
    tf_idf.pop(stop_word, None)

# The same two checks after filtering.
print('he' in tf_idf)
print(len(tf_idf))

1450
True
False
1351


# Selecting k features having max tf-idf values

In [130]:
print(type(tf_idf))
# Rank the (word, score) pairs from highest to lowest score. An ascending
# stable sort followed by a reversal is used; sorted(..., reverse=True) would
# order equally scored words differently. From this point on tf_idf is a list
# of tuples rather than a dict, so this cell cannot be run twice in a row.
tf_idf = sorted(tf_idf.items(), key=lambda pair: pair[1])[::-1]

<class 'dict'>


In [131]:
# select k features
# Take the no_of_features best-ranked (word, score) pairs. Slicing caps the
# selection at the number of words available, whereas indexing position by
# position raises IndexError when the vocabulary is smaller than requested.
count = dict(tf_idf[:no_of_features])  # feature word -> its score
features = set(count)                  # the selected feature words

In [132]:
# Display the selected feature words (a set, so the printed order is arbitrary).
print(features)

{'edu', 'society', 'membership', 'groups', 'space', 'publishes', 'c', 'aerospace', 'year', 'box'}


In [134]:
# Show every selected feature next to the score it was selected with.

for word in features:
    score = count[word]
    print(word, score)

edu 8.6
society 11.0
membership 10.0
groups 10.0
space 17.2
publishes 10.0
c 30.0
aerospace 13.0
year 10.0
box 10.0


# Now that we have selected our features, we have to compute them for every document in the whole data set (training and testing)


We convert the files into a matrix like the one below (f: features, d: documents):

        f1 f2 f3 f4 ...
     d1  1  1   2  3
     d2  3  3   4  3
     d3  1  2   3  4