In [13]:
import pandas as pd
import numpy as np
import gensim
from sklearn.datasets import fetch_20newsgroups
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from sklearn import preprocessing
import nltk
import re
from sklearn.model_selection import GridSearchCV
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report,accuracy_score
import itertools
from progressbar import ProgressBar

In [2]:
def myTokenizer(text):
    """Return the whole words in ``text`` made of three or more ASCII letters.

    Matching is delegated to ``nltk.regexp_tokenize``; anything that is not
    matched by the pattern (digits, punctuation, shorter words) is skipped.
    """
    word_pattern = "\\b[a-zA-Z]{3,}\\b"
    return nltk.regexp_tokenize(text, word_pattern)
def tokenizeContents(contents):
    """Split ``contents`` into words and run ``myTokenizer`` on each one.

    Returns a list with one entry per ``\\w+`` word of ``contents``; each entry
    is the (possibly empty) list of tokens ``myTokenizer`` found in that word.
    """
    words = re.findall(r'\b\w+\b', contents)
    return list(map(myTokenizer, words))
def preprocess(model_path='/home/student1/GoogleNews-vectors-negative300.bin.gz'):
    """Load the 20 Newsgroups corpus, clean it and drop unusable documents.

    Steps, per document:
      1. newlines are replaced by single spaces;
      2. the text is split with NLTK's ``word_tokenize`` and every token that
         appears in NLTK's English stop-word list is discarded;
      3. the survivors are re-joined and filtered through ``myTokenizer``
         (whole words of 3+ ASCII letters), then joined back into one string.

    The cleaned texts are handed to ``readydocs`` together with the word
    vectors so that documents without any usable word can be removed.

    Parameters
    ----------
    model_path : str, optional
        Location of the binary word2vec-format vectors to load. Defaults to
        the path that used to be hard-coded here, so existing calls with no
        arguments behave as before.

    Returns
    -------
    (X, y)
        Whatever ``readydocs`` returns for the cleaned corpus: the remaining
        document strings and their labels.
    """
    model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
    twenty_all = fetch_20newsgroups(subset='all', shuffle=True,remove=('headers', 'footers', 'quotes'))
    stopWords = set(stopwords.words('english'))

    full = []
    for data in twenty_all.data:
        flat = " ".join(data.split("\n"))
        # NOTE(review): the membership test is case-sensitive, so a capitalised
        # stop word (e.g. "The") is presumably kept -- confirm this is intended
        # before changing it, since it affects every downstream score.
        kept = [w for w in word_tokenize(flat) if w not in stopWords]
        full.append(" ".join(myTokenizer(" ".join(kept))))

    y = twenty_all.target
    X, y = readydocs(full, y, model)
    return X, y
def readydocs(full,y,model):
    """Drop the documents that cannot be represented with ``model``.

    A document is dropped when either
      * it yields no tokens at all after ``tokenizeContents`` (empty entries
        are filtered out first), or
      * none of its tokens has a vector in ``model``.

    Only the set of dropped documents influences the result, so no vectors
    are accumulated here and nothing depends on the corpus size or on the
    embedding dimension.

    Parameters
    ----------
    full : list of str
        Cleaned document texts.
    y : array of labels aligned with ``full``.
    model : object exposing ``word_vec(word)`` that raises ``KeyError`` for
        unknown words (the only part of the model used here).

    Returns
    -------
    (X, y)
        The result of ``delundocs``: the kept documents and their labels.
    """
    l = []          # indices of documents with no tokens at all
    del_index = []  # indices of documents where no token has a vector

    for q1, text in enumerate(full):
        token = [t for t in tokenizeContents(text) if t]
        if not token:
            l.append(q1)
        elif not any(_has_vector(model, "".join(t)) for t in token):
            del_index.append(q1)

    X, y = delundocs(full, y, l, del_index)
    return X, y


def _has_vector(model, word):
    """Return True when ``model.word_vec(word)`` succeeds, False on KeyError."""
    try:
        model.word_vec(word)
    except KeyError:
        return False
    return True
def delundocs(full,y,l,del_index):
    """Remove the documents, and their labels, listed in ``l`` and ``del_index``.

    Parameters
    ----------
    full : sequence of documents.
    y : label array aligned with ``full``.
    l, del_index : lists of int
        Positions to remove. The two lists are concatenated and may overlap;
        each position is removed once.

    Returns
    -------
    (list, array)
        The remaining documents, in their original order, as a plain list,
        and ``y`` with the same positions removed. ``y`` is returned
        untouched when there is nothing to remove.
    """
    # Build the lookup once instead of once per element.
    drop = set(l + del_index)
    full_f = [value for i, value in enumerate(full) if i not in drop]
    if drop:
        # One call removes every position; no repeated array copies.
        y = np.delete(y, sorted(drop))
    return full_f, y

In [3]:
def ftext(k,X,y,train_index,test_index,dim=100,epoch=50,ws=5,lr=0.1):
    """Train and score a ``fasttext supervised`` model on one train/test split.

    The training and test documents are written to ``train20.txt`` and
    ``test20.txt`` (one ``__label__<label> <text>`` line per document), the
    ``fasttext`` command-line tool is run through the shell to train, test
    and predict, and the predictions are read back from ``result20.txt``.

    Each file is closed before the command that reads it is started, so the
    tool always sees the complete data. All file names are fixed and relative
    to the current working directory.

    Parameters
    ----------
    k : fold number; accepted for the caller's bookkeeping, not used here.
    X : indexable collection of document strings.
    y : label array aligned with ``X``; it is indexed both element by element
        and with the whole ``test_index`` array.
    train_index, test_index : sequences of positions into ``X`` / ``y``.
    dim, epoch, ws, lr : values passed to ``fasttext supervised`` as
        ``-dim``, ``-epoch``, ``-ws`` and ``-lr``.

    Returns
    -------
    float
        Accuracy of the predictions on the test documents, rounded to three
        decimals.

    Raises
    ------
    RuntimeError
        If one of the ``fasttext`` commands exits with a non-zero status;
        otherwise a stale ``result20.txt`` could be scored silently.
    """
    cmd1 = ("fasttext supervised "
            + ' -dim ' + str(dim) + ' -epoch ' + str(epoch)
            + ' -ws ' + str(ws) + ' -lr ' + str(lr)
            + " -input train20.txt -output model20 >> ftrainresult.txt")
    cmd2 = "fasttext test model20.bin test20.txt >> ftestresult.txt"
    cmd3 = "fasttext predict model20.bin test20.txt > result20.txt"

    _write_fasttext_file("train20.txt", X, y, train_index, "i")
    _run_shell(cmd1)

    _write_fasttext_file("test20.txt", X, y, test_index, "j")
    _run_shell(cmd2)
    _run_shell(cmd3)

    # Every prediction line carries a label; its digits are the class id.
    with open('result20.txt', 'r') as f:
        lines = np.asarray([np.int32("".join(re.findall(r'\d+', line))) for line in f])

    return round(accuracy_score(y[test_index],lines),3)


def _write_fasttext_file(path, X, y, indices, tag):
    """Write one ``__label__<y> <text>`` line per index and close the file."""
    with open(path, "w") as out:
        for pos, idx in enumerate(indices):
            out.write("__label__")
            out.write(str(y[idx]))
            out.write(" ")
            try:
                out.write(X[idx])
            except UnicodeEncodeError:
                # Text the file encoding cannot represent: report its position
                # and leave a label-only line, as before.
                print(tag, pos)
            out.write("\n")


def _run_shell(cmd):
    """Run ``cmd`` through the shell and raise if it reports a failure."""
    status = os.system(cmd)
    if status != 0:
        raise RuntimeError("command failed with status %s: %s" % (status, cmd))
    

In [4]:
# Search space for the grid search. The keys match the keyword parameters of
# ftext (dim, epoch, ws, lr); each list holds the candidate values to try.
p_grid = {
    'dim': [10, 25, 50, 100, 200, 250, 300],
    'epoch': [5, 20, 25, 50, 100],
    'ws': [5, 10],
    'lr': [0.05, 0.025, 0.075, 0.1, 0.150, 0.175],
}
# Keep the individual lists available under their own names as well.
dim_val, epoch_val, ws_val, lr_val = p_grid.values()


In [5]:
# Expand the grid into one dict per configuration: the Cartesian product of
# all value lists, each combination keyed by its parameter name.
CP = [
    dict(zip(p_grid.keys(), combo))
    for combo in itertools.product(*p_grid.values())
]

In [6]:
# Cleaned document texts and their labels, with unusable documents removed.
X, y = preprocess()

In [7]:
# Three stratified folds, shuffled with a fixed seed.
skf = StratifiedKFold(
    n_splits=3,
    random_state=1,
    shuffle=True,
)

In [8]:
# Exhaustive grid search: every configuration is scored on every fold.
# ac_list ends up configuration-major, i.e. all folds of the first
# configuration, then all folds of the second, and so on.
pbar = ProgressBar()
ac_list = []
for x in pbar(CP):
    for k, (train_index, test_index) in enumerate(skf.split(X, y)):
        # x holds exactly the keyword parameters dim / epoch / ws / lr.
        fold_accuracy = ftext(k, X, y, train_index, test_index, **x)
        ac_list.append(fold_accuracy)
    

100% (420 of 420) |#######################| Elapsed Time: 3:37:31 Time: 3:37:31


In [9]:
import statistics as st

# Mean accuracy per configuration. ac_list stores the fold scores of each
# configuration consecutively, so every run of `n_folds` entries belongs to
# one configuration. The group size is read from the splitter instead of
# being a literal, so it stays right if the number of folds is changed.
n_folds = skf.get_n_splits()
assert len(ac_list) % n_folds == 0, "ac_list does not hold a whole number of fold groups"
ac_conlis = [
    st.mean(ac_list[i:i + n_folds])
    for i in range(0, len(ac_list), n_folds)
]

In [10]:
# Positions of the 50 highest mean accuracies, best first.
index = np.flip(np.argsort(ac_conlis))[:50]

In [11]:
# Array versions of both sequences, then one report line per ranked
# configuration: its parameters and its mean accuracy to three decimals.
CP, ac_conlis = np.asarray(CP), np.asarray(ac_conlis)
for v in index:
    print(
        'parameters:',
        CP[v],
        '-------------------->>>>score :',
        round(ac_conlis[v], 3),
    )

parameters: {'dim': 50, 'epoch': 100, 'ws': 5, 'lr': 0.075} -------------------->>>>score : 0.705
parameters: {'dim': 50, 'epoch': 100, 'ws': 10, 'lr': 0.05} -------------------->>>>score : 0.704
parameters: {'dim': 100, 'epoch': 100, 'ws': 10, 'lr': 0.1} -------------------->>>>score : 0.704
parameters: {'dim': 50, 'epoch': 100, 'ws': 10, 'lr': 0.075} -------------------->>>>score : 0.704
parameters: {'dim': 50, 'epoch': 100, 'ws': 5, 'lr': 0.05} -------------------->>>>score : 0.704
parameters: {'dim': 50, 'epoch': 100, 'ws': 10, 'lr': 0.1} -------------------->>>>score : 0.704
parameters: {'dim': 50, 'epoch': 50, 'ws': 5, 'lr': 0.1} -------------------->>>>score : 0.704
parameters: {'dim': 50, 'epoch': 50, 'ws': 5, 'lr': 0.175} -------------------->>>>score : 0.703
parameters: {'dim': 50, 'epoch': 50, 'ws': 5, 'lr': 0.15} -------------------->>>>score : 0.703
parameters: {'dim': 100, 'epoch': 50, 'ws': 5, 'lr': 0.15} -------------------->>>>score : 0.703
parameters: {'dim': 25, 'epo