<a href="https://colab.research.google.com/github/Carlone92/Text-Classification-Reuters21578/blob/master/Perceptron.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from nltk import word_tokenize
from nltk.corpus import reuters
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords
import nltk
import time
from scipy import sparse
import numpy as np
import matplotlib.pyplot as plt



def init():
  nltk.download('reuters')
  nltk.download('stopwords')
  nltk.download('punkt')
  
def extractVocabulary(text):
  vocabulary = set(tokenize(text))
  return vocabulary

def tokenize(text):
  min_length = 3
  cachedStopWords = stopwords.words("english")
  words = map(lambda word: word.lower(), word_tokenize(text))
  words = [word for word in words if word not in cachedStopWords]
  tokens =(list(map(lambda token: PorterStemmer().stem(token), words)))
  p = re.compile('[a-zA-Z]+')
  filtered_tokens = list(filter(lambda token: p.match(token) and len(token)>=min_length, tokens))
  return filtered_tokens


def calc_r(tfidf,train_docs):
    return max([sparse.linalg.norm(tfidf[d]) for doc in train_docs])
  

  
def tf(docs):
    tf={}
    for doc in docs:
        words=tokenize(reuters.raw(doc))
        doc_voc=set(words)
        for w in doc_voc:
            tf[(doc,w)]=words.count(w)/len(words)
    return tf

def idf(docs,sorted_voc):
    idf=np.zeros(len(sorted_voc))
    words=list()
    for doc in docs:
        word_in_doc=extractVocabulary(reuters.raw(doc))
        words.extend(word_in_doc)
    for word in sorted_voc:
        idf[sorted_voc.index(word)]=math.log(len(docs))-math.log(1+words.count(word))           
    return idf

def tfidfSparse(tf,idf,doc, vocabulary):
  tfidf=np.zeros(len(vocabulary))
  for word in vocabulary:
    if tf.get((doc,word),0)!=0:
      tfidf[vocabulary.index(word)]=tf.get((doc,word))*idf[vocabulary.index(word)]
  return sparse.csr_matrix(tfidf)

def tfidf_noSparse(tf,idf,doc, vocabulary):
  tfidf=np.zeros(len(vocabulary))
  for word in vocabulary:
    if tf.get((doc,word),0)!=0:
      tfidf[vocabulary.index(word)]=tf.get((doc,word))*idf[vocabulary.index(word)]
  return tfidf

def label(d,docs_in_class):
  if d in docs_in_class:
    y=1
  else:
    y=-1
  return y



In [126]:
#@title Default title text
init()
categories='acq','earn','money-fx','grain','crude','trade'
localtime = time.asctime( time.localtime(time.time()) )
print(localtime)
train_docs = [doc for c in categories for doc in reuters.fileids(c) if doc.startswith("train")]
test_docs = [doc for c in categories for doc in reuters.fileids(c) if doc.startswith("test")]

test_docs_in_class={cat:set(filter(lambda doc:doc in reuters.fileids(cat),test_docs)) for cat in categories}
docs_in_class={cat:set(filter(lambda doc: doc in reuters.fileids(cat),train_docs)) for cat in categories}

vocabulary=sorted(extractVocabulary(reuters.raw(train_docs)+' '))
vocabularyTest=sorted(extractVocabulary(reuters.raw(test_docs)+' '))
vocAll=sorted(vocabulary+vocabularyTest)

localtime = time.asctime( time.localtime(time.time()) )
print(localtime)
  

print("TRAIN documents:" ,len(train_docs))
print("TEST documents:" ,len(test_docs))
print("Words in vocabulary: ", len(vocabulary))

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Fri Jan 11 15:09:00 2019
Fri Jan 11 15:10:05 2019
TRAIN documents: 6255
TEST documents: 2440
Words in vocabulary:  17976


In [127]:
localtime = time.asctime( time.localtime(time.time()) )
print('Time Start Tf',localtime)
tfTrain=tf(train_docs)

localtime = time.asctime( time.localtime(time.time()) )
print('End Tf & Start Idf',localtime)

idfTrain=idf(train_docs,vocabulary)
localtime = time.asctime( time.localtime(time.time()) )
print('End Idf',localtime)


Time Start Tf Fri Jan 11 15:10:05 2019
End Tf & Start Idf Fri Jan 11 15:10:37 2019
End Idf Fri Jan 11 15:12:50 2019


In [128]:
localtime = time.asctime( time.localtime(time.time()) )
print('Start Tf_Idf for all documnets',localtime)

tfIdfTrain={}
for d in train_docs:
  tfIdfTrain[d]=tfidfSparse(tfTrain,idfTrain,d,vocabulary)

localtime = time.asctime( time.localtime(time.time()) )
print('End Tf_Idf for all documnets',localtime)

Start Tf_Idf for all documnets Fri Jan 11 15:12:50 2019
End Tf_Idf for all documnets Fri Jan 11 15:16:27 2019


In [129]:
r=calc_r(tfIdfTrain,train_docs)
print(r)

0.47788808044290526


In [0]:

def train(train_docs,docs_in_class,tfidf,rquadrato,n_iter):
  
  weight=np.zeros(len(vocabulary)+1)
  epochs=0
  bias=0
  finish=False
  while not finish and epochs < n_iter:
    error=0
    for d in train_docs:
      y=label(d,docs_in_class)
      if y*(tfidf[d].dot(weight[1:])+weight[0])<= 0:
        weight[1:]=weight[1:]+(y*tfidf[d])
        weight[0]=weight[0]+(y*rquadrato)
        error+=1
        
    if error==0:
      finish=True
    epochs+=1
  return weight[1:],weight[0],epochs



In [131]:
weights={}
epochs={}
bias={}

localtime = time.asctime( time.localtime(time.time()) )
print(localtime)

for cat in categories:
  weights[cat],bias[cat],epochs[cat]=train(train_docs,docs_in_class[cat],tfIdfTrain,r**2,1000)
  
  localtime = time.asctime( time.localtime(time.time()) )

  print('Categoria',cat,weights[cat])
  print('Bias',bias[cat])
  print('Epoche', epochs[cat],localtime)
  
  

Fri Jan 11 15:16:28 2019
Categoria acq [0.         0.         0.23417205 ... 0.         0.         0.        ]
Bias -0.22837701742940464
Epoche 1000 Fri Jan 11 15:17:47 2019
Categoria earn [ 0.          0.         -0.02128837 ... -0.01845869  0.
  0.        ]
Bias -5.551115123125783e-17
Epoche 1000 Fri Jan 11 15:19:07 2019
Categoria money-fx [ 0.          0.          0.         ... -0.01845869  0.
  0.        ]
Bias -0.2283770174294047
Epoche 1000 Fri Jan 11 15:20:27 2019
Categoria grain [0.         0.         0.         ... 0.03691738 0.         0.        ]
Bias -0.2283770174294047
Epoche 1000 Fri Jan 11 15:21:46 2019
Categoria crude [0.        0.        0.0638651 ... 0.        0.        0.       ]
Bias -5.551115123125783e-17
Epoche 1000 Fri Jan 11 15:23:05 2019
Categoria trade [0. 0. 0. ... 0. 0. 0.]
Bias 0.22837701742940464
Epoche 1000 Fri Jan 11 15:24:24 2019


In [0]:
localtime = time.asctime( time.localtime(time.time()) )
print('Time Start Tf',localtime)
tfTest=tf(test_docs)

localtime = time.asctime( time.localtime(time.time()) )
print('End Tf & Start Idf',localtime)

idfTest=idf(test_docs,vocabulary)
localtime = time.asctime( time.localtime(time.time()) )
print('End Idf',localtime)

Time Start Tf Fri Jan 11 15:24:24 2019
End Tf & Start Idf Fri Jan 11 15:24:36 2019


In [0]:
tfIdfTest={}
for d in test_docs:
  tfIdfTest[d]=tfidfSparse(tfTest,idfTest,d,vocabulary)

In [0]:

def test(docs,weights,bias,tfidf):
  scores=np.zeros(len(docs))
  pesi={}
  for d in docs:
    np.put(scores,docs.index(d),(tfidf[d].dot(weights)+bias),'raise')
    pesi[d]=(tfidf[d].dot(weights)+bias)
  return scores,pesi

def soglia(score1):
  score=sorted(score1)
  soglia=np.zeros(len(score)+1)
  i=0
  j=2
  while i!=len(score) and j!=len(score)+1:
    a=np.median(score[i:j])
    np.put(soglia,j-1,a,'raise')
    i+=1
    j+=1
  soglia[0]=np.min(score)
  soglia[-1]=np.max(score)
  return (soglia)

def calcoloTf(soglia,score,verita):
  Tn={}
  Tp={}
  Fp={}
  Fn={}
  for s in soglia:
    tp_count=tn_count=fn_count=fp_count=0
    for doc in score :
      if score[doc] >= s:
        if doc in verita:
          tp_count+=1
        elif  doc not in verita:
          fp_count+=1

      else:
        if doc not in verita:
          fn_count+=1
        elif doc in verita:
          tn_count+=1
  
    Tn[s]=tn_count
    Tp[s]=tp_count
    Fp[s]=fp_count
    Fn[s]=fn_count
    #print('TP',Tp,'TF',Tf,'FN',Fn,'FP',Fp)
  return Tn,Tp,Fp,Fn 


def precision_recall(soglia,tp,fn,fp):
  precision=np.zeros(len((soglia)))
  recall=np.zeros(len(soglia))
  for s in range(len(soglia)):
    if (tp[soglia[s]]+fp[soglia[s]]) !=0:
      dp=tp[soglia[s]]/(tp[soglia[s]]+fp[soglia[s]])
      np.put(precision,s,dp,'raise')
    if (tp[soglia[s]]+fn[soglia[s]])!=0:
      dr=tp[soglia[s]]/(tp[soglia[s]]+fn[soglia[s]])
      np.put(recall,s,dr,'raise')
  return recall,precision

def Accuratezza(score,verita):
  soglia=[0]
  tp=tn=fp=fn=0
  for doc in score:
    if np.all(score[doc] < 0):
      if doc not in verita:
          fn+=1
      elif doc in verita:
          tn+=1
        
    else:
      if doc in verita:
          tp+=1
      elif  doc not in verita:
          fp+=1
  div=tp+tn
  divid=tp+tn+fp+fn
  return np.divide(div,divid)


def countZero(vect):
  count=0
  for i in range(len(vect)):
    if vect[i] == 0:
      print(i)
      count+=1
  print(count)

def extractValue(dict):
  score=np.zeros(len(dict))
  for i in range(len(dict)):
    np.put(score,i,dict[i],'raise')

In [0]:
score={}
weights_test={}
for cat in categories:
  score[cat],weights_test[cat]=test(test_docs,weights[cat],bias[cat],tfIdfTest)



In [0]:
for cat in categories:
  t=(soglia(score[cat]))
  print(t)
  tf,tp,fp,fn=calcoloTf(t,weights_test[cat],test_docs_in_class[cat])
  recall,precision=precision_recall(t,tp,fn,fp)
  plt.xlabel('Recall')
  plt.ylabel('Precision')
  plt.plot(recall,precision)
  print(cat)
  plt.show()
   