In [93]:
import re
import nltk
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
import collections
import math
import os
import numpy as np 

In [10]:
# Regex fragments used by split_into_sentences. They are raw strings so that
# regex escapes such as \s are passed to `re` verbatim instead of being
# treated as (invalid) Python string escapes.
caps = r"([A-Z])"                      # a single capital letter
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"     # titles followed by a period
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"       # company / name suffixes
# Words that are taken to open a new sentence after an acronym or suffix.
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"   # two- or three-letter dotted acronyms
websites = r"[.](com|net|org|io|gov)"  # domain endings whose dot is not a full stop


In [11]:
def split_into_sentences(text):
    """Split raw text into sentences using regex-based heuristics.

    Periods that should not end a sentence (titles, single initials, dotted
    acronyms, company suffixes, website domains, "Ph.D.") are temporarily
    rewritten to the placeholder ``<prd>``. Every remaining ``.``, ``?`` and
    ``!`` is then followed by a ``<stop>`` marker, the placeholders are turned
    back into periods and the text is split on ``<stop>``.

    Relies on the module-level pattern strings ``caps``, ``prefixes``,
    ``suffixes``, ``starters``, ``acronyms`` and ``websites``.

    Parameters
    ----------
    text : str
        The text to split. Newlines are treated as spaces.

    Returns
    -------
    list of str
        The sentences, stripped of surrounding whitespace. Trailing text that
        is not closed by ``.``, ``?`` or ``!`` is returned as a final
        sentence instead of being discarded.
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect periods that are part of abbreviations, domains and initials.
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    if "Ph.D" in text:
        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + caps + "[.] ", " \\1<prd> ", text)
    # An acronym directly followed by a sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + caps + "[.]", " \\1<prd>", text)

    # Move terminators outside closing quotes so the quote stays with its
    # sentence when the text is split.
    if "”" in text:
        text = text.replace(".”", "”.")
    if "\"" in text:
        text = text.replace(".\"", "\".")
    if "!" in text:
        text = text.replace("!\"", "\"!")
    if "?" in text:
        text = text.replace("?\"", "\"?")

    # Mark the real sentence boundaries, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [s.strip() for s in text.split("<stop>")]
    # The last piece is whatever follows the final terminator. It is normally
    # only the padding added above; drop it only when it is empty so that an
    # unterminated trailing sentence is not silently lost.
    if sentences and not sentences[-1]:
        sentences.pop()
    return sentences

In [35]:
# Sample passage (opening of a sale agreement) used to try out the sentence
# splitter; it contains many abbreviations such as "M/s.", "Shri." and "No.".
text = "This Agreement of Sale is made and executed on this the day of at Secunderabadby and between: M/s. GREENWOOD ESTATES, a registered partnership firm, having its office at 5-4-187/3&4, II floor, Soham Mansion, M.G. Road, Secunderabad –500 003, represented by its Partners/ Authorised representatives Shri. Soham Modi, Son of Shri. Satish Modi aged about 37 years, Occupation: Business, resident of Plot No. 280, Jublee Hills, Hyderabad, and Smt.K. Sridevi, W/o. Shri. K.V.S. Reddy, aged about 32 years, R/o. Flat No. 502, Vasavi Homes,Uma Nagar, 1st lane, Begumpet, Hyderabad, hereinafter called the Vendor ."
# Split the sample passage into sentences.
sentences = split_into_sentences(text)

In [36]:
# Show the sentences produced for the sample passage.
sentences

['This Agreement of Sale is made and executed on this the day of at Secunderabadby and between: M/s.',
 'GREENWOOD ESTATES, a registered partnership firm, having its office at 5-4-187/3&4, II floor, Soham Mansion, M.G. Road, Secunderabad –500 003, represented by its Partners/ Authorised representatives Shri.',
 'Soham Modi, Son of Shri.',
 'Satish Modi aged about 37 years, Occupation: Business, resident of Plot No.',
 '280, Jublee Hills, Hyderabad, and Smt.',
 'K.',
 'Sridevi, W/o.',
 'Shri.',
 'K.V.S. Reddy, aged about 32 years, R/o.',
 'Flat No.',
 '502, Vasavi Homes,Uma Nagar, 1st lane, Begumpet, Hyderabad, hereinafter called the Vendor .']

In [37]:
# Fetch the NLTK resources this notebook reads.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')
# The stop-word list is loaded with stopwords.words('english') in a later
# cell; on a machine without this corpus that call raises LookupError.
nltk.download('stopwords')
porter = PorterStemmer()

stemmer = nltk.stem.porter.PorterStemmer()
# Matches runs of word characters; used by text_to_vector to tokenize text.
WORD = re.compile(r'\w+')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/divyansh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/divyansh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/divyansh/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/divyansh/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [38]:
# English stop words from NLTK, kept in a set for membership tests.
stop = set(stopwords.words('english'))

In [39]:
def remove_stop_words(sentences, stop_words=None, stemmer=None):
    """Lower-case, whitespace-tokenize, drop stop words and stem each sentence.

    Tokens are produced by ``str.split()``, so punctuation attached to a word
    stays part of the token (e.g. ``"firm,"``).

    Parameters
    ----------
    sentences : iterable of str
        Sentences to process.
    stop_words : container of str, optional
        Words to discard. Defaults to the module-level ``stop`` set.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer applied to every kept token. Defaults to the module-level
        ``porter`` stemmer.

    Returns
    -------
    list of list of str
        One token list per input sentence, in input order.
    """
    if stop_words is None:
        stop_words = stop
    if stemmer is None:
        stemmer = porter

    tokenized_sentences = []
    for sentence in sentences:
        tokens = []
        for word in sentence.lower().split():
            if word in stop_words:
                continue
            try:
                tokens.append(stemmer.stem(word))
            except Exception:
                # Best effort: if the stemmer fails on a token, keep the raw
                # token. Only Exception is caught, so KeyboardInterrupt and
                # SystemExit still propagate.
                tokens.append(word)
        tokenized_sentences.append(tokens)

    return tokenized_sentences

In [40]:
# Tokenize, filter and stem the sample sentences.
tokennized_setences = remove_stop_words(sentences)

In [41]:
# Show the token lists; punctuation stays attached because tokens come from str.split().
tokennized_setences

[['agreement',
  'sale',
  'made',
  'execut',
  'day',
  'secunderabadbi',
  'between:',
  'm/s.'],
 ['greenwood',
  'estates,',
  'regist',
  'partnership',
  'firm,',
  'offic',
  '5-4-187/3&4,',
  'ii',
  'floor,',
  'soham',
  'mansion,',
  'm.g.',
  'road,',
  'secunderabad',
  '–500',
  '003,',
  'repres',
  'partners/',
  'authoris',
  'repres',
  'shri.'],
 ['soham', 'modi,', 'son', 'shri.'],
 ['satish',
  'modi',
  'age',
  '37',
  'years,',
  'occupation:',
  'business,',
  'resid',
  'plot',
  'no.'],
 ['280,', 'juble', 'hills,', 'hyderabad,', 'smt.'],
 ['k.'],
 ['sridevi,', 'w/o.'],
 ['shri.'],
 ['k.v.s.', 'reddy,', 'age', '32', 'years,', 'r/o.'],
 ['flat', 'no.'],
 ['502,',
  'vasavi',
  'homes,uma',
  'nagar,',
  '1st',
  'lane,',
  'begumpet,',
  'hyderabad,',
  'hereinaft',
  'call',
  'vendor',
  '.']]

In [42]:
def posTagger(tokenized_sentences) :
    """Run ``nltk.pos_tag`` on every token list.

    Returns a list with one ``nltk.pos_tag`` result per input sentence, in
    the same order as ``tokenized_sentences``.
    """
    return [nltk.pos_tag(tokens) for tokens in tokenized_sentences]

In [43]:
# Part-of-speech tag the stemmed, lower-cased tokens of the sample.
tagged = posTagger(tokennized_setences)

In [44]:
# Show the (token, tag) pairs for each sentence.
tagged

[[('agreement', 'NN'),
  ('sale', 'NN'),
  ('made', 'VBD'),
  ('execut', 'JJ'),
  ('day', 'NN'),
  ('secunderabadbi', 'VBD'),
  ('between:', 'NN'),
  ('m/s.', 'NN')],
 [('greenwood', 'NN'),
  ('estates,', 'NNS'),
  ('regist', 'VBP'),
  ('partnership', 'NN'),
  ('firm,', 'JJ'),
  ('offic', 'JJ'),
  ('5-4-187/3&4,', 'JJ'),
  ('ii', 'NN'),
  ('floor,', 'NN'),
  ('soham', 'NN'),
  ('mansion,', 'NN'),
  ('m.g.', 'NN'),
  ('road,', 'NN'),
  ('secunderabad', 'VBD'),
  ('–500', '$'),
  ('003,', 'CD'),
  ('repres', 'NNS'),
  ('partners/', 'JJ'),
  ('authoris', 'JJ'),
  ('repres', 'NNS'),
  ('shri.', 'VBP')],
 [('soham', 'NN'), ('modi,', 'NN'), ('son', 'NN'), ('shri.', 'NN')],
 [('satish', 'JJ'),
  ('modi', 'NN'),
  ('age', 'NN'),
  ('37', 'CD'),
  ('years,', 'NN'),
  ('occupation:', 'NN'),
  ('business,', 'NN'),
  ('resid', 'NN'),
  ('plot', 'NN'),
  ('no.', 'NN')],
 [('280,', 'CD'),
  ('juble', 'JJ'),
  ('hills,', 'NN'),
  ('hyderabad,', 'NN'),
  ('smt.', 'NN')],
 [('k.', 'NN')],
 [('sridevi,'

In [45]:
def tfIsf(tokenized_sentences):
    """Score each sentence from its term counts and corpus-wide term counts.

    For every distinct word in a sentence the contribution is
    ``count_in_sentence * log(total occurrences of the word in all sentences)``;
    the contributions are summed and divided by the number of tokens in the
    sentence. Note that the weight is the log of the total occurrence count,
    not an inverse sentence frequency, so words that occur once contribute 0.

    Parameters
    ----------
    tokenized_sentences : list of list of str
        Token lists, one per sentence.

    Returns
    -------
    list of float
        One score per sentence. A sentence with no tokens scores 0.0.
    """
    # Count every token once up front instead of rescanning all sentences
    # for each word of each sentence.
    corpus_counts = collections.Counter()
    for sentence in tokenized_sentences:
        corpus_counts.update(sentence)

    scores = []
    for sentence in tokenized_sentences:
        if not sentence:
            # Avoid dividing by zero for a sentence without tokens.
            scores.append(0.0)
            continue
        score = 0
        for word, term_count in collections.Counter(sentence).items():
            score = score + term_count * math.log(corpus_counts[word])
        scores.append(score / len(sentence))
    return scores

In [50]:
# Term-frequency based score for each sample sentence.
scores = tfIsf(tokennized_setences)

In [51]:
# Show the tfIsf scores (one per sentence).
scores

[0.0,
 0.1513358966832355,
 0.44793986730701374,
 0.20794415416798356,
 0.13862943611198905,
 0.0,
 0.0,
 1.0986122886681098,
 0.23104906018664842,
 0.34657359027997264,
 0.057762265046662105]

In [52]:
def similar(tokens_a, tokens_b) :
    """Jaccard similarity between two token collections.

    Parameters
    ----------
    tokens_a, tokens_b : iterable of hashable
        Tokens of the two sentences; duplicates are ignored.

    Returns
    -------
    float
        ``|A & B| / |A | B|`` in [0, 1]. When both inputs are empty the union
        is empty and 0.0 is returned instead of dividing by zero.
    """
    set_a = set(tokens_a)
    set_b = set(tokens_b)
    union = set_a | set_b
    if not union:
        return 0.0
    return len(set_a & set_b) / float(len(union))



def similarityScores(tokenized_sentences) :
    """Sum each sentence's ``similar`` score against the other sentences.

    A sentence is compared with every entry whose token list differs from its
    own by value, so exact duplicates of a sentence (not only the sentence
    itself) are left out of its total.

    Returns a list with one accumulated score per sentence, in input order.
    """
    totals = []
    for current in tokenized_sentences :
        total = 0
        others = (other for other in tokenized_sentences if other != current)
        for other in others :
            total += similar(current, other)
        totals.append(total)
    return totals

In [53]:
# Overwrites `scores` with the summed Jaccard similarity of each sample sentence.
scores = similarityScores(tokennized_setences)

In [54]:
# Show the similarity scores (one per sentence).
scores

[0.0,
 0.14090909090909093,
 0.34090909090909094,
 0.23376623376623376,
 0.0625,
 0.0,
 0.0,
 0.3,
 0.14285714285714285,
 0.09090909090909091,
 0.0625]

In [58]:
def properNounScores(tagged, tags=('NN', 'NNPS')) :
    """Fraction of tokens in each sentence whose POS tag is in ``tags``.

    Parameters
    ----------
    tagged : list of list of (token, tag)
        Tagged sentences; only the tag at index 1 of every pair is read.
    tags : container of str, optional
        Tags that count towards the score. The default keeps the tags this
        function has always counted.
        NOTE(review): 'NN' is the tag for common nouns in the Penn Treebank
        tag set; proper nouns are 'NNP'/'NNPS'. Pass ``tags=('NNP', 'NNPS')``
        if only proper nouns should count -- confirm the intended set.

    Returns
    -------
    list of float
        One ratio per sentence. A sentence with no tokens scores 0.0.
    """
    scores = []
    for tagged_sentence in tagged :
        if not tagged_sentence :
            # Avoid dividing by zero for a sentence without tokens.
            scores.append(0.0)
            continue
        matches = 0
        for pair in tagged_sentence :
            if pair[1] in tags :
                matches += 1
        scores.append(matches / float(len(tagged_sentence)))
    return scores

In [59]:
# Share of tokens tagged 'NN' or 'NNPS' in each sample sentence.
prop_score = properNounScores(tagged)

In [60]:
# Show the noun-tag ratios (one per sentence).
prop_score

[0.625,
 0.38095238095238093,
 1.0,
 0.8,
 0.6,
 1.0,
 1.0,
 1.0,
 0.8333333333333334,
 0.5,
 0.5]

In [62]:
# Switch into the ./articles folder so the article files can be opened by
# bare file name in the following cells.
cwd = os.getcwd()
articles_dir = os.path.join(cwd, "articles")
# Re-running this cell after a successful chdir used to look for
# articles/articles and fail; skip the chdir when we are already inside
# "articles" and no nested "articles" folder exists.
if os.path.basename(cwd) != "articles" or os.path.isdir(articles_dir):
    os.chdir(articles_dir)

In [67]:
# Read the whole article. The context manager closes the handle as soon as
# the text is loaded, and the encoding is explicit so the result does not
# depend on the platform default (the article contains non-ASCII characters
# such as the en dash).
with open('article19.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [68]:
# Show the raw article text.
text

'AGREEMENT OF SALE\nThis Agreement of Sale is made and executed on this the day of at Secunderabad\nby and between:\nM/s. GREENWOOD ESTATES, a registered partnership firm, having its office at\n5-4-187/3&4, II floor, Soham Mansion, M.G. Road, Secunderabad –500 003, represented by its\nPartners/ Authorised representatives Shri. Soham Modi, Son of Shri. Satish Modi aged about\n37 years, Occupation: Business, resident of Plot No. 280, Jublee Hills, Hyderabad, and Smt.\nK. Sridevi, W/o. Shri. K.V.S. Reddy, aged about 32 years, R/o. Flat No. 502, Vasavi Homes,\nUma Nagar, 1st lane, Begumpet, Hyderabad, hereinafter called the "Vendor".\nA N D\n1. Shri. Karnati Bhaskar, S/o. Shri. K. Narsimha, aged about 41 years, Occupation Business,\nResident of H.No. 2-44/1, Sai Nagar, Chaitanyapuri, Dilshuknagar, Hyderabad.\n2. Shri. K. Gopinath, S/o. Shri. K. Bhaskar aged about 18 years, Occupation Business,\nResident of H.No. 2-44/1, Sai Nagar, Chaitanyapuri, Dilshuknagar, Hyderabad.\n3. Shri. A. Purush

In [69]:
# Split the article into sentences (replaces the sample-passage sentences).
sentences = split_into_sentences(text)

In [70]:
# Show the article sentences.
sentences

['AGREEMENT OF SALE This Agreement of Sale is made and executed on this the day of at Secunderabad by and between: M/s.',
 'GREENWOOD ESTATES, a registered partnership firm, having its office at 5-4-187/3&4, II floor, Soham Mansion, M.G. Road, Secunderabad –500 003, represented by its Partners/ Authorised representatives Shri.',
 'Soham Modi, Son of Shri.',
 'Satish Modi aged about 37 years, Occupation: Business, resident of Plot No.',
 '280, Jublee Hills, Hyderabad, and Smt.',
 'K. Sridevi, W/o.',
 'Shri.',
 'K.V.S. Reddy, aged about 32 years, R/o.',
 'Flat No.',
 '502, Vasavi Homes, Uma Nagar, 1st lane, Begumpet, Hyderabad, hereinafter called the "Vendor".',
 'A N D 1.',
 'Shri.',
 'Karnati Bhaskar, S/o.',
 'Shri.',
 'K. Narsimha, aged about 41 years, Occupation Business, Resident of H.No.',
 '2-44/1, Sai Nagar, Chaitanyapuri, Dilshuknagar, Hyderabad.',
 '2.',
 'Shri.',
 'K. Gopinath, S/o.',
 'Shri.',
 'K. Bhaskar aged about 18 years, Occupation Business, Resident of H.No.',
 '2-44/1

In [71]:
# Tokenize, filter and stem the article sentences.
tokennized_setences = remove_stop_words(sentences)

In [72]:
# Show the article token lists.
tokennized_setences

[['agreement',
  'sale',
  'agreement',
  'sale',
  'made',
  'execut',
  'day',
  'secunderabad',
  'between:',
  'm/s.'],
 ['greenwood',
  'estates,',
  'regist',
  'partnership',
  'firm,',
  'offic',
  '5-4-187/3&4,',
  'ii',
  'floor,',
  'soham',
  'mansion,',
  'm.g.',
  'road,',
  'secunderabad',
  '–500',
  '003,',
  'repres',
  'partners/',
  'authoris',
  'repres',
  'shri.'],
 ['soham', 'modi,', 'son', 'shri.'],
 ['satish',
  'modi',
  'age',
  '37',
  'years,',
  'occupation:',
  'business,',
  'resid',
  'plot',
  'no.'],
 ['280,', 'juble', 'hills,', 'hyderabad,', 'smt.'],
 ['k.', 'sridevi,', 'w/o.'],
 ['shri.'],
 ['k.v.s.', 'reddy,', 'age', '32', 'years,', 'r/o.'],
 ['flat', 'no.'],
 ['502,',
  'vasavi',
  'homes,',
  'uma',
  'nagar,',
  '1st',
  'lane,',
  'begumpet,',
  'hyderabad,',
  'hereinaft',
  'call',
  '"vendor".'],
 ['n', '1.'],
 ['shri.'],
 ['karnati', 'bhaskar,', 's/o.'],
 ['shri.'],
 ['k.',
  'narsimha,',
  'age',
  '41',
  'years,',
  'occup',
  'business

In [73]:
# Part-of-speech tag the article tokens.
tagged = posTagger(tokennized_setences)

In [74]:
# Show the tagged article sentences.
tagged

[[('agreement', 'NN'),
  ('sale', 'NN'),
  ('agreement', 'NN'),
  ('sale', 'NN'),
  ('made', 'VBD'),
  ('execut', 'JJ'),
  ('day', 'NN'),
  ('secunderabad', 'VBD'),
  ('between:', 'NN'),
  ('m/s.', 'NN')],
 [('greenwood', 'NN'),
  ('estates,', 'NNS'),
  ('regist', 'VBP'),
  ('partnership', 'NN'),
  ('firm,', 'JJ'),
  ('offic', 'JJ'),
  ('5-4-187/3&4,', 'JJ'),
  ('ii', 'NN'),
  ('floor,', 'NN'),
  ('soham', 'NN'),
  ('mansion,', 'NN'),
  ('m.g.', 'NN'),
  ('road,', 'NN'),
  ('secunderabad', 'VBD'),
  ('–500', '$'),
  ('003,', 'CD'),
  ('repres', 'NNS'),
  ('partners/', 'JJ'),
  ('authoris', 'JJ'),
  ('repres', 'NNS'),
  ('shri.', 'VBP')],
 [('soham', 'NN'), ('modi,', 'NN'), ('son', 'NN'), ('shri.', 'NN')],
 [('satish', 'JJ'),
  ('modi', 'NN'),
  ('age', 'NN'),
  ('37', 'CD'),
  ('years,', 'NN'),
  ('occupation:', 'NN'),
  ('business,', 'NN'),
  ('resid', 'NN'),
  ('plot', 'NN'),
  ('no.', 'NN')],
 [('280,', 'CD'),
  ('juble', 'JJ'),
  ('hills,', 'NN'),
  ('hyderabad,', 'NN'),
  ('smt.',

In [75]:
# Term-frequency based score for each article sentence.
scores = tfIsf(tokennized_setences)

In [76]:
# Show the tfIsf scores for the article.
scores

[0.4969813299576001,
 0.5337208535531195,
 1.3143738430069454,
 1.1666847547242483,
 0.2772588722239781,
 0.9985774245179969,
 2.772588722239781,
 1.0789817827253791,
 1.354025100551105,
 0.39895764523183713,
 0.0,
 2.772588722239781,
 0.5364793041447001,
 2.772588722239781,
 1.4999365567767549,
 0.9613868326322953,
 0.0,
 2.772588722239781,
 1.0729586082894003,
 2.772588722239781,
 1.4229202033812056,
 0.9613868326322953,
 0.0,
 2.772588722239781,
 0.9985774245179969,
 2.772588722239781,
 1.552159182248503,
 1.1337324605540517,
 0.9438267466893243,
 0.0,
 2.772588722239781,
 0.9985774245179969,
 2.772588722239781,
 1.4751428288529536,
 1.1337324605540517,
 0.9438267466893243,
 0.0,
 2.772588722239781,
 0.5364793041447001,
 2.772588722239781,
 1.3996054897495935,
 0.26823965207235007,
 2.0794415416798357,
 1.3143738430069454,
 1.1945063128187032,
 0.9985774245179969,
 2.772588722239781,
 0.9939626599152002,
 0.26163056782016575,
 0.0,
 0.0,
 0.46110993176424203]

In [77]:
# Overwrites `scores` with the summed Jaccard similarity of each article sentence.
scores = similarityScores(tokennized_setences)

In [78]:
# Show the similarity scores for the article.
scores

[0.14586056644880174,
 1.1782533670033672,
 3.290909090909091,
 2.3639249639249638,
 0.1625,
 0.38181818181818183,
 1.05,
 1.5794538794538795,
 1.4956709956709957,
 0.5330882352941176,
 0.0,
 1.05,
 0.8,
 1.05,
 3.2829151240915944,
 0.40847387906211435,
 0.0,
 1.05,
 1.3818181818181818,
 1.05,
 3.146551487727958,
 0.40847387906211435,
 0.0,
 1.05,
 1.2818181818181817,
 1.05,
 3.355642396818867,
 0.8166666666666667,
 0.2806417112299465,
 0.0,
 1.05,
 1.2818181818181817,
 1.05,
 3.219278760455231,
 0.8166666666666667,
 0.2806417112299465,
 0.0,
 1.05,
 0.8,
 1.05,
 2.9258658008658003,
 0.31756478815302347,
 6.590909090909091,
 3.290909090909091,
 1.6794538794538794,
 0.38181818181818183,
 1.05,
 1.5649456752397928,
 0.3314664502164502,
 0.0,
 0.0,
 0.444276856041562]

In [79]:
# Display the noun-tag ratio of each article sentence (the result is not stored here).
properNounScores(tagged)

[0.7,
 0.38095238095238093,
 1.0,
 0.8,
 0.6,
 1.0,
 1.0,
 0.8333333333333334,
 0.5,
 0.5,
 0.0,
 1.0,
 0.3333333333333333,
 1.0,
 0.7777777777777778,
 0.8333333333333334,
 0.0,
 1.0,
 1.0,
 1.0,
 0.8888888888888888,
 0.8333333333333334,
 0.0,
 1.0,
 1.0,
 1.0,
 0.8888888888888888,
 0.3333333333333333,
 0.6666666666666666,
 0.0,
 1.0,
 1.0,
 1.0,
 0.8888888888888888,
 0.3333333333333333,
 0.6666666666666666,
 0.0,
 1.0,
 0.6666666666666666,
 1.0,
 0.75,
 0.6666666666666666,
 0.0,
 1.0,
 0.6666666666666666,
 1.0,
 1.0,
 0.7,
 0.7692307692307693,
 0.3333333333333333,
 0.0,
 0.875]

In [80]:
def text_to_vector(text):
    """Build a bag-of-words ``Counter`` from ``text``.

    Tokens are the non-overlapping matches of the module-level ``WORD``
    pattern; the returned Counter maps each token to its number of matches.
    """
    return collections.Counter(WORD.findall(text))


def get_cosine(vec1, vec2):
    """Cosine similarity of two sparse vectors stored as key -> weight mappings.

    The dot product runs over the keys present in both mappings and is
    divided by the product of the two Euclidean norms. Returns 0.0 when that
    product is zero (for example when either mapping is empty).
    """
    shared_keys = set(vec1.keys()).intersection(set(vec2.keys()))
    dot_product = sum([vec1[key] * vec2[key] for key in shared_keys])

    squares1 = sum([vec1[key]**2 for key in vec1.keys()])
    squares2 = sum([vec2[key]**2 for key in vec2.keys()])
    norm_product = math.sqrt(squares1) * math.sqrt(squares2)

    if norm_product:
        return float(dot_product) / norm_product
    return 0.0


def centroidSimilarity(sentences,tfIsfScore) :
    """Cosine similarity of every sentence to the highest-scoring sentence.

    The "centroid" is the sentence at the index of the first maximum in
    ``tfIsfScore``. Each sentence, including the centroid itself, is compared
    with it using ``text_to_vector`` and ``get_cosine``.

    Parameters
    ----------
    sentences : list of str
        Raw sentence strings.
    tfIsfScore : list of numbers
        One score per sentence, aligned with ``sentences``.

    Returns
    -------
    list of float
        One similarity per sentence; an empty list when ``sentences`` is empty.
    """
    if not sentences :
        return []
    centroidIndex = tfIsfScore.index(max(tfIsfScore))
    # The centroid vector does not change between iterations, so build it once.
    centroid_vector = text_to_vector(sentences[centroidIndex])
    return [get_cosine(centroid_vector, text_to_vector(sentence)) for sentence in sentences]


def is_number(s):
    """Return True if ``s`` can be parsed by ``float()`` as a finite number.

    ``float()`` also accepts the words "nan", "inf" and "infinity" (in any
    case, with an optional sign); those are ordinary words in running text,
    so they are rejected here. Values that ``float()`` cannot handle at all,
    such as ``None``, yield False instead of raising.
    """
    try:
        return math.isfinite(float(s))
    except (TypeError, ValueError, OverflowError):
        return False


def numericToken(tokenized_sentences):
    """Fraction of tokens in each sentence that ``is_number`` accepts.

    Parameters
    ----------
    tokenized_sentences : list of list of str
        Token lists, one per sentence.

    Returns
    -------
    list of float
        One ratio per sentence. A sentence with no tokens scores 0.0.
    """
    scores = []
    for sentence in tokenized_sentences :
        if not sentence :
            # Avoid dividing by zero for a sentence without tokens.
            scores.append(0.0)
            continue
        numeric_count = 0
        for word in sentence :
            if is_number(word) :
                numeric_count += 1
        scores.append(numeric_count / float(len(sentence)))
    return scores


def namedEntityRecog(sentences) :
    """Return the result of ``entity2.ner`` for every sentence, in input order.

    NOTE(review): ``entity2`` is not imported or defined in the visible part
    of this notebook, so a call with a non-empty list presumably raises
    NameError unless it is provided elsewhere -- confirm.
    """
    return [entity2.ner(sentence) for sentence in sentences]


def sentencePos(sentences) :
    """Position-based score for each sentence.

    The first and the last sentence get 0. Every other sentence ``i`` gets
    ``cos((i - minv) * ((1 / maxv) - minv))`` with ``minv = 0.2 * n`` and
    ``maxv = 0.4 * n`` for ``n`` sentences.

    Parameters
    ----------
    sentences : sequence
        Only its length is used.

    Returns
    -------
    list
        One score per sentence; empty for empty input.
    """
    th = 0.2
    total = len(sentences)
    minv = th*total
    maxv = th*2*total
    # range() never yields `total`, so comparing the index against it could
    # never match; the last valid index is total - 1.
    last_index = total - 1
    pos = []
    for i in range(total):
        if i == 0 or i == last_index:
            pos.append(0)
        else:
            pos.append(math.cos((i-minv)*((1/maxv)-minv)))

    return pos


def sentenceLength(tokenized_sentences) :
    """Length score of each sentence, normalised by the longest sentence.

    Sentences with fewer than 3 tokens score 0. Every other sentence scores
    ``number_of_tokens / maxLength``, where ``maxLength`` is the token count
    of the longest sentence, so scores fall in [0, 1].

    The previous implementation divided by ``sys.maxsize`` (with ``sys`` never
    imported), which raised NameError and would otherwise have produced
    values indistinguishable from zero.

    Parameters
    ----------
    tokenized_sentences : list of list of str
        Token lists, one per sentence.

    Returns
    -------
    list of float
        One score per sentence; all 0.0 when no sentence has any token.
    """
    lengths = [len(sentence) for sentence in tokenized_sentences]
    maxLength = max(lengths) if lengths else 0
    if maxLength == 0 :
        return [0.0 for _ in lengths]
    # Very short sentences are scored as if they had no words at all.
    return [1.0*(num_words if num_words >= 3 else 0)/maxLength for num_words in lengths]

def thematicFeature(tokenized_sentences) :
    """Score sentences by how many of the most frequent words they contain.

    Every token is reduced to its alphanumeric characters. The 10 most common
    reduced tokens across all sentences are the "thematic words" (they are
    printed). A sentence's score is the number of its tokens that are
    thematic words, divided by the number of distinct reduced tokens in the
    whole input.

    Parameters
    ----------
    tokenized_sentences : list of list of str
        Token lists, one per sentence.

    Returns
    -------
    list of float
        One score per sentence; 0.0 for every sentence when the input
        contains no tokens at all.
    """
    def _alnum_only(word) :
        # Keep only the alphanumeric characters of a token.
        return ''.join(ch for ch in word if ch.isalnum())

    word_list = []
    for sentence in tokenized_sentences :
        for word in sentence :
            try:
                word_list.append(_alnum_only(word))
            except Exception:
                # Best effort: report and skip tokens that cannot be cleaned.
                print("ERR")
    # `collections` is the module imported by the notebook; a bare `Counter`
    # name is never imported.
    counts = collections.Counter(word_list)
    number_of_words = len(counts)
    thematic_words = [word for word, _count in counts.most_common(10)]
    print(thematic_words)

    scores = []
    for sentence in tokenized_sentences :
        score = 0
        for word in sentence :
            try:
                if _alnum_only(word) in thematic_words :
                    score = score + 1
            except Exception:
                print("ERR")
        # With no words at all there is nothing to normalise by.
        scores.append(1.0*score/number_of_words if number_of_words else 0.0)
    return scores

def upperCaseFeature(sentences) :
    """Fraction of tokens in each sentence that start with an ASCII capital.

    Tokens come from ``remove_stop_words_without_lower(sentences)``.
    NOTE(review): that helper is not defined in the visible part of this
    notebook; presumably it mirrors ``remove_stop_words`` without the
    lower-casing step -- confirm it exists before calling this function.

    Returns
    -------
    list of float
        One ratio per token list returned by the helper. A token list with
        no tokens scores 0.0.
    """
    tokenized_sentences2 = remove_stop_words_without_lower(sentences)
    upper_case = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    scores = []
    for sentence in tokenized_sentences2 :
        if not sentence :
            # Avoid dividing by zero for a sentence without tokens.
            scores.append(0.0)
            continue
        score = 0
        for word in sentence :
            # `word and` guards against an empty token, whose [0] would raise.
            if word and word[0] in upper_case :
                score = score + 1
        scores.append(1.0*score/len(sentence))
    return scores

def cuePhraseFeature(sentences) :
    # Placeholder: no cue-phrase scoring is implemented. The function ignores
    # `sentences` and implicitly returns None.
    pass

def sentencePosition(paragraphs):
    """Mark the first and last sentence of every paragraph.

    Each paragraph is split with ``split_into_sentences``. Its first and last
    sentences score 1.0 and all sentences in between score 0.0. The number of
    sentences found in each paragraph is printed.

    Parameters
    ----------
    paragraphs : iterable of str
        Paragraph texts.

    Returns
    -------
    list of float
        The scores of all paragraphs concatenated in order, exactly one score
        per sentence. A paragraph that yields no sentences contributes no
        scores (it previously contributed two, which shifted the scores of
        every following sentence).
    """
    scores = []
    for para in paragraphs:
        sentence_count = len(split_into_sentences(para))
        print(sentence_count)
        if sentence_count == 0:
            continue
        scores.append(1.0)
        # Multiplying a list by a non-positive number gives [], so paragraphs
        # with one or two sentences get no middle scores.
        scores.extend([0.0] * (sentence_count - 2))
        if sentence_count > 1:
            scores.append(1.0)
    return scores

In [83]:
class RBM:
    """Restricted Boltzmann Machine with stochastic binary units.

    Bias units are folded into ``self.weights`` as an extra first row and
    first column, so the matrix has shape
    ``(num_visible + 1, num_hidden + 1)``. All sampling uses NumPy's global
    random state (``np.random``); seed it beforehand for reproducible runs.
    """

    def __init__(self, num_visible, num_hidden, learning_rate = 0.1):
        """Create the machine and randomly initialise its weights.

        Parameters
        ----------
        num_visible: Number of visible units (input features).
        num_hidden: Number of hidden units.
        learning_rate: Step size applied to every weight update in `train`.
        """
        self.num_hidden = num_hidden
        self.num_visible = num_visible
        self.learning_rate = learning_rate

        # Initialize a weight matrix, of dimensions (num_visible x num_hidden), using
        # a Gaussian distribution with mean 0 and standard deviation 0.1.
        self.weights = 0.1 * np.random.randn(self.num_visible, self.num_hidden)
        # Insert weights for the bias units into the first row and first column.
        self.weights = np.insert(self.weights, 0, 0, axis = 0)
        self.weights = np.insert(self.weights, 0, 0, axis = 1)

    def train(self, data, max_epochs = 1000, verbose = True):
        """
        Train the machine, updating `self.weights` in place.

        Every epoch uses the full data matrix and performs one positive phase
        and one reconstruction (negative) phase.

        Parameters
        ----------
        data: A matrix where each row is a training example consisting of the states of visible units.
        max_epochs: Number of passes over the data.
        verbose: When True (the default), print the squared reconstruction error after every epoch.
        """

        num_examples = data.shape[0]

        # Insert bias units of 1 into the first column.
        data = np.insert(data, 0, 1, axis = 1)

        for epoch in range(max_epochs):
            # Clamp to the data and sample from the hidden units.
            # (This is the "positive CD phase", aka the reality phase.)
            pos_hidden_activations = np.dot(data, self.weights)
            pos_hidden_probs = self._logistic(pos_hidden_activations)
            # NOTE(review): the hidden bias unit (column 0) is sampled here like
            # any other unit rather than clamped to 1 -- confirm this is intended.
            pos_hidden_states = pos_hidden_probs > np.random.rand(num_examples, self.num_hidden + 1)
            # Note that we're using the activation *probabilities* of the hidden states, not the hidden states
            # themselves, when computing associations. We could also use the states; see section 3 of Hinton's
            # "A Practical Guide to Training Restricted Boltzmann Machines" for more.
            pos_associations = np.dot(data.T, pos_hidden_probs)

            # Reconstruct the visible units and sample again from the hidden units.
            # (This is the "negative CD phase", aka the daydreaming phase.)
            neg_visible_activations = np.dot(pos_hidden_states, self.weights.T)
            neg_visible_probs = self._logistic(neg_visible_activations)
            neg_visible_probs[:,0] = 1 # Fix the bias unit.
            neg_hidden_activations = np.dot(neg_visible_probs, self.weights)
            neg_hidden_probs = self._logistic(neg_hidden_activations)
            # Note, again, that we're using the activation *probabilities* when computing associations, not the states
            # themselves.
            neg_associations = np.dot(neg_visible_probs.T, neg_hidden_probs)

            # Update weights.
            self.weights += self.learning_rate * ((pos_associations - neg_associations) / num_examples)

            error = np.sum((data - neg_visible_probs) ** 2)
            if verbose:
                print("Epoch %s: error is %s" % (epoch, error))

    def run_visible(self, data):
        """
        Assuming the RBM has been trained (so that weights for the network have been learned),
        run the network on a set of visible units, to get a sample of the hidden units.

        Parameters
        ----------
        data: A matrix where each row consists of the states of the visible units.

        Returns
        -------
        hidden_states: A matrix where each row consists of the hidden units activated from the visible
        units in the data matrix passed in.
        """

        num_examples = data.shape[0]

        # Create a matrix, where each row is to be the hidden units (plus a bias unit)
        # sampled from a training example.
        hidden_states = np.ones((num_examples, self.num_hidden + 1))

        # Insert bias units of 1 into the first column of data.
        data = np.insert(data, 0, 1, axis = 1)

        # Calculate the activations of the hidden units.
        hidden_activations = np.dot(data, self.weights)
        # Calculate the probabilities of turning the hidden units on.
        hidden_probs = self._logistic(hidden_activations)
        # Turn the hidden units on with their specified probabilities.
        hidden_states[:,:] = hidden_probs > np.random.rand(num_examples, self.num_hidden + 1)

        # Ignore the bias units; the sampled bias column is dropped, so it
        # does not need to be fixed to 1 first.
        hidden_states = hidden_states[:,1:]
        return hidden_states

    # TODO: Remove the code duplication between this method and `run_visible`?
    def run_hidden(self, data):
        """
        Assuming the RBM has been trained (so that weights for the network have been learned),
        run the network on a set of hidden units, to get a sample of the visible units.

        Parameters
        ----------
        data: A matrix where each row consists of the states of the hidden units.

        Returns
        -------
        visible_states: A matrix where each row consists of the visible units activated from the hidden
        units in the data matrix passed in.
        """

        num_examples = data.shape[0]

        # Create a matrix, where each row is to be the visible units (plus a bias unit)
        # sampled from a training example.
        visible_states = np.ones((num_examples, self.num_visible + 1))

        # Insert bias units of 1 into the first column of data.
        data = np.insert(data, 0, 1, axis = 1)

        # Calculate the activations of the visible units.
        visible_activations = np.dot(data, self.weights.T)
        # Calculate the probabilities of turning the visible units on.
        visible_probs = self._logistic(visible_activations)
        # Turn the visible units on with their specified probabilities.
        visible_states[:,:] = visible_probs > np.random.rand(num_examples, self.num_visible + 1)

        # Ignore the bias units; the sampled bias column is dropped, so it
        # does not need to be fixed to 1 first.
        visible_states = visible_states[:,1:]
        return visible_states

    def daydream(self, num_samples):
        """
        Randomly initialize the visible units once, and start running alternating Gibbs sampling steps
        (where each step consists of updating all the hidden units, and then updating all of the visible units),
        taking a sample of the visible units at each step.
        Note that we only initialize the network *once*, so these samples are correlated.

        Parameters
        ----------
        num_samples: Number of rows (samples of the visible units) to produce.

        Returns
        -------
        samples: A matrix, where each row is a sample of the visible units produced while the network was
        daydreaming.
        """

        # Create a matrix, where each row is to be a sample of of the visible units
        # (with an extra bias unit), initialized to all ones.
        samples = np.ones((num_samples, self.num_visible + 1))

        # Take the first sample from a uniform distribution.
        samples[0,1:] = np.random.rand(self.num_visible)

        # Start the alternating Gibbs sampling. The hidden units and, from the
        # second row on, the visible units are sampled as binary states; only
        # the first row holds real-valued (uniform random) visible values.
        for i in range(1, num_samples):
            visible = samples[i-1,:]

            # Calculate the activations of the hidden units.
            hidden_activations = np.dot(visible, self.weights)
            # Calculate the probabilities of turning the hidden units on.
            hidden_probs = self._logistic(hidden_activations)
            # Turn the hidden units on with their specified probabilities.
            hidden_states = hidden_probs > np.random.rand(self.num_hidden + 1)
            # Always fix the bias unit to 1.
            hidden_states[0] = 1

            # Recalculate the probabilities that the visible units are on.
            visible_activations = np.dot(hidden_states, self.weights.T)
            visible_probs = self._logistic(visible_activations)
            visible_states = visible_probs > np.random.rand(self.num_visible + 1)
            samples[i,:] = visible_states
            # Keep the visible bias unit clamped to 1. Without this the bias
            # column is overwritten by a random sample, and the next step
            # would drop the hidden-unit biases whenever that sample is 0.
            samples[i,0] = 1

        # Ignore the bias units (the first column), since they're always set to 1.
        return samples[:,1:]

    def _logistic(self, x):
        """Element-wise logistic sigmoid, 1 / (1 + exp(-x))."""
        return 1.0 / (1 + np.exp(-x))

In [90]:
# Compute the five per-sentence features for the article.
tfIsfScore = tfIsf(tokennized_setences)
similarityScore = similarityScores(tokennized_setences)
properNounScore = properNounScores(tagged)
centroidSimilarityScore = centroidSimilarity(sentences, tfIsfScore)
numericTokenScore = numericToken(tokennized_setences)

# Feature-major layout: one inner list per feature, one entry per sentence.
featureMatrix = [
    tfIsfScore,
    similarityScore,
    properNounScore,
    centroidSimilarityScore,
    numericTokenScore,
]

In [91]:
# Show the feature-major matrix (one inner list per feature).
featureMatrix

[[0.4969813299576001,
  0.5337208535531195,
  1.3143738430069454,
  1.1666847547242483,
  0.2772588722239781,
  0.9985774245179969,
  2.772588722239781,
  1.0789817827253791,
  1.354025100551105,
  0.39895764523183713,
  0.0,
  2.772588722239781,
  0.5364793041447001,
  2.772588722239781,
  1.4999365567767549,
  0.9613868326322953,
  0.0,
  2.772588722239781,
  1.0729586082894003,
  2.772588722239781,
  1.4229202033812056,
  0.9613868326322953,
  0.0,
  2.772588722239781,
  0.9985774245179969,
  2.772588722239781,
  1.552159182248503,
  1.1337324605540517,
  0.9438267466893243,
  0.0,
  2.772588722239781,
  0.9985774245179969,
  2.772588722239781,
  1.4751428288529536,
  1.1337324605540517,
  0.9438267466893243,
  0.0,
  2.772588722239781,
  0.5364793041447001,
  2.772588722239781,
  1.3996054897495935,
  0.26823965207235007,
  2.0794415416798357,
  1.3143738430069454,
  1.1945063128187032,
  0.9985774245179969,
  2.772588722239781,
  0.9939626599152002,
  0.26163056782016575,
  0.0,
 

In [94]:
# Transpose the feature-major lists into a (sentence, feature) array, which
# is the row-per-example layout passed to RBM.train below.
num_sentences = len(sentences)
featureMat = np.zeros((num_sentences, 5))
for feature_idx in range(5) :
    feature_column = featureMatrix[feature_idx]
    for sentence_idx in range(num_sentences) :
        featureMat[sentence_idx, feature_idx] = feature_column[sentence_idx]

In [95]:
# Show the (sentence, feature) array.
featureMat

array([[0.49698133, 0.14586057, 0.7       , 0.        , 0.        ],
       [0.53372085, 1.17825337, 0.38095238, 0.16666667, 0.        ],
       [1.31437384, 3.29090909, 1.        , 0.4472136 , 0.        ],
       [1.16668475, 2.36392496, 0.8       , 0.        , 0.1       ],
       [0.27725887, 0.1625    , 0.6       , 0.        , 0.        ],
       [0.99857742, 0.38181818, 1.        , 0.        , 0.        ],
       [2.77258872, 1.05      , 1.        , 1.        , 0.        ],
       [1.07898178, 1.57945388, 0.83333333, 0.        , 0.16666667],
       [1.3540251 , 1.495671  , 0.5       , 0.        , 0.        ],
       [0.39895765, 0.53308824, 0.5       , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.5       ],
       [2.77258872, 1.05      , 1.        , 1.        , 0.        ],
       [0.5364793 , 0.8       , 0.33333333, 0.        , 0.        ],
       [2.77258872, 1.05      , 1.        , 1.        , 0.        ],
       [1.49993656, 3.28291512, 0.

In [96]:
# The RBM draws its initial weights and all of its samples from NumPy's
# global random state; seed it so this cell gives the same result on every run.
np.random.seed(0)

training_Data = featureMat
# One visible unit per feature column (5 for the matrix built above).
r = RBM(num_visible=training_Data.shape[1], num_hidden=5)
r.train(training_Data, max_epochs=100)
print(r.weights)
# Sample the hidden units for a single hand-written feature vector.
user = np.array([[0,0.4,0,0.5,1.2]])
print(r.run_visible(user))

Epoch 0: error is 216.2656191972279
Epoch 1: error is 210.5256521304998
Epoch 2: error is 205.1944332832461
Epoch 3: error is 195.98423716920763
Epoch 4: error is 190.76802119120072
Epoch 5: error is 184.0454822866675
Epoch 6: error is 178.13156804797518
Epoch 7: error is 170.75446445343516
Epoch 8: error is 164.04006391561194
Epoch 9: error is 161.4733476739687
Epoch 10: error is 157.40799940713092
Epoch 11: error is 155.72104526814033
Epoch 12: error is 153.7201217128968
Epoch 13: error is 151.65696236253737
Epoch 14: error is 151.67527459724266
Epoch 15: error is 150.79587131031934
Epoch 16: error is 149.19224642067948
Epoch 17: error is 149.75636955537163
Epoch 18: error is 147.69507331381834
Epoch 19: error is 145.95648226401747
Epoch 20: error is 149.14050843638893
Epoch 21: error is 150.7016142788873
Epoch 22: error is 149.74546627008647
Epoch 23: error is 147.7510712447518
Epoch 24: error is 145.8013676288976
Epoch 25: error is 148.9771261924318
Epoch 26: error is 148.477655532

In [98]:
# Show the feature-major matrix again (it holds the same lists that were copied into featureMat).
featureMatrix

[[0.4969813299576001,
  0.5337208535531195,
  1.3143738430069454,
  1.1666847547242483,
  0.2772588722239781,
  0.9985774245179969,
  2.772588722239781,
  1.0789817827253791,
  1.354025100551105,
  0.39895764523183713,
  0.0,
  2.772588722239781,
  0.5364793041447001,
  2.772588722239781,
  1.4999365567767549,
  0.9613868326322953,
  0.0,
  2.772588722239781,
  1.0729586082894003,
  2.772588722239781,
  1.4229202033812056,
  0.9613868326322953,
  0.0,
  2.772588722239781,
  0.9985774245179969,
  2.772588722239781,
  1.552159182248503,
  1.1337324605540517,
  0.9438267466893243,
  0.0,
  2.772588722239781,
  0.9985774245179969,
  2.772588722239781,
  1.4751428288529536,
  1.1337324605540517,
  0.9438267466893243,
  0.0,
  2.772588722239781,
  0.5364793041447001,
  2.772588722239781,
  1.3996054897495935,
  0.26823965207235007,
  2.0794415416798357,
  1.3143738430069454,
  1.1945063128187032,
  0.9985774245179969,
  2.772588722239781,
  0.9939626599152002,
  0.26163056782016575,
  0.0,
 

In [99]:
# Show the trained RBM object; the class defines no __repr__, so only the default object repr is displayed.
r

<__main__.RBM at 0x7f95d8f88bb0>