# Text 1: Vector space models
**Internet Analytics - Lab 4**

---

**Group:** *W*

**Names:**

* *Cloux Olivier*
* *Reiss Saskia*
* *Urien Thibault*

---

#### Instructions

*This is a template for part 1 of the lab. Clearly write your answers, comments and interpretations in Markdown cells. Don't forget that you can add $\LaTeX$ equations in these cells. Feel free to add or remove any cell.*

*Please properly comment your code. Code readability will be considered for grading. To avoid long cells of code in the notebook, you can also embed long Python functions and classes in a separate module. Don’t forget to hand in your module if that is the case. In multiple exercises, you are required to come up with your own method to solve various problems. Be creative and clearly motivate and explain your methods. Creativity and clarity will be considered for grading.*

In [1]:
import pickle as pk
import numpy as np
import string
import re
import nltk
import time

from scipy.sparse import csr_matrix
from utils import load_json, load_pkl

from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer

import lab04_helper

In [2]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [11]:
courses = load_json('data/courses.txt') 
stopwords = load_pkl('data/stopwords.pkl')

In [None]:
def pickleDump(filename, value):
    """
    Serialize `value` with pickle and write it to the file `filename`.

    The file is created or overwritten. It is opened in binary mode because
    pickle produces bytes, and the object to store is passed before the file
    handle, as `pickle.dump(obj, file)` requires.
    """
    with open(filename, "wb") as f:
        pk.dump(value, f)

## Exercise 4.1: Pre-processing

In [4]:
def removeStopWords(listWords):
    """
    Drop empty strings and stopwords from a list of words.

    Membership is tested against the module-level `stopwords` collection.
    The remaining words keep their relative order, duplicates included.
    """
    kept = []
    for word in listWords:
        # Skip blanks first so they are never looked up in the stopword list.
        if len(word) == 0 or word in stopwords:
            continue
        kept.append(word)
    return kept

def toWordList(description):
    """
    Lowercase a description string and cut it at every single space.

    Repeated words are kept, and consecutive spaces produce empty strings in
    the returned list.
    """
    lowered = description.lower()
    return lowered.split(" ")

def takeOutNumbers(listWords):
    """
    Erase hour-like suffixes from each word, then drop purely numeric words.

    A trailing "<1-2 digits>h<0-2 digits>" (e.g. "2h", "12h30") is removed
    from every word, which may leave an empty string in the result. Words made
    only of decimal digits are discarded. Words that mix letters and digits
    (ex: 3SAT) are kept.
    """
    hourSuffix = re.compile(r"\d{1,2}h\d{0,2}$")
    cleaned = []
    for word in listWords:
        withoutHours = hourSuffix.sub("", word)
        if withoutHours.isdecimal():
            continue  # numbers-only token
        cleaned.append(withoutHours)
    return cleaned

def splitAppendedWords(descString):
    """
    Takes a description, and splits appended words (because of a missing \\n).

    A space is inserted wherever two lowercase letters are directly followed
    by an uppercase letter and two more lowercase letters ("helloWorld" ->
    "hello World"). Short mixed-case tokens such as "aBc" are left untouched.

    The boundary is matched with zero-width lookarounds so that no characters
    are consumed: every boundary in a chain of glued words is split
    ("fooBarBaz" -> "foo Bar Baz"), whereas a consuming pattern skips every
    other one.
    """
    boundary = re.compile(r"(?<=[a-z][a-z])(?=[A-Z][a-z][a-z])")
    return boundary.sub(" ", descString)

def removePunctuation(descString):
    """
    Delete dashes that touch whitespace, then blank out punctuation.

    A dash with at least one space on either side is removed together with
    those spaces, which glues its neighbours together. A dash with no adjacent
    space (as in "well-known") is kept. Afterwards every character listed in
    `punct` is replaced by a single space.
    """
    # NOTE(review): the list includes the digit "0", so zeros are blanked as
    # well -- confirm this is intended.
    punct = ",.!?+\n\t:;0'%&\"#/()[]`\xa0\xad"
    toSpace = {ord(char): " " for char in punct}

    withoutDashes = re.sub(" +- *| *- +", "", descString)
    return withoutDashes.translate(toSpace)



def cleaner(oneCourse):
    """
    Turn one course record into its list of cleaned words.

    The text under the 'description' key goes through the whole pipeline, in
    this order: punctuation removal, splitting of glued words, lowercasing and
    splitting on spaces, number removal, and finally stopword removal.
    """
    text = splitAppendedWords(removePunctuation(oneCourse['description']))
    words = takeOutNumbers(toWordList(text))
    return removeStopWords(words)

# TODO
Explain why these pre-processing functions were chosen.

In [15]:
# Build two lookup tables from the raw course list:
#   descDict:    courseId -> (uniqueIndex, title, list of cleaned words)
#   indexCourse: uniqueIndex -> courseId
# Only the first record seen for a given courseId is kept.
descDict = {}
indexCourse = {}
index = 0
for course in courses:
    courseId = course['courseId']
    if courseId in descDict:
        continue
    descDict[courseId] = (index, course['name'], cleaner(course))
    indexCourse[index] = courseId
    index += 1

# Persist both tables as pickles (binary content despite the .txt suffix).
with open(r"cidWithBag.txt", "wb") as bagFile:
    pk.dump(descDict, bagFile)
with open(r"indexToCourse.txt", "wb") as courseFile:
    pk.dump(indexCourse, courseFile)

In [6]:
# Build two mirrored dictionaries over the vocabulary:
#   wordIndex: distinct word -> unique index, numbered in order of first appearance
#   indexWord: unique index -> word (the inverse mapping)
wordIndex = {}
index = 0
for entry in descDict.values():
    for word in entry[2]:
        if word in wordIndex:
            continue
        wordIndex[word] = index
        index += 1

indexWord = {position: word for word, position in wordIndex.items()}
assert len(indexWord) == len(wordIndex)

# Persist the index -> word table as a pickle.
with open(r"indexToWord", "wb") as wordFile:
    pk.dump(indexWord, wordFile)

In [22]:
# Build the term-document count matrix: one row per word index, one column per
# course index. A (row, col, 1) triplet is emitted for every word occurrence;
# csr_matrix sums triplets that share coordinates, giving per-course counts.
occValues = []
occRow = []  # indices of words
occCol = []  # indices of courses
for cid in descDict:
    cIndex = descDict[cid][0]
    for word in descDict[cid][2]:
        occCol.append(cIndex)
        occRow.append(wordIndex[word])
        occValues.append(1)

# int32 rather than int8: the duplicate triplets are summed in the matrix
# dtype, and an 8-bit counter would silently wrap past 127 occurrences.
occurenceMatrix = csr_matrix(
    (occValues, (occRow, occCol)),
    shape=(len(wordIndex), len(descDict)),
    dtype=np.int32,
)
print(occurenceMatrix.shape)

# NOTE(review): np.save stores the sparse matrix as a pickled object array, so
# reading it back needs np.load(..., allow_pickle=True). scipy.sparse.save_npz
# is the dedicated format, but it would change the file name readers expect.
np.save("occ_matrix", occurenceMatrix)

(15248, 854)


In [20]:
# "occ_matrix.npy" was written by np.save from a scipy sparse matrix, which
# NumPy stores as a pickled object array. Recent NumPy versions refuse to
# unpickle by default, so it must be enabled explicitly. Only do this for
# files produced locally: unpickling untrusted data can execute code.
file = np.load("occ_matrix.npy", allow_pickle=True)
# `file` is a 0-d object array wrapping the matrix; file.item() unwraps it.
print(file)

  (0, 32)	1
  (0, 141)	1
  (0, 155)	1
  (0, 161)	2
  (0, 207)	1
  (0, 363)	2
  (0, 386)	1
  (0, 423)	1
  (0, 481)	1
  (0, 562)	1
  (0, 571)	2
  (0, 714)	1
  (0, 760)	1
  (0, 779)	1
  (0, 817)	2
  (0, 835)	1
  (1, 9)	1
  (1, 10)	1
  (1, 12)	2
  (1, 15)	2
  (1, 16)	1
  (1, 17)	1
  (1, 19)	1
  (1, 20)	1
  (1, 21)	2
  :	:
  (15223, 708)	1
  (15224, 708)	1
  (15225, 708)	1
  (15226, 708)	1
  (15227, 708)	1
  (15228, 708)	1
  (15229, 708)	1
  (15230, 708)	1
  (15231, 547)	1
  (15232, 547)	1
  (15233, 547)	1
  (15234, 547)	1
  (15235, 547)	1
  (15236, 547)	1
  (15237, 547)	1
  (15238, 547)	1
  (15239, 547)	1
  (15240, 547)	1
  (15241, 547)	1
  (15242, 547)	1
  (15243, 547)	1
  (15244, 547)	1
  (15245, 547)	1
  (15246, 547)	1
  (15247, 547)	1


In [8]:
# ixIndex = descDict['COM-308'][0]
# ixLine = occurenceMatrix[ixIndex]
# ixWordsFreq = dict() #for each word, (TF, DF, score)
# for ixWord in descDict['COM-308'][2]:
#     wordRow = wordIndex[ixWord]
#     freq = occurenceMatrix[wordRow]
#     ixWordsFreq[ixWord] = (occurenceMatrix[wordRow,ixIndex], freq)
    
# print(ixWordsFreq)

## Exercise 4.2: Term-document matrix

## Exercise 4.3: Document similarity search