# Text 1: Vector space models
**Internet Analytics - Lab 4**

---

**Group:** *W*

**Names:**

* *Cloux Olivier*
* *Reiss Saskia*
* *Urien Thibault*

---

#### Instructions

*This is a template for part 1 of the lab. Clearly write your answers, comments and interpretations in Markdown cells. Don't forget that you can add $\LaTeX$ equations in these cells. Feel free to add or remove any cell.*

*Please properly comment your code. Code readability will be considered for grading. To avoid long cells of code in the notebook, you can also embed long Python functions and classes in a separate module. Don’t forget to hand in your module if that is the case. In multiple exercises, you are required to come up with your own method to solve various problems. Be creative and clearly motivate and explain your methods. Creativity and clarity will be considered for grading.*

In [None]:
import pickle
import numpy as np
import string
import re
import nltk
import time

from scipy.sparse import csr_matrix
from utils import load_json, load_pkl

from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer



In [None]:
# Shared NLTK normalisers (Porter stemmer and WordNet lemmatizer), built once.
# NOTE(review): neither object is referenced in the cells visible here; presumably
# they are meant for the stemming / lemmatisation steps that are still stubs - confirm.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
# Raw course records; later cells read the 'courseId', 'name' and 'description'
# keys of each entry.
courses = load_json('data/courses.txt') 
# Words to discard during pre-processing (used for membership tests in removeStopWords).
stopwords = load_pkl('data/stopwords.pkl')

## Exercise 4.1: Pre-processing

In [None]:
def removeStopWords(listWords, stopWords=None):
    """
    Filters out empty strings and stopwords from a list of words.

    Keyword arguments:
    listWords -- iterable of words (strings)
    stopWords -- optional collection of words to discard; when omitted, the
                 module-level `stopwords` collection is used

    Returns a new list with the surviving words, in their original order.
    """
    if stopWords is None:
        # Default to the notebook-wide collection. Only `None` triggers the
        # fallback, so an explicitly empty collection is honoured.
        stopWords = stopwords
    return [word for word in listWords if len(word) > 0 and word not in stopWords]

def toWordList(description):
    """
    Lowercases a description and cuts it into words on single space characters.

    No other cleaning happens here: punctuation removal and the splitting of
    bonded words ('wordAnotherword') are done by removePunctuation and
    splitAppendedWords, which cleaner applies before calling this function.
    Consecutive spaces produce empty strings in the result, and whitespace
    other than the plain space is not treated as a separator.

    Keyword arguments:
    description -- A unique, long string

    Returns the list of lowercase tokens.
    """
    lowered = description.lower()
    tokens = lowered.split(" ")
    return tokens

def takeOutNumbers(listWords):
    """
    Removes the words that are pure numbers or hour-like tokens (digits only
    separated by an 'h', ex: '2h', '14h30').

    Words that merely contain digits (ex: 3sat) are kept untouched.

    Keyword arguments:
    listWords -- iterable of words (strings)

    Returns a new list without the numeric words, order preserved.
    """
    # One or two digits, an 'h', then up to two digits.
    hourPattern = re.compile(r"\d{1,2}h\d{0,2}")
    # fullmatch: only whole-word hours are dropped, so a word that merely ends
    # in '<digits>h' is not truncated and no empty string is left in the result.
    return [word for word in listWords
            if not word.isdecimal() and hourPattern.fullmatch(word) is None]

def splitAppendedWords(descString):
    """
    Inserts a space between words that were bonded together by a missing
    newline (ex: 'wordAnotherword' becomes 'word Anotherword').

    A boundary is a position preceded by two lowercase letters and followed by
    an uppercase letter and two lowercase letters, so short mixed-case tokens
    (ex: 'iPhone') and acronyms are left alone.

    Keyword arguments:
    descString -- the description, still carrying its original letter case

    Returns the description with the bonded words separated.
    """
    # Zero-width lookarounds do not consume the surrounding letters, so two
    # boundaries that share letters ('theBigDog') are both detected.
    boundary = re.compile(r"(?<=[a-z][a-z])(?=[A-Z][a-z][a-z])")
    return boundary.sub(" ", descString)

# Characters that are always replaced by a space. Digits are deliberately not
# listed: numbers are dealt with separately, on whole words.
_PUNCT_CHARS = ",.!?+\n\t:;'%&\"#/()[]`\xa0"
_PUNCT_TABLE = str.maketrans(_PUNCT_CHARS, " " * len(_PUNCT_CHARS))
# A dash with a space on at least one side is punctuation, not part of a
# compound word such as 'k-means'.
_LOOSE_DASH = re.compile(r" +- *| *- +")


def removePunctuation(descString):
    """
    Replaces punctuation by spaces while keeping the dash of compound words.

    - A dash that has a space on at least one side is replaced, together with
      the surrounding spaces, by a single space, so the neighbouring words stay
      separated ('data - driven' becomes 'data driven').
    - A dash directly between two characters is kept ('k-means').
    - Every other punctuation character, newline, tab and non-breaking space is
      replaced by a space.

    Keyword arguments:
    descString -- the raw description string

    Returns the description without punctuation.
    """
    unDashed = _LOOSE_DASH.sub(" ", descString)
    return unDashed.translate(_PUNCT_TABLE)



def cleaner(oneCourse):
    """
    Turns the description of one course into its list of meaningful words.

    The 'description' text goes through, in order: removePunctuation,
    splitAppendedWords, toWordList, takeOutNumbers and removeStopWords.

    Keyword arguments:
    oneCourse -- a course record holding its text under the 'description' key

    Returns the list of cleaned words.
    """
    text = splitAppendedWords(removePunctuation(oneCourse['description']))
    tokens = takeOutNumbers(toWordList(text))
    return removeStopWords(tokens)

# def veryFrequent():
    
# def inFrequent():
    
# def stemming():

# def lemmatise():
    
# def ngram():

# def blackMagic():



In [None]:
# Maps each courseId to a tuple (unique index, course name, list of cleaned words).
# Only the first record seen for a given courseId is kept; repeats are skipped.
descDict = {}
index = 0  # next free unique index
for course in courses:
    courseKey = course['courseId']
    if courseKey in descDict:
        continue
    descDict[courseKey] = (index, course['name'], cleaner(course))
    index += 1


In [None]:
# Gives every distinct word a unique index, in order of first appearance.
wordIndex = {}
for _courseIndex, _courseName, courseWords in descDict.values():
    for word in courseWords:
        # setdefault leaves known words alone; a new word gets the next free index
        wordIndex.setdefault(word, len(wordIndex))
index = len(wordIndex)  # same final value as a running counter

# Reverse lookup of wordIndex: each index is mapped back to its word
indexWord = {position: word for word, position in wordIndex.items()}
assert len(indexWord) == len(wordIndex)

In [None]:
# Count matrix: one row per distinct word, one column per course.
bigBadassMatrix = np.zeros((len(wordIndex), len(descDict)))
for courseId, (column, _courseName, courseWords) in descDict.items():
    for word in courseWords:
        # one more occurrence of `word` in this course
        bigBadassMatrix[wordIndex[word], column] += 1

In [None]:
# Inspect the raw counts of the words of course COM-308.
ixCourseId = 'COM-308'
ixIndex = descDict[ixCourseId][0]  # column of the course in bigBadassMatrix
# Courses are columns (words are rows), so the course vector is a column slice;
# indexing the first axis with ixIndex would return the row of an unrelated word.
ixLine = bigBadassMatrix[:, ixIndex]
ixWordsFreq = dict() # for each word: (count in COM-308, total count over all courses)
for ixWord in descDict[ixCourseId][2]:
    wordRow = wordIndex[ixWord]
    freq = bigBadassMatrix[wordRow].sum()  # occurrences of the word over every course
    ixWordsFreq[ixWord] = (ixLine[wordRow], freq)

print(ixWordsFreq)

## Exercise 4.2: Term-document matrix

## Exercise 4.3: Document similarity search