In [1]:
import pandas as pd
import re
import string
import nltk
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

In [2]:
if __name__ == "__main__":  
    
    # Test Q1
    
    def extract(text):
        """Parse tab-separated stock quote rows into a DataFrame.

        Every non-blank line of ``text`` is one record made of tab-separated
        fields in this order: symbol, name, last price, quote time, change,
        percent change, volume, market cap.  The quote time (5th column of the
        raw table, field index 3) is dropped.  A trailing tab at the end of a
        line is accepted but not required.

        Parameters
        ----------
        text : str
            Raw multi-line quote table.

        Returns
        -------
        pandas.DataFrame
            One row per record, all values kept as strings, with the columns
            ``Symbol``, ``Name``, ``Last Price``, ``Change``, ``% Change``,
            ``Volume`` and ``Market Cap``.  Empty input yields an empty frame
            that still carries these columns.

        Raises
        ------
        ValueError
            If a non-blank line has fewer than eight tab-separated fields.
        """
        columns = ['Symbol', 'Name', 'Last Price', 'Change', '% Change', 'Volume', 'Market Cap']
        # Positions of the wanted fields inside one raw row (index 3 is the quote time).
        wanted = (0, 1, 2, 4, 5, 6, 7)

        rows = []
        for line_no, raw_line in enumerate(text.split('\n'), start=1):
            line = raw_line.rstrip('\r')
            if not line.strip():
                # Blank lines (e.g. a leading/trailing newline in the literal) carry no record.
                continue
            fields = line.split('\t')
            if len(fields) < 8:
                # Parsing row by row lets us report a malformed record instead of
                # silently shifting every following value into the wrong column.
                raise ValueError(
                    f"line {line_no}: expected at least 8 tab-separated fields, got {len(fields)}"
                )
            rows.append([fields[i] for i in wanted])

        return pd.DataFrame(rows, columns=columns)

    
    # Sample input for extract(): six quote rows, each made of eight tab-separated
    # fields and terminated by a trailing tab before the newline.
    text='''STNE	StoneCo Ltd.	13.57	12:04PM EDT	+3.97	+41.26%	43.307M	4.534B	
GME	GameStop Corp.	89.82	12:04PM EDT	+2.12	+2.42%	4.849M	7.248B	
NVDA	NVIDIA Corporation	259.93	12:04PM EDT	+12.27	+4.95%	36.552M	647.746B	
DIDI	DiDi Global Inc.	3.8589	12:04PM EDT	+1.2989	+50.74%	156.157M	18.244B	
FDX	FedEx Corporation	215.76	12:04PM EDT	-12.22	-5.36%	5.036M	57.17B	
PIK	Kidpik Corp.	7.19	12:04PM EDT	+2.44	+51.30%	83.298M	54.746M	'''

    
    # Print a section banner, then the DataFrame parsed from the sample rows.
    print("\n==================\n")
    print("Test Q1")
    print(extract(text))
    
    def tokenize(doc, lemmatized=True, remove_stopword=True):
        """Split a document into cleaned, lower-cased tokens.

        Steps, in order:
          1. tokenize with ``nltk.word_tokenize``;
          2. optionally lemmatize each token with WordNet, using its POS tag;
          3. optionally drop NLTK English stop words (case-insensitive match);
          4. lower-case, strip leading/trailing punctuation and discard tokens
             that become empty.

        Parameters
        ----------
        doc : str
            Text to tokenize.
        lemmatized : bool, default True
            Lemmatize tokens when truthy.
        remove_stopword : bool, default True
            Remove English stop words when truthy.

        Returns
        -------
        list of str
            The cleaned tokens in document order.
        """
        # Treebank tag prefix -> WordNet POS; every other tag is treated as a noun.
        wordnet_pos = {'J': 'a', 'V': 'v', 'R': 'r'}

        tokens = nltk.word_tokenize(doc)

        if lemmatized:
            # One lemmatizer for the whole document instead of one per token.
            lemmatizer = WordNetLemmatizer()
            tokens = [
                lemmatizer.lemmatize(word, wordnet_pos.get(tag[:1], 'n'))
                for word, tag in nltk.pos_tag(tokens)
            ]

        if remove_stopword:
            # A set gives O(1) membership tests; the list lookup was O(#stop words) per token.
            stop_words = set(stopwords.words('english'))
            tokens = [tok for tok in tokens if tok.lower() not in stop_words]

        # Punctuation is stripped after stop-word removal, so a token that only
        # becomes a stop word once stripped is kept.
        cleaned = []
        for tok in tokens:
            stripped = tok.lower().strip(string.punctuation)
            if stripped != '':
                cleaned.append(stripped)

        return cleaned
    
    def compute_tfidf(docs, lemmatized=True, remove_stopword=True):
        """Build an L2-normalized, smoothed TF-IDF matrix for ``docs``.

        Term frequency is the token count divided by the document's total token
        count.  Inverse document frequency uses the smoothed form
        ``log((N + 1) / (df + 1)) + 1`` where ``N`` is the number of documents.
        Rows are finally normalized with ``sklearn.preprocessing.normalize``.

        Parameters
        ----------
        docs : sequence of str
            Documents to vectorize (anything supporting ``len`` and iteration).
        lemmatized, remove_stopword : bool
            Forwarded to ``tokenize``.

        Returns
        -------
        numpy.ndarray
            Array with one row per document and one column per distinct token.
            A document that yields no tokens gets an all-zero row.
        """
        docs_dic = {idx: nltk.FreqDist(tokenize(doc, lemmatized, remove_stopword)) for idx, doc in enumerate(docs)}
        # Rows = documents, columns = tokens; tokens absent from a document count as 0.
        docs_freq = pd.DataFrame.from_dict(docs_dic).T.fillna(0)
        counts = docs_freq.to_numpy(dtype=float)
        if counts.size == 0:
            # No documents or no tokens at all: nothing to weight or normalize.
            return counts

        doc_len = counts.sum(axis=1, keepdims=True)
        # Guard the division: a document whose tokens were all filtered out has
        # length 0, and 0/0 would inject NaN that normalize() rejects.
        tf = np.divide(counts, doc_len, out=np.zeros_like(counts), where=doc_len > 0)

        # Document frequency: number of documents in which each token occurs.
        df = np.count_nonzero(counts, axis=0)
        idf = np.log(np.divide(len(docs) + 1, df + 1)) + 1

        return normalize(tf * idf)

    def Match(questions, answers, lemmatized = True, remove_stopword = True, top_K = 3):
        """Rank answers for every question by TF-IDF cosine similarity.

        Questions and answers are vectorized together (shared vocabulary and
        IDF) with ``compute_tfidf``; each question is then compared against all
        answers.

        Parameters
        ----------
        questions, answers : sequence of str
            Question and answer texts.
        lemmatized, remove_stopword : bool
            Forwarded to ``compute_tfidf``.
        top_K : int, default 3
            Number of answers to return per question.

        Returns
        -------
        list of numpy.ndarray
            One array per question holding the positions (into ``answers``) of
            the ``top_K`` most similar answers, most similar first.
        """
        # Materialize as lists so ``+`` concatenates; on a pandas Series it
        # would join the strings element-wise instead.
        questions = list(questions)
        answers = list(answers)
        n_questions = len(questions)

        if n_questions == 0:
            return []
        if not answers:
            return [np.array([], dtype=np.intp) for _ in range(n_questions)]

        total_tfidf = compute_tfidf(questions + answers, lemmatized, remove_stopword)
        Q_tfidf = total_tfidf[:n_questions]
        A_tfidf = total_tfidf[n_questions:n_questions + len(answers)]

        # One (questions x answers) similarity matrix. Comparing questions only
        # against answers avoids building an (answers+1)^2 matrix per question,
        # and avoids relying on the question's self-similarity sorting first,
        # which produced index -1 whenever an answer tied it at 1.0.
        similarity = 1 - pairwise_distances(Q_tfidf, A_tfidf, metric = 'cosine')

        return [row.argsort()[::-1][:top_K] for row in similarity]

    def calculate_hit_rate(question, answer, found_answers):
        """Compute the fraction of questions whose true answer was retrieved.

        For the question at row ``i`` the true answer is the first row of
        ``answer`` whose ``qid`` equals that question's ``qid``.  The question
        counts as a hit when the *position* of that row appears in
        ``found_answers[i]``.

        Parameters
        ----------
        question : pandas.DataFrame
            Questions; the ``qid`` column is used when present, otherwise the
            row position ``0..n-1`` is taken as the qid.
        answer : pandas.DataFrame
            Answers with a ``qid`` column.
        found_answers : sequence of array-like of int
            For each question, positional indices into ``answer`` (as returned
            by ``Match``).

        Returns
        -------
        float
            Mean hit indicator over all questions; ``nan`` when there are no
            questions.  A question without any matching answer row is a miss.
        """
        n_questions = len(question)
        if n_questions == 0:
            return np.nan

        # Look qids up from the question table rather than assuming that they
        # coincide with the row positions 0..n-1.
        if hasattr(question, "columns") and "qid" in question.columns:
            qids = question["qid"].tolist()
        else:
            qids = list(range(n_questions))

        answer_qids = answer["qid"].to_numpy()

        success = []
        for row, qid in enumerate(qids):
            # Positional (not label) indices, because found_answers holds
            # positions into the answer list.
            positions = np.flatnonzero(answer_qids == qid)
            hit = positions.size > 0 and bool(np.any(np.asarray(found_answers[row]) == positions[0]))
            success.append(1 if hit else 0)

        return np.mean(success)
    
    # Load the question and answer corpora shared by all Q2 tests.
    question = pd.read_csv("hw4_question.csv")
    answer = pd.read_csv("hw4_answer.csv")

    banner = "\n==================\n"
    # (printed label, lemmatized, remove_stopword) for each option combination exercised below.
    option_cases = (
        ("1.lemmatized=True, remove_stopword=True\n", True, True),
        ("\n2.lemmatized=False, remove_stopword=True\n", False, True),
        ("\n3.lemmatized=False, remove_stopword=False\n", False, False),
    )

    # Q2.1: tokenize the first answer under every option combination.
    print(banner)
    print("Test Q2.1 - Try different parameter values to make sure all options work\n")
    for label, use_lemma, drop_stop in option_cases:
        print(label, tokenize(answer["answer"].loc[0], lemmatized=use_lemma, remove_stopword=drop_stop))

    # Q2.2: report the TF-IDF matrix shape under every option combination.
    print(banner)
    print("Test Q2.2")
    for label, use_lemma, drop_stop in option_cases:
        print(label, compute_tfidf(answer["answer"], lemmatized=use_lemma, remove_stopword=drop_stop).shape)

    # Q2.3: show the three best-matching answers for the first question.
    print(banner)
    print("Test Q2.3 ")
    question_texts = question.question.tolist()
    answer_texts = answer.answer.tolist()
    top_index = Match(question_texts, answer_texts,
                      lemmatized=True, remove_stopword=True, top_K=3)

    first_question = question.iloc[0]
    print(f"{first_question.qid}: {first_question.question}")
    print("\nTop 3 answers:\n")
    for i in top_index[0]:
        matched = answer.iloc[i]
        print(f'{matched["qid"]}: {matched["answer"]}\n')

    # Q2.4: hit rate of the retrieval for several cut-offs.
    print(banner)
    print("Test Q2.4")
    for k in (1, 3, 5):
        top_index = Match(question_texts, answer_texts,
                          lemmatized=True, remove_stopword=True, top_K=k)
        hr = calculate_hit_rate(question, answer, top_index)
        print(f"Top-{k}: {hr:.4f}")

    print(banner)
    print("Test Q2.6")
    
    ## Since we cannot compute tf-idf vectors for new questions to measure their similarity to the answers we have,
    ## we instead manually extract the feature words that can represent the whole question

    def customize_stopwords(docs, stop_words_number, lemmatized=True, remove_stopword=True):
        """Return the corpus tokens with the lowest smoothed IDF.

        Tokens that occur in the most documents have the lowest IDF and are
        therefore the least discriminative; they are returned as corpus-specific
        stop words.

        Parameters
        ----------
        docs : sequence of str
            Corpus used to measure document frequency.
        stop_words_number : int
            How many tokens to return.
        lemmatized, remove_stopword : bool
            Forwarded to ``tokenize``.

        Returns
        -------
        list of str
            The ``stop_words_number`` tokens with the smallest IDF, lowest first.
        """
        docs_dic = {idx: nltk.FreqDist(tokenize(doc, lemmatized, remove_stopword)) for idx, doc in enumerate(docs)}
        # Rows = documents, columns = tokens; tokens absent from a document count as 0.
        docs_freq = pd.DataFrame.from_dict(docs_dic).T.fillna(0)

        # Document frequency straight from the counts. Going through a term
        # frequency matrix is unnecessary here and divided 0 by 0 for documents
        # that produced no tokens.
        df = np.count_nonzero(docs_freq.values, axis=0)
        idf = np.log(np.divide(len(docs)+1, df+1))+1

        lowest_idf = idf.argsort()[0:stop_words_number]
        stop_words = [docs_freq.columns[i] for i in lowest_idf]

        return stop_words

    ## Next we design a new function, matching the tokens in the new question to tokens in the answers
    ## The index of the best answer(s) along with the corresponding matching degree will be returned
    ## If the tokens of a question are [a, b, c], and an answer's tokens only include [a, b] 
    ## then the matching degree would be 2/3

    def Match_new(new_question, stop_words_number, match_number, questions, answers, lemmatized = True, remove_stopword = True):
        """Match a new question to the stored answers by keyword overlap.

        The question is tokenized and tokens listed by ``customize_stopwords``
        (computed over questions + answers) are removed; the remaining tokens
        are the keywords.  An answer's matching degree is the share of keywords
        (counted with repetition) that occur among the answer's tokens.

        Parameters
        ----------
        new_question : str
            Question to match.
        stop_words_number : int
            Number of corpus-specific stop words to remove from the question.
        match_number : int
            Number of best answers to return.
        questions, answers : sequence of str
            Existing corpus.
        lemmatized, remove_stopword : bool
            Forwarded to ``tokenize`` / ``customize_stopwords``.

        Returns
        -------
        dict
            Maps the position of each of the ``match_number`` best answers to
            its matching degree, best first.  When the question has no keywords
            left, every degree is 0.0.
        """
        # Materialize as lists so ``+`` concatenates; on a pandas Series it
        # would join the strings element-wise instead.
        questions = list(questions)
        answers = list(answers)

        new_stop = set(customize_stopwords(questions + answers, stop_words_number, lemmatized, remove_stopword))
        new_Q = tokenize(new_question, lemmatized, remove_stopword)
        keywords_Q = [tok for tok in new_Q if tok not in new_stop]

        # Sets make each keyword lookup O(1) instead of scanning the token list.
        answer_vocab = [set(tokenize(ans, lemmatized, remove_stopword)) for ans in answers]
        match_score = np.array(
            [sum(1 for kw in keywords_Q if kw in vocab) for vocab in answer_vocab],
            dtype=int,
        )
        matching_index = match_score.argsort()[::-1][0:match_number]

        if keywords_Q:
            matching_degree = np.divide(match_score, len(keywords_Q))
        else:
            # Nothing to match on: avoid 0/0 and report a zero degree.
            matching_degree = np.zeros(len(match_score))

        # Degrees are computed once above, not once per returned index.
        result = {idx: matching_degree[idx] for idx in matching_index}

        return result

    ## Test our function

    doc1 = 'What is multisystem inflammatory syndrome associated with COVID-19?'
    doc2 = '''When administering a third dose of an mRNA vaccine to 
              eligible individuals as part of the primary series, should 
              the same vaccine type as the initial two doses be used?'''

    questions = question.question.tolist()
    answers = answer.answer.tolist()

    # Keep the results so the best answers below are derived from them rather
    # than from indices copied out of an earlier run's output.
    result_doc1 = Match_new(doc1, 10, 5, questions, answers, lemmatized = True, remove_stopword = True)
    print('doc1:\n', result_doc1)
    result_doc2 = Match_new(doc2, 10, 5, questions, answers, lemmatized = True, remove_stopword = True)
    print('doc2:\n', result_doc2)

    ## As seen, the performance of the QA system is pretty awful;
    ## that's because we've abandoned the use of cosine similarity, developing the system in a non-mathematical way

    # max() returns the first key holding the highest matching degree, i.e. the
    # top-ranked answer, because the result dict is ordered best first.
    if result_doc1:
        best_doc1 = max(result_doc1, key=result_doc1.get)
        print('best answers for doc1:\n', answers[best_doc1], '\n')
    if result_doc2:
        best_doc2 = max(result_doc2, key=result_doc2.get)
        print('best answers for doc2:\n', answers[best_doc2])



Test Q1
  Symbol                Name Last Price   Change % Change    Volume Market Cap
0   STNE        StoneCo Ltd.      13.57    +3.97  +41.26%   43.307M     4.534B
1    GME      GameStop Corp.      89.82    +2.12   +2.42%    4.849M     7.248B
2   NVDA  NVIDIA Corporation     259.93   +12.27   +4.95%   36.552M   647.746B
3   DIDI    DiDi Global Inc.     3.8589  +1.2989  +50.74%  156.157M    18.244B
4    FDX   FedEx Corporation     215.76   -12.22   -5.36%    5.036M     57.17B
5    PIK        Kidpik Corp.       7.19    +2.44  +51.30%   83.298M    54.746M


Test Q2.1 - Try different parameter values to make sure all options work

1.lemmatized=True, remove_stopword=True
 ['practice', 'thing', 'washing', 'hand', 'soap', 'water', 'least', '20', 'second', 'avoid', 'people', 'sick', 'cleaning', 'disinfect', 'high', 'touch', 'surface', 'household', 'common', 'area', 'e.g', 'table', 'chair', 'doorknob', 'etc', 'launder', 'item', 'like', 'plush', 'toy']

2.lemmatized=False, remove_stopword=Tr