In [None]:
import math
import pandas as pd
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import nltk, nltk.stem, nltk.corpus, nltk.tokenize # if missing downloads, please run downloads below (only need once)
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize

In [None]:
# nltk for data cleaning. Only need to be downloaded once
# Each call fetches one NLTK data package into the local nltk_data directory.

nltk.download('stopwords')  # stop-word corpus, read later through stopwords.words('english')
nltk.download('punkt')      # punkt tokenizer models
nltk.download('wordnet')    # WordNet corpus (presumably what WordNetLemmatizer reads -- confirm)
nltk.download('omw-1.4')    # omw-1.4 package

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Load the five per-person data sheets from CSV (to be switched to JSON later)

K_path, Y_path, N_path, H_path, Z_path = (
  "TFIDF-K.csv",
  "TFIDF-Y.csv",
  "TFIDF-N.csv",
  "TFIDF-H.csv",
  "TFIDF-Z.csv",
)

# One DataFrame per sheet, read with pandas' default CSV settings
K_df, Y_df, N_df, H_df, Z_df = (
  pd.read_csv(csv_path) for csv_path in (K_path, Y_path, N_path, H_path, Z_path)
)

Data pre-processing with stopword removal, lemmatization, and punctuation removal

In [None]:
# Single lemmatizer instance shared by the cleaning helpers below
lemmatizer = WordNetLemmatizer()

# Tokens to discard: English stop words plus every punctuation character
stop = set(stopwords.words('english')).union(string.punctuation)

def clean_text(text):
  """Lower-case, tokenize, filter and lemmatize one string.

  Tokens that appear in the module-level ``stop`` set are dropped; every
  remaining token is passed through ``lemmatizer.lemmatize``.

  Args:
    text: the string to clean (must support ``.lower()``).

  Returns:
    The surviving lemmatized tokens joined by single spaces.
  """
  kept_tokens = (tok for tok in word_tokenize(text.lower()) if tok not in stop)
  return " ".join(lemmatizer.lemmatize(tok) for tok in kept_tokens)

def filter_description(descriptions):
  """Clean every description and return the results in input order.

  Each item goes through ``clean_text`` (lower-casing, tokenizing, stop-word
  and punctuation removal, lemmatization). Delegating to that helper keeps the
  two cleaning paths identical instead of maintaining a copy of its body here.

  Args:
    descriptions: iterable of strings.

  Returns:
    A list with one cleaned string per input description.
  """
  return [clean_text(description) for description in descriptions]

In [None]:
cols = ["Option 1", "Option 2", "Option 3", "Option 4", "Option 5"]


def _prepare_frame(raw_df):
  """Drop incomplete rows, renumber the index from 0, and clean every cell."""
  # dropna() keeps the original row labels, which leaves gaps. tfidf_score
  # addresses rows with df.loc[i, ...] for i in range(len(df)), so the labels
  # must be exactly 0..len(df)-1 or rows are missed.
  complete = raw_df.dropna().reset_index(drop=True)
  # DataFrame.applymap is deprecated in favour of DataFrame.map since
  # pandas 2.1. Look the method up on the class so a column that happens to be
  # named "map" cannot shadow it, and fall back to applymap on older pandas.
  elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
  return elementwise(complete, clean_text)


def _option_texts(df):
  """Return the option cells of ``df`` as one flat list, row by row."""
  return [text for row in df[cols].values for text in row]


# NOTE: this rebinds the *_df names, so re-running the cell cleans the already
# cleaned frames again.
K_df = _prepare_frame(K_df)
Y_df = _prepare_frame(Y_df)
H_df = _prepare_frame(H_df)
Z_df = _prepare_frame(Z_df)
N_df = _prepare_frame(N_df)

# get flattened list of descriptions for each data sheet
K_descriptions = _option_texts(K_df)
Y_descriptions = _option_texts(Y_df)
H_descriptions = _option_texts(H_df)
Z_descriptions = _option_texts(Z_df)
N_descriptions = _option_texts(N_df)

# NOTE(review): the option texts were already cleaned by _prepare_frame;
# filter_description applies the same cleaning steps a second time. Kept as-is
# so the corpus is built exactly as before -- confirm whether this is intended.
K_cleaned = filter_description(K_descriptions)
Y_cleaned = filter_description(Y_descriptions)
H_cleaned = filter_description(H_descriptions)
Z_cleaned = filter_description(Z_descriptions)
N_cleaned = filter_description(N_descriptions)

# Full corpus: every option text from all five sheets
descriptions = K_cleaned + Y_cleaned + H_cleaned + Z_cleaned + N_cleaned

Calculate document frequency across the entire corpus

In [None]:
# Maps each term to the number of documents it occurs in; filled by calc_df.
doc_freq = {}

# calculate document frequency for terms based on corpus of descriptions
def calc_df(docs):
  """Add document-frequency counts for ``docs`` to the global ``doc_freq``.

  A term is counted once per document regardless of how often it occurs in
  that document. Counts accumulate across calls; clear ``doc_freq`` first to
  start from scratch.

  Args:
    docs: iterable of strings whose terms are separated by whitespace.

  Returns:
    None. ``doc_freq`` is updated in place.
  """
  for doc in docs:
    # split() without an argument yields no tokens for an empty document,
    # whereas split(' ') would yield [''] and register '' as a term.
    for term in set(doc.split()):
      doc_freq[term] = doc_freq.get(term, 0) + 1

# Fill doc_freq from the combined cleaned descriptions of all five data sheets
calc_df(descriptions)

Choose correct option by summing up tf-idf for query terms

In [None]:
# General function to compute tf-idf for per-person data (but using entire corpus)

def _tfidf_option_score(query_terms, option_text, n_docs):
  """Sum tf*idf over ``query_terms`` for a single option text."""
  option_tokens = option_text.split()
  score = 0
  for term in query_terms:
    # Count whole tokens only. A substring count would let "cat" match inside
    # "category", which disagrees with the token-based doc_freq table.
    freq = option_tokens.count(term)
    tf = 1 + math.log10(freq) if freq else 0
    # +1 smoothing on the document frequency; a term absent from doc_freq is
    # treated as document frequency 0, i.e. idf = log10(n_docs).
    idf = math.log10(n_docs / (doc_freq.get(term, 0) + 1))
    score += tf * idf
  return score


def tfidf_score(df, name):
  """Pick, for every row, the option with the highest summed tf-idf score.

  For each row the five option texts are scored against the row's query: the
  score of an option is the sum over query terms of tf * idf, where tf is
  1 + log10(token count in the option) and idf comes from the global
  ``doc_freq`` table and the size of the global ``descriptions`` corpus. The
  best-scoring option is compared with the row's "Correct Answer".

  Rows are addressed by position, so every row of ``df`` is scored even when
  its index has gaps (for example after ``dropna()``).

  Args:
    df: DataFrame with columns "Query", "Option 1".."Option 5" and
      "Correct Answer".
    name: label used in the printed report.

  Returns:
    Tuple ``(correct, total)``: number of rows whose chosen option equals the
    correct answer, and the number of rows in ``df``.

  Raises:
    KeyError: if any required column is missing from ``df``.
  """
  option_cols = ["Option 1", "Option 2", "Option 3", "Option 4", "Option 5"]
  missing = [col for col in option_cols + ["Query", "Correct Answer"] if col not in df.columns]
  if missing:
    # Fail loudly instead of silently skipping every row.
    raise KeyError("tfidf_score: missing column(s): {}".format(missing))

  # Positional view of the rows: labels are guaranteed to be 0..total-1 here.
  rows = df.reset_index(drop=True)
  n_docs = len(descriptions)

  results = []
  correct = 0
  total = len(rows)
  for i in range(total):
    options = [rows.loc[i, col] for col in option_cols]
    query = rows.loc[i, "Query"]
    answer = rows.loc[i, "Correct Answer"]

    query_terms = str(query).split()
    doc_scores = [_tfidf_option_score(query_terms, str(option), n_docs) for option in options]

    # choose option that has highest similarity as correct answer
    # Scores and options are shuffled together with a fixed seed before the
    # argmax -- presumably so ties are not always resolved in favour of the
    # first option.
    doc_scores, options = shuffle(doc_scores, options, random_state=0)
    ind = np.argmax(doc_scores)
    result = 0
    if options[ind] == answer:
      correct += 1
      result = 1
    results.append([result, "Query: " + str(query), "Recommended: " + str(options[ind])])

  print("Results for {}:".format(name))
  print("Total correct answers: {} out of {}".format(correct, total))
  print(results)

  return correct, total

In [None]:
# Individual results
# Each call prints its own report and returns (correct, total) for one
# person's data sheet; the label is only used in the printed header.

tfidf_score(K_df, "K")
tfidf_score(Y_df, "Y")
tfidf_score(H_df, "H")
tfidf_score(Z_df, "Z")
tfidf_score(N_df, "N")