# Originality Methods

## Algorithm to Automate Originality Scoring

### Import Packages

In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import re

from nltk.stem import WordNetLemmatizer
import string

from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer

from nltk.cluster.kmeans import KMeansClusterer

import shared_functions as sf
from shared_functions import *

## Originality Algo 
### Count Vectorizer + Clustering

In [2]:
# calculate originality score for a phrase based on number of responses in a cluster
# calculate the rarity of a phrase
def get_clustered_originality_score(originality_rating_df, num_clusters, responses, display_clusters):
    """Score every processed response by how small its cluster is.

    The responses are clustered via ``sf.get_counts_vector``. Each phrase in a
    cluster receives the score ``1 - (cluster size / number of rows in
    originality_rating_df)`` and a rarity flag that is 1 when the cluster holds
    exactly one response and 0 otherwise.

    Args:
        originality_rating_df: DataFrame with a 'response_processed' column;
            its row count is the denominator of the score.
        num_clusters: forwarded unchanged to ``sf.get_counts_vector``.
        responses: forwarded unchanged to ``sf.get_counts_vector``.
        display_clusters: forwarded unchanged to ``sf.get_counts_vector``.

    Returns:
        Tuple ``(scores, rarity_flags)`` of two lists, each parallel to the
        'response_processed' column of ``originality_rating_df``.

    Raises:
        KeyError: if a phrase in 'response_processed' is not a member of any
            cluster returned by ``sf.get_counts_vector``.
    """
    # the cluster table exposes `category` and `responses`; pair them up so each
    # category maps to its member phrases
    # (presumably each `responses` entry is a list of phrases -- confirm in shared_functions)
    cluster_table = sf.get_counts_vector(num_clusters, responses, display_clusters)
    members_by_cluster = dict(zip(cluster_table.category, cluster_table.responses))

    total_rows = len(originality_rating_df.index)

    # phrase -> score / rarity flag of the cluster it belongs to
    # (a phrase that shows up in several clusters keeps the values of the last one)
    score_by_phrase = {}
    rarity_by_phrase = {}
    for members in members_by_cluster.values():
        cluster_size = len(members)
        # larger clusters (more common ideas) score closer to 0
        cluster_score = 1.0 - (cluster_size / total_rows)
        # a response that is alone in its cluster is flagged as rare
        cluster_rarity = 1 if cluster_size == 1 else 0
        for phrase in members:
            score_by_phrase[phrase] = cluster_score
            rarity_by_phrase[phrase] = cluster_rarity

    # look the values up in dataframe row order so the lists can be assigned
    # straight back as new columns
    processed_phrases = originality_rating_df['response_processed'].tolist()
    scores_in_row_order = [score_by_phrase[phrase] for phrase in processed_phrases]
    rarity_in_row_order = [rarity_by_phrase[phrase] for phrase in processed_phrases]

    return (scores_in_row_order, rarity_in_row_order)

In [3]:
# create df of each participant's originality count score, i.e. the number of
# their responses that were the only response in a cluster
def get_originality_score_df(originality_df):
    """Total the 'originality' column per participant.

    Args:
        originality_df: DataFrame with an 'id' column and an 'originality'
            column to be summed.

    Returns:
        DataFrame with columns ['id', 'originality'], one row per id returned
        by ``get_id_list(originality_df)``.
    """
    # one entry per participant id: the sum of that participant's 'originality' values
    totals_by_id = {
        participant_id: originality_df[originality_df.id == participant_id]['originality'].sum()
        for participant_id in get_id_list(originality_df)
    }

    # turn the (id, total) pairs into a two-column table
    return pd.DataFrame(totals_by_id.items(), columns=['id', 'originality'])

In [4]:
# calculate the originality freq and counts for a response and participant respectively
def get_originality_count_vectorizer(df, stopwords_list, num_clusters, join_list, display_clusters):
    """Run the cluster-based originality pipeline on one dataset.

    Args:
        df: raw responses table, handed to ``sf.get_cleaned_responses_df``.
        stopwords_list: forwarded to ``sf.get_cleaned_responses_df``.
        num_clusters: number of clusters, forwarded to
            ``get_clustered_originality_score``.
        join_list: forwarded to ``sf.get_cleaned_responses_df``.
        display_clusters: forwarded to ``get_clustered_originality_score``.

    Returns:
        Tuple ``(responses_df, participants_df)``: the cleaned responses table
        with the added columns 'cluster_freq', 'originality', 'freq' and
        't_freq', and the per-participant table produced by
        ``get_originality_score_df``.
    """
    # clean the raw responses; the processed phrases are what gets clustered
    cleaned_df = sf.get_cleaned_responses_df(df, stopwords_list, join_list)
    processed_phrases = cleaned_df['response_processed'].tolist()

    cluster_scores, rarity_flags = get_clustered_originality_score(
        cleaned_df, num_clusters, processed_phrases, display_clusters
    )

    # 'cluster_freq' holds 1 - (cluster size / number of responses)
    cleaned_df['cluster_freq'] = cluster_scores
    # 'originality' is 1 when the phrase was alone in its cluster, else 0
    cleaned_df['originality'] = rarity_flags

    # 'freq' undoes the inversion, giving the cluster's share of all responses
    cleaned_df['freq'] = 1 - cleaned_df['cluster_freq']
    # transformed share: equals 1 at freq == 0 and shrinks as freq grows
    offset = .05
    cleaned_df['t_freq'] = (offset / (offset + cleaned_df['freq'])) ** 2

    # per-participant totals of the 'originality' flags
    participant_scores = get_originality_score_df(cleaned_df)

    return (cleaned_df, participant_scores)

## Collect the Method Results

In [5]:
# calculate originality scores from method
def get_originality_scores(data_dict, num_clusters):
    """Compute originality results for every dataset in ``data_dict``.

    Args:
        data_dict: mapping of dataset name to its raw responses table.
        num_clusters: number of clusters used for each dataset.

    Returns:
        List with one ``get_originality_count_vectorizer`` result per dataset,
        in the iteration order of ``data_dict``.
    """
    # every dataset is processed with the edited stopword list from shared_functions,
    # join_list=True and display_clusters=False
    return [
        get_originality_count_vectorizer(dataset, sf.stopwords_edited, num_clusters, True, False)
        for dataset in data_dict.values()
    ]

## Write Originality Results into CSVs

In [6]:
# underscore separator for the parts of output file names
underscore = "_"

In [7]:
# write originality freq results into csvs
def write_originality_results_freq(data_dict, results, date):
    """Write each dataset's per-response table to its own CSV file.

    Files are created in the current working directory and named
    ``originality_results_<date>_freqs_<dataset name>.csv``.

    Args:
        data_dict: mapping whose keys are the dataset names; ``results`` is
            matched to these keys by position.
        results: sequence of ``(responses_df, participants_df)`` tuples, in
            the same order as the keys of ``data_dict``.
        date: string placed in the file name.

    Raises:
        IndexError: if ``results`` has fewer entries than ``data_dict``.
        TypeError: if ``date`` or a dataset name is not a string.
    """
    # the separator is spelled out here rather than read from a notebook-level
    # variable, so the function works no matter which cells have been run
    for position, dataset_name in enumerate(data_dict):
        file_name = "_".join(["originality_results", date, "freqs", dataset_name]) + ".csv"
        # element 0 of each result tuple is the per-response table
        results[position][0].to_csv(file_name, encoding='utf-8', index=False)

In [8]:
# write originality counts results into csvs
def write_originality_results_counts(data_dict, results, date):
    """Write each dataset's per-participant table to its own CSV file.

    Files are created in the current working directory and named
    ``originality_results_<date>_counts_<dataset name>.csv``.

    Args:
        data_dict: mapping whose keys are the dataset names; ``results`` is
            matched to these keys by position.
        results: sequence of ``(responses_df, participants_df)`` tuples, in
            the same order as the keys of ``data_dict``.
        date: string placed in the file name.

    Raises:
        IndexError: if ``results`` has fewer entries than ``data_dict``.
        TypeError: if ``date`` or a dataset name is not a string.
    """
    # the separator is spelled out here rather than read from a notebook-level
    # variable, so the function works no matter which cells have been run
    for position, dataset_name in enumerate(data_dict):
        file_name = "_".join(["originality_results", date, "counts", dataset_name]) + ".csv"
        # element 1 of each result tuple is the per-participant table
        results[position][1].to_csv(file_name, encoding='utf-8', index=False)

## Example Code

In [9]:
import os
from os.path import expanduser
import glob
# current working directory at the time this cell runs; base for the data path below
cwd = os.getcwd()

In [26]:
# read in the official novelty results
# map dataset name -> DataFrame, one entry per CSV in the official novelty folder
originality_dict = {}
# glob returns paths in arbitrary order; sorting keeps the dict order (and therefore
# the order of any results list built from it) the same on every run
for filename in sorted(glob.glob(cwd + '/..//data/novelty/official/official_csvs/*.csv')):
    # key by the file name without directory and extension, so the key does not
    # depend on how long the absolute path happens to be on this machine
    dataset_name = os.path.splitext(os.path.basename(filename))[0]
    originality_dict[dataset_name] = pd.read_csv(filename)

In [27]:
# show the names of the datasets that were loaded
list(originality_dict.keys())

['autdata_official_box',
 'autdata_official_brick',
 'autdata_official_chair',
 'autdata_official_cup',
 'autdata_official_key',
 'autdata_official_pencil',
 'autdata_official_rope',
 'autdata_official_shoe']

In [28]:
# display the raw responses table for the box prompt
originality_dict['autdata_official_box']

Unnamed: 0,id,item,response,novelty_1,novelty_2
0,1094,Box,to store things,1,1
1,1094,Box,to build a fort,3,4
2,1094,Box,as an umbrella,3,4
3,1094,Box,to catch a bunny,3,4
4,1093,Box,standing on,2,2
...,...,...,...,...,...
443,1525,Box,put things in it,1,1
444,1525,Box,cut it open and use it as a floor mat,2,4
445,1525,Box,use it as a support to put things on,2,3
446,1525,Box,let a cat play in it,3,3


In [30]:
# score every loaded dataset with 80 clusters; yields one
# (responses table, per-participant table) tuple per dataset
get_originality_scores(originality_dict, 80)

[(       id item                               response  novelty_1  novelty_2  \
  0    1094  Box                        to store things          1          1   
  1    1094  Box                        to build a fort          3          4   
  2    1094  Box                         as an umbrella          3          4   
  3    1094  Box                       to catch a bunny          3          4   
  4    1093  Box                            standing on          2          2   
  ..    ...  ...                                    ...        ...        ...   
  442  1372  Box          trap unwanted bugs or animals          3          3   
  444  1525  Box  cut it open and use it as a floor mat          2          4   
  445  1525  Box   use it as a support to put things on          2          3   
  446  1525  Box                   let a cat play in it          3          3   
  447  1525  Box                          return things          2          1   
  
             response_pro

In [31]:
# reuse the cached output of cell In [30] instead of recomputing it
# NOTE(review): `Out` is IPython's output history, so this only works in an
# interactive session where cell 30 has been run -- confirm before moving to a script
results = Out[30]

In [35]:
# per-response table (element 0 of the tuple) for the first dataset in the results list
results[0][0]

Unnamed: 0,id,item,response,novelty_1,novelty_2,response_processed,cluster_freq,originality,freq,t_freq
0,1094,Box,to store things,1,1,store,0.961451,0,0.038549,0.318842
1,1094,Box,to build a fort,3,4,build fort,0.977324,0,0.022676,0.473327
2,1094,Box,as an umbrella,3,4,umbrella,0.732426,0,0.267574,0.024789
3,1094,Box,to catch a bunny,3,4,catch bunny,0.997732,1,0.002268,0.915114
4,1093,Box,standing on,2,2,standing,0.732426,0,0.267574,0.024789
...,...,...,...,...,...,...,...,...,...,...
442,1372,Box,trap unwanted bugs or animals,3,3,trap unwanted bug animal,0.995465,0,0.004535,0.840595
444,1525,Box,cut it open and use it as a floor mat,2,4,cut open floor mat,0.732426,0,0.267574,0.024789
445,1525,Box,use it as a support to put things on,2,3,support,0.993197,0,0.006803,0.774822
446,1525,Box,let a cat play in it,3,3,let cat play,0.970522,0,0.029478,0.395768


In [36]:
# run the pipeline on the box dataset alone, with the same settings
# get_originality_scores uses: edited stopword list, 80 clusters,
# join_list=True, display_clusters=False
get_originality_count_vectorizer(originality_dict['autdata_official_box'], sf.stopwords_edited, 80, True, False)

(       id item                               response  novelty_1  novelty_2  \
 0    1094  Box                        to store things          1          1   
 1    1094  Box                        to build a fort          3          4   
 2    1094  Box                         as an umbrella          3          4   
 3    1094  Box                       to catch a bunny          3          4   
 4    1093  Box                            standing on          2          2   
 ..    ...  ...                                    ...        ...        ...   
 442  1372  Box          trap unwanted bugs or animals          3          3   
 444  1525  Box  cut it open and use it as a floor mat          2          4   
 445  1525  Box   use it as a support to put things on          2          3   
 446  1525  Box                   let a cat play in it          3          3   
 447  1525  Box                          return things          2          1   
 
            response_processed  cluste