# Originality Algo

## Algorithm to Automate Originality Scoring

### Import Packages

In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from functools import reduce
import openpyxl
import xlsxwriter

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.util import ngrams
from nltk import ngrams, FreqDist
from nltk.lm import NgramCounter
import string
import gensim
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

from spacy.lang.en.stop_words import STOP_WORDS

from collections import Counter
import itertools

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

### Put Data from Excel Sheet into Dataframes

In [2]:
# individual df's for each sheet

# Location of the workbook. Keep exactly one AUT_DATA_PATH line active.
# when on pc
AUT_DATA_PATH = "C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/autdata_test.xlsx"
# when on mac
# AUT_DATA_PATH = "/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/autdata_test.xlsx"

# None lets pandas pick the reader (the behaviour of the original pc cell);
# the mac variant of this cell passed engine='openpyxl' explicitly.
AUT_EXCEL_ENGINE = None

AUT_SHEET_NAMES = ["Cup", "Key", "Rope", "Brick", "Chair", "Pencil", "Shoe"]

# Passing a list of sheet names makes read_excel open the workbook once and
# return a dict mapping sheet name -> DataFrame.
_aut_sheets = pd.read_excel(AUT_DATA_PATH, sheet_name=AUT_SHEET_NAMES, engine=AUT_EXCEL_ENGINE)

data_cup = _aut_sheets["Cup"]
data_key = _aut_sheets["Key"]
data_rope = _aut_sheets["Rope"]
data_brick = _aut_sheets["Brick"]
data_chair = _aut_sheets["Chair"]
data_pencil = _aut_sheets["Pencil"]
data_shoe = _aut_sheets["Shoe"]

### Preprocessing

In [3]:
# English stop words from the nltk corpus
stopwords_nltk = stopwords.words('english')
# spaCy's English stop words (STOP_WORDS from spacy.lang.en.stop_words);
# this is the collection passed to the scoring calls further down
stopwords_spacy = STOP_WORDS

In [33]:
# method to clean the responses
def process_text(text, stopwords_list, remove_sw, join_list, stem = True):
    """Clean a single free-text response.

    Steps, in order: apply hard-coded spelling fixes, turn "/" and "-" into
    spaces, strip punctuation, tokenize, lowercase, optionally drop stop words,
    optionally lemmatize.

    Parameters
    ----------
    text : str
        Raw response. None / NaN (what an empty spreadsheet cell becomes) is
        treated as an empty response; any other non-string is converted with str().
    stopwords_list : collection of str
        Words removed when ``remove_sw`` is true.
    remove_sw : bool
        Whether to remove stop words.
    join_list : bool
        If true return the tokens joined by single spaces, otherwise the token list.
    stem : bool, default True
        Despite the name, this toggles WordNet *lemmatization* (not stemming).

    Returns
    -------
    list of str or str
        Cleaned tokens, or the space-joined string when ``join_list`` is true.
    """
    # Empty cells come back from pandas as NaN (a float) and numeric cells as
    # numbers; re.sub would raise TypeError on either.
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    elif not isinstance(text, str):
        text = str(text)

    # hardcoded for special situations; matched case-insensitively because
    # lowercasing only happens after tokenization
    text = re.sub("wedging", "wedge", text, flags=re.IGNORECASE)
    text = re.sub("exersizing", "exercising", text, flags=re.IGNORECASE)

    # treat slashes and hyphens as word separators before punctuation is stripped
    text = re.sub("/|-", " ", text)
    text = text.translate(str.maketrans('', '', string.punctuation))

    tokens = [w.lower() for w in word_tokenize(text)]

    if remove_sw:
        tokens = [word for word in tokens if word not in stopwords_list]

    if stem:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(t) for t in tokens]

    if join_list:
        return ' '.join(tokens)

    return tokens

### General Functions

In [34]:
# method to get a list of participants
def get_id_list(df):
    """Return the distinct values of the 'id' column as an ascending list."""
    return sorted(df['id'].unique())

In [35]:
# method to add a new column
# new column are cleaned responses
def get_cleaned_responses(df, stopwords_list, remove_sw, join_list):
    """Return a deep copy of ``df`` with an added 'response_processed' column.

    Every entry of the 'response' column is passed through process_text
    (lemmatization always on); the input frame itself is not modified.
    """
    cleaned = [
        process_text(raw, stopwords_list, remove_sw, join_list, True)
        for raw in df['response'].tolist()
    ]

    # attach the cleaned responses to a copy so the caller's frame stays as-is
    df_processed = df.copy(deep=True)
    df_processed['response_processed'] = cleaned
    return df_processed

### Word2Vec Models for Embeddings

In [7]:
# load the pretrained "glove-twitter-25" vectors through gensim's downloader API
# (gensim.downloader is imported as `api`)
word_model_twitter25 = api.load("glove-twitter-25")

# Alternative, currently disabled: load the GoogleNews word2vec binary from a local file.
# on pc
# word_model_google = KeyedVectors.load_word2vec_format("C:/Users/jhec8/Documents/Northwestern_SROP/GoogleNews-vectors-negative300.bin", binary=True)

# on mac
# word_model = KeyedVectors.load_word2vec_format("/Users/johnhenrycruz/Desktop/Northwestern_SROP/GoogleNews-vectors-negative300.bin", binary=True)

In [8]:
# create dictionary of counts for each word in model
# key_to_index is only the word's position in the vocabulary, so using it as a
# "count" gave the first word 0 and the last word the largest value. The count
# gensim keeps per word is the 'count' vector attribute.
# NOTE(review): for vectors loaded from a plain word2vec-format file these
# counts are presumably rank-derived placeholders rather than corpus counts;
# confirm before interpreting the frequencies.
twitter25_dict = {
    word: word_model_twitter25.get_vecattr(word, "count")
    for word in word_model_twitter25.index_to_key
}

In [9]:
# get the frequency of each word in dictionary
# (each stored value is replaced by its share of the sum of all values)
total_words = sum(twitter25_dict.values())

for key, value in list(twitter25_dict.items()):
    twitter25_dict[key] = value / total_words

## Originality Algo 1
### Term Frequency Only

In [36]:
def get_tf_dict(responses):
    """Map each vocabulary term to log(1 + relative frequency).

    Parameters
    ----------
    responses : iterable of str
        Text items fed to CountVectorizer as separate documents.

    Returns
    -------
    dict
        term -> np.log(1 + count / total_count), ordered from highest to lowest value.
    """
    cv = CountVectorizer()
    cv_fit = cv.fit_transform(responses)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back to the old name on older installations.
    feature_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
    word_list = list(feature_names())

    # [0] here to get a 1d-array for iteration by the zip function
    count_list = np.asarray(cv_fit.sum(axis=0))[0]
    total_words = count_list.sum()

    tf_dict = {
        word: np.log(1 + count / total_words)
        for word, count in zip(word_list, count_list)
    }

    return dict(sorted(tf_dict.items(), key=lambda item: item[1], reverse=True))

In [37]:
def get_tf_sum(tf_dict, response):
    """Sum the term-frequency weights of the tokens in one response.

    Parameters
    ----------
    tf_dict : dict
        term -> weight.
    response : list of str or str
        Tokens of the response; a plain string is split on whitespace first.

    Returns
    -------
    float
        Sum of the weights, or NaN when the sum is 0 (empty response or no
        token present in ``tf_dict``).
    """
    # A joined response would otherwise be walked character by character.
    if isinstance(response, str):
        response = response.split()

    # Tokens missing from tf_dict (e.g. ones the vectorizer did not keep)
    # contribute 0 instead of raising KeyError.
    tf_sum = sum(tf_dict.get(term, 0) for term in response)

    if tf_sum == 0:
        tf_sum = np.nan

    return tf_sum

In [38]:
def get_tf_sum_list(tf_dict, responses):
    """Return get_tf_sum(tf_dict, r) for every r in ``responses``, in order."""
    return [get_tf_sum(tf_dict, response) for response in responses]

In [59]:
def get_originality_tf_only(df, stopwords_list, remove_sw, join_list):
    """Score every response by the summed term-frequency weight of its tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'response' column.
    stopwords_list, remove_sw, join_list
        Forwarded to get_cleaned_responses.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with 'response_processed' and 'tf_sum' columns added.
    """
    originality_rating_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)
    responses = originality_rating_df['response_processed'].tolist()

    # With join_list=True each response is one string; flattening those would
    # produce single characters, so bring both shapes to token lists first.
    token_lists = [r.split() if isinstance(r, str) else r for r in responses]
    all_tokens = [token for tokens in token_lists for token in tokens]

    tf_dict = get_tf_dict(all_tokens)

    originality_rating_df['tf_sum'] = get_tf_sum_list(tf_dict, token_lists)

    return originality_rating_df

In [60]:
# Score the Brick responses: spaCy stop words removed (remove_sw=True),
# responses kept as token lists (join_list=False)
get_originality_tf_only(data_brick, stopwords_spacy, True, False)

Unnamed: 0,id,stim,response,response_processed,tf_sum
0,1476,Brick,as a foundation,[foundation],0.010929
1,1476,Brick,as a weapon,[weapon],0.063851
2,1476,Brick,as a bowl,[bowl],0.010929
3,1718,Brick,build a house,"[build, house]",0.147799
4,1718,Brick,break a window,"[break, window]",0.064757
5,1718,Brick,line a fireplace,"[line, fireplace]",0.032669
6,1718,Brick,line a road,"[line, road]",0.04348
7,1691,Brick,build house,"[build, house]",0.147799
8,1691,Brick,break window,"[break, window]",0.064757
9,1691,Brick,weight for workout,"[weight, workout]",0.064418


## Originality Algo 2
### tf-idf scikit-learn
### participant as document

In [41]:
def tfidf_scikit_learn(cleaned_responses):
    """Fit a TF-IDF model on the documents and show the resulting weights.

    Displays the dense document-by-term matrix as a DataFrame, then prints the
    sparse matrix. Nothing is returned.

    Parameters
    ----------
    cleaned_responses : iterable of str
        One string per document.
    """
    tfIdfVectorizer = TfidfVectorizer(use_idf=True)
    tfIdf = tfIdfVectorizer.fit_transform(cleaned_responses)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back to the old name on older installations.
    feature_names = (getattr(tfIdfVectorizer, "get_feature_names_out", None)
                     or tfIdfVectorizer.get_feature_names)

    df = pd.DataFrame(tfIdf.toarray(), columns=feature_names())
    display(df)

    print(tfIdf)

In [42]:
def get_originality_tfidf_scikit_learn(df, stopwords_list, remove_sw, join_list):
    """Build one document per participant and run tfidf_scikit_learn on them.

    A participant's document is the space-joined concatenation of that
    participant's processed responses, so the processed responses must be
    strings (join_list=True). The documents are printed before being passed on.
    """
    cleaned_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)

    # participants in ascending id order, one joined document each
    responses = [
        ' '.join(cleaned_df.loc[cleaned_df['id'] == participant, 'response_processed'].tolist())
        for participant in get_id_list(df)
    ]

    print(responses)

    tfidf_scikit_learn(responses)

In [43]:
# TF-IDF over the Brick data with one document per participant:
# spaCy stop words removed, each response joined into a string (join_list=True)
get_originality_tfidf_scikit_learn(data_brick, stopwords_spacy, True, True)

['foundation weapon bowl', 'house fire pit weapon break sidewalk paint', 'build hit people paperweight', 'build house break window weight workout weight holding wind karate', 'build house use paperweight break prop martial art use step stool', 'build house break window line fireplace line road', 'throw thing smash thing paper weight weight lifting', 'door stop weapon canvas', 'build use weapon throw', 'build throw demolish paperweight', 'build use weapon paperweight anchor small boat combination rope obviously', 'build house hold open door road carve name cooking', 'build thing thrown weapon weight hard surface like anvil']


Unnamed: 0,anchor,anvil,art,boat,bowl,break,build,canvas,carve,combination,...,surface,thing,throw,thrown,use,weapon,weight,wind,window,workout
0,0.0,0.0,0.0,0.0,0.655075,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.376501,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.302181,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.252086,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.277098,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.241237,0.158851,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.535518,0.350145,0.301952,0.350145
4,0.0,0.0,0.335437,0.0,0.0,0.231103,0.152178,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.513023,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.250556,0.164987,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.313617,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.578176,0.256351,0.0,0.0,0.0,0.512703,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.570358,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.327811,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.347367,0.0,0.0,0.0,...,0.0,0.0,0.585523,0.0,0.585523,0.440071,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.301426,0.0,0.0,0.0,...,0.0,0.0,0.508085,0.0,0.0,0.0,0.0,0.0,0.0,0.0


  (0, 4)	0.655075136305677
  (0, 48)	0.376501170766038
  (0, 15)	0.655075136305677
  (1, 29)	0.43860410751211154
  (1, 37)	0.43860410751211154
  (1, 5)	0.3021814542943412
  (1, 33)	0.43860410751211154
  (1, 13)	0.43860410751211154
  (1, 20)	0.2750363685138143
  (1, 48)	0.2520855255053467
  (2, 31)	0.4208122865191767
  (2, 32)	0.6107919421788647
  (2, 17)	0.6107919421788647
  (2, 6)	0.2770982249189687
  (3, 21)	0.35014528605914114
  (3, 50)	0.35014528605914114
  (3, 19)	0.35014528605914114
  (3, 52)	0.35014528605914114
  (3, 49)	0.535518256916423
  (3, 51)	0.301952473284094
  (3, 6)	0.15885055209572455
  (3, 5)	0.2412367096966542
  (3, 20)	0.21956631568316437
  (4, 41)	0.335436591890706
  (4, 40)	0.335436591890706
  :	:
  (10, 3)	0.3628432626653525
  (10, 38)	0.3628432626653525
  (10, 0)	0.3628432626653525
  (10, 47)	0.277469380986608
  (10, 31)	0.24998512990463073
  (10, 6)	0.1646112482258863
  (10, 48)	0.2085423573981102
  (11, 10)	0.37565386534262213
  (11, 26)	0.37565386534262213
  

## Originality Algo 3
### tf-idf scikit-learn + clustering
### participant as document


### Potential for Flexibility Algo

In [44]:
def tfidf_scikit_learn_clustering(num_clusters, cleaned_responses, top_n=5, random_state=None):
    """Cluster documents with KMeans on their TF-IDF vectors and show the result.

    Prints the ``top_n`` highest-weighted centroid terms of every cluster, then
    displays a table listing the documents assigned to each cluster. Nothing is
    returned.

    Parameters
    ----------
    num_clusters : int
        Number of KMeans clusters.
    cleaned_responses : list of str
        Documents to cluster.
    top_n : int, default 5
        How many centroid terms to print per cluster.
    random_state : int or None, default None
        Passed to KMeans; set an integer to make the clustering repeatable.
    """
    tfidf_vectorizer = TfidfVectorizer(use_idf=True)
    tfidf = tfidf_vectorizer.fit_transform(cleaned_responses)

    kmeans = KMeans(n_clusters=num_clusters, random_state=random_state).fit(tfidf)

    # column indices of each centroid, ordered from largest to smallest weight
    order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back to the old name on older installations.
    feature_names = (getattr(tfidf_vectorizer, "get_feature_names_out", None)
                     or tfidf_vectorizer.get_feature_names)
    terms = feature_names()

    for i in range(num_clusters):
        top_words = [terms[ind] for ind in order_centroids[i, :top_n]]
        print("Cluster {}: {}".format(i, ' '.join(top_words)))

    results = pd.DataFrame()
    results['text'] = cleaned_responses
    results['category'] = kmeans.labels_

    # cluster label -> list of the documents assigned to it
    results_dict = {k: g["text"].tolist() for k, g in results.groupby("category")}

    test = pd.DataFrame(list(results_dict.items()), columns=['category', 'responses'])
    display(test)

In [45]:
def get_originality_tfidf_scikit_learn_clustering(df, stopwords_list, num_clusters, remove_sw, join_list):
    """Clean the responses, print per-participant documents, cluster the responses.

    The processed responses must be strings (join_list=True) for the
    per-participant join to work.
    """
    cleaned_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)
    responses_split = cleaned_df['response_processed'].tolist()

    # one space-joined document per participant, participants in ascending id order
    responses = [
        ' '.join(cleaned_df.loc[cleaned_df['id'] == participant, 'response_processed'].tolist())
        for participant in get_id_list(df)
    ]

    print(responses)

    # NOTE(review): the per-participant documents above are only printed; the
    # clustering runs on the individual responses. Confirm this is intended.
    tfidf_scikit_learn_clustering(num_clusters, responses_split)

In [46]:
# TF-IDF + KMeans on the Brick data with num_clusters=15:
# spaCy stop words removed, each response joined into a string (join_list=True)
get_originality_tfidf_scikit_learn_clustering(data_brick, stopwords_spacy, 15, True, True)

['foundation weapon bowl', 'house fire pit weapon break sidewalk paint', 'build hit people paperweight', 'build house break window weight workout weight holding wind karate', 'build house use paperweight break prop martial art use step stool', 'build house break window line fireplace line road', 'throw thing smash thing paper weight weight lifting', 'door stop weapon canvas', 'build use weapon throw', 'build throw demolish paperweight', 'build use weapon paperweight anchor small boat combination rope obviously', 'build house hold open door road carve name cooking', 'build thing thrown weapon weight hard surface like anvil']
Cluster 0: name fire people pit carve
Cluster 1: house build workout fire like
Cluster 2: sidewalk karate paint bowl foundation
Cluster 3: paperweight use workout door lifting
Cluster 4: break window martial art prop
Cluster 5: weapon thrown workout door lifting
Cluster 6: weight workout lifting paper wind
Cluster 7: throw demolish thing workout door
Cluster 8: road

Unnamed: 0,category,responses
0,0,"[fire pit, hit people, carve name]"
1,1,"[build house, build house, house, build house,..."
2,2,"[foundation, bowl, line fireplace, karate, sid..."
3,3,"[paperweight, paperweight, use paperweight, pa..."
4,4,"[break window, break window, break, break prop..."
5,5,"[weapon, weapon, thrown weapon, weapon]"
6,6,"[weight workout, weight holding wind, weight, ..."
7,7,"[throw demolish, throw thing, throw]"
8,8,"[line road, road]"
9,9,[smash thing]


## Originality Algo 4
### Counter Vectorizer + clustering

In [61]:
def get_cv_freqs_list(df):
    """Placeholder (work in progress): does nothing with ``df`` and returns None."""
    # Work in Progress
    return None

In [97]:
def get_counts_vector(num_clusters, responses, top_n=5, random_state=None):
    """Cluster documents with KMeans on raw term counts and show the result.

    Prints the ``top_n`` highest-valued centroid terms of every cluster and
    displays a table listing the documents assigned to each cluster.

    Parameters
    ----------
    num_clusters : int
        Number of KMeans clusters.
    responses : list of str
        Documents to vectorize and cluster.
    top_n : int, default 5
        How many centroid terms to print per cluster.
    random_state : int or None, default None
        Passed to KMeans; set an integer to make the clustering repeatable.

    Returns
    -------
    scipy sparse matrix
        The document-by-term count matrix produced by CountVectorizer.
    """
    count_vectorizer = CountVectorizer()
    word_count = count_vectorizer.fit_transform(responses)

    kmeans = KMeans(n_clusters=num_clusters, random_state=random_state).fit(word_count)

    # column indices of each centroid, ordered from largest to smallest value
    order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back to the old name on older installations.
    feature_names = (getattr(count_vectorizer, "get_feature_names_out", None)
                     or count_vectorizer.get_feature_names)
    terms = feature_names()

    for i in range(num_clusters):
        top_words = [terms[ind] for ind in order_centroids[i, :top_n]]
        print("Cluster {}: {}".format(i, ' '.join(top_words)))

    results = pd.DataFrame()
    results['text'] = responses
    results['category'] = kmeans.labels_

    # cluster label -> list of the documents assigned to it
    results_dict = {k: g["text"].tolist() for k, g in results.groupby("category")}

    test = pd.DataFrame(list(results_dict.items()), columns=['category', 'responses'])
    display(test)

    return word_count

In [98]:
def get_originality_count_vectorizer(df, stopwords_list, num_clusters, remove_sw, join_list):
    """Clean the responses, run the count-vector clustering, return the cleaned frame.

    The clustering output is only printed/displayed by get_counts_vector; the
    returned frame carries the 'response_processed' column and no score column yet.
    """
    cleaned_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)

    # the returned count matrix is not used further at this stage
    word_count = get_counts_vector(num_clusters, cleaned_df['response_processed'].tolist())

    return cleaned_df

In [99]:
# Count-vector + KMeans on the Brick data with num_clusters=15:
# spaCy stop words removed, each response joined into a string (join_list=True)
get_originality_count_vectorizer(data_brick, stopwords_spacy, 15, True, True)

Cluster 0: break window name cooking paint
Cluster 1: house build workout fire like
Cluster 2: weapon use thrown workout door
Cluster 3: paperweight use workout door lifting
Cluster 4: weight workout wind lifting holding
Cluster 5: people hit workout fire like
Cluster 6: anchor combination obviously rope small
Cluster 7: use stool step workout door
Cluster 8: build thing workout fire like
Cluster 9: door open hold fire like
Cluster 10: anvil like surface hard workout
Cluster 11: throw demolish door like lifting
Cluster 12: line road fireplace fire like
Cluster 13: art break prop martial workout
Cluster 14: thing throw smash workout door


Unnamed: 0,category,responses
0,0,"[foundation, bowl, break window, break window,..."
1,1,"[build house, build house, house, build house,..."
2,2,"[weapon, weapon, thrown weapon, use weapon, we..."
3,3,"[paperweight, paperweight, use paperweight, pa..."
4,4,"[weight workout, weight holding wind, weight, ..."
5,5,[hit people]
6,6,[anchor small boat combination rope obviously]
7,7,[use step stool]
8,8,"[build, build, build thing, build, build]"
9,9,[hold open door]


Unnamed: 0,id,stim,response,response_processed
0,1476,Brick,as a foundation,foundation
1,1476,Brick,as a weapon,weapon
2,1476,Brick,as a bowl,bowl
3,1718,Brick,build a house,build house
4,1718,Brick,break a window,break window
5,1718,Brick,line a fireplace,line fireplace
6,1718,Brick,line a road,line road
7,1691,Brick,build house,build house
8,1691,Brick,break window,break window
9,1691,Brick,weight for workout,weight workout


### Write Results into Excel Sheet

In [21]:
# Sheet names for the results workbook and the matching input frames.
# The two lists are index-aligned: prompts_list[i] names the sheet for data_list[i].
prompts_list = ['cup', 'key', 'rope', 'brick', 'chair', 'pencil', 'shoe']
data_list = [data_cup, data_key, data_rope, data_brick, data_chair, data_pencil, data_shoe]

In [22]:
# write results df of each dataset for specific methods
def write_results_excel(method, stopwords, remove_sw=True, join_list=False,
                        output_path='originality_tf_only_results.xlsx'):
    """Run ``method`` on every dataset and write each result to its own sheet.

    Parameters
    ----------
    method : callable
        Scoring function called as method(df, stopwords, remove_sw, join_list);
        it must return a DataFrame.
    stopwords : collection of str
        Stop-word collection forwarded to ``method``.
    remove_sw : bool, default True
        Forwarded to ``method``.
    join_list : bool, default False
        Forwarded to ``method``.
    output_path : str, default 'originality_tf_only_results.xlsx'
        Workbook to create; change this when writing a new sheet set.

    Raises
    ------
    TypeError
        If ``method`` returns something other than a DataFrame.
    """
    # The context manager saves and closes the workbook even when a method
    # fails part-way through; ExcelWriter.save() no longer exists in pandas 2.x.
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        for prompt, data in zip(prompts_list, data_list):
            df = method(data, stopwords, remove_sw, join_list)
            if not isinstance(df, pd.DataFrame):
                raise TypeError(
                    "method must return a DataFrame to be written, got {}".format(type(df).__name__)
                )
            df.to_excel(writer, sheet_name=prompt, index=False)

In [23]:
# write_results_excel(get_originality_tf_only, stopwords_spacy)

In [24]:
# write_results_excel(get_originality_tfidf_scikit_learn, stopwords_spacy)

In [25]:
# write_results_excel(get_originality_tfidf_scikit_learn_clustering, stopwords_spacy)

Algo Design Brainstorming:
* Algo Design w/o scikit-learn
    * TODO: describe the first design idea
    * TODO: describe the second design idea

* Algo Design w/ scikit-learn

* To Do List

