# Originality Algo

## Algorithm to Automate Originality Scoring

### Import Packages

In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from functools import reduce
import openpyxl
import xlsxwriter

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.util import ngrams
from nltk import ngrams, FreqDist
from nltk.lm import NgramCounter
import string
import gensim
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

from spacy.lang.en.stop_words import STOP_WORDS

from collections import Counter
import itertools

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer

from nltk.cluster.kmeans import KMeansClusterer

### Put Data from Excel Sheet into Dataframes

In [2]:
# one dataframe per prompt, each read from its own csv file

# Folder that holds the autdata_test_<prompt>_semdis.csv files. This is the
# only line that has to change when switching machines.
# when on pc
DATA_DIR = "C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data"
# when on mac (NOTE: the earlier mac paths pointed at files named *_semdis.csv.xlsx)
# DATA_DIR = "/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data"


def load_prompt_data(prompt, data_dir=DATA_DIR):
    """Read the semdis csv of one prompt (e.g. "cup") from data_dir into a DataFrame."""
    return pd.read_csv(f"{data_dir}/autdata_test_{prompt}_semdis.csv")


data_cup = load_prompt_data("cup")
data_key = load_prompt_data("key")
data_rope = load_prompt_data("rope")
data_brick = load_prompt_data("brick")
data_chair = load_prompt_data("chair")
data_pencil = load_prompt_data("pencil")
data_shoe = load_prompt_data("shoe")

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/autdata_test_cup_semdis.csv'

### Preprocessing

In [None]:
# stop words shipped with spaCy's English language data
stopwords_spacy = STOP_WORDS
# English stop words from the nltk corpus
stopwords_nltk = stopwords.words('english')

In [None]:
def process_text(text, stopwords_list, remove_sw, join_list):
    """Clean one free-text response.

    Steps, in order: apply a few hard-coded word fixes, turn "/" and "-" into
    spaces, strip punctuation, tokenize, lowercase, optionally drop stop
    words, and lemmatize.

    Parameters
    ----------
    text : str
        Raw response. None and NaN (what pandas yields for a blank cell) are
        treated as an empty response; any other non-string value is converted
        with str().
    stopwords_list : collection of str
        Words dropped when remove_sw is true.
    remove_sw : bool
        Whether to remove the words in stopwords_list.
    join_list : bool
        When true, return the tokens joined by single spaces instead of a list.

    Returns
    -------
    list of str, or a single str when join_list is true.
    """
    # blank cells arrive as None / float NaN; re.sub would raise TypeError on them
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    elif not isinstance(text, str):
        text = str(text)

    # hard-coded fixes for special situations
    hardcoded_fixes = (
        ("wedging", "wedge"),
        ("exersizing", "exercising"),
        ("thrown", "throw"),
    )
    for pattern, replacement in hardcoded_fixes:
        text = re.sub(pattern, replacement, text)

    text = re.sub("/|-", " ", text)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # nothing left to tokenize: same result the tokenizer path gives for empty text
    if not text.strip():
        return '' if join_list else []

    tokens = [w.lower() for w in word_tokenize(text)]

    if remove_sw:
        tokens = [word for word in tokens if word not in stopwords_list]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]

    if join_list:
        tokens = ' '.join(tokens)

    return tokens

### General Functions

In [None]:
def get_id_list(df):
    """Return the distinct values of df's 'id' column as a list in ascending order."""
    return sorted(df['id'].unique())

In [None]:
def get_cleaned_responses(df, stopwords_list, remove_sw, join_list):
    """Return a deep copy of df with an added 'response_processed' column.

    Each entry of the new column is process_text applied to the matching
    entry of the 'response' column; the input frame itself is not modified.
    """
    cleaned = [
        process_text(raw, stopwords_list, remove_sw, join_list)
        for raw in df['response'].tolist()
    ]

    df_processed = df.copy(deep=True)
    df_processed['response_processed'] = cleaned
    return df_processed

### Word2Vec Models for Embeddings

In [None]:
# load pretrained model
word_model_twitter25 = api.load("glove-twitter-25")

# on pc
# word_model_google = KeyedVectors.load_word2vec_format("C:/Users/jhec8/Documents/Northwestern_SROP/GoogleNews-vectors-negative300.bin", binary=True)

# on mac
# word_model = KeyedVectors.load_word2vec_format("/Users/johnhenrycruz/Desktop/Northwestern_SROP/GoogleNews-vectors-negative300.bin", binary=True)

In [None]:
# create dictionary of counts for each word in model
twitter25_dict = {}
for i in range(len(word_model_twitter25)):
    twitter25_dict[word_model_twitter25.index_to_key[i]] = word_model_twitter25.key_to_index[word_model_twitter25.index_to_key[i]]

In [None]:
# get the frequency of each word in dictionary
total_words = 0
for key in twitter25_dict:
    total_words = total_words + twitter25_dict[key]
    
for key in twitter25_dict:
    twitter25_dict[key] = twitter25_dict[key]/total_words

## Originality Algo 1
### Term Frequency Only

In [None]:
def get_tf_dict(responses):
    """Build a term -> weight dictionary from a list of text documents.

    Terms are counted with CountVectorizer over all documents; each term's
    weight is log(1 + count / total count of all terms). The returned dict is
    ordered from the highest weight to the lowest.

    Parameters
    ----------
    responses : list of str
        Documents handed to CountVectorizer.

    Returns
    -------
    dict mapping each term to its weight.
    """
    cv = CountVectorizer()
    cv_fit = cv.fit_transform(responses)

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on older releases that lack the replacement.
    if hasattr(cv, "get_feature_names_out"):
        word_list = cv.get_feature_names_out()
    else:
        word_list = cv.get_feature_names()

    # column sums flattened to a 1d array so they can be zipped with the terms
    count_list = np.asarray(cv_fit.sum(axis=0)).ravel()
    total_words = count_list.sum()

    tf_dict = {
        word: np.log(1 + count / total_words)
        for word, count in zip(word_list, count_list)
    }

    return dict(sorted(tf_dict.items(), key=lambda item: item[1], reverse=True))

In [None]:
def get_tf_sum(tf_dict, response):
    """Sum the tf_dict weights of the terms in one response.

    Parameters
    ----------
    tf_dict : dict
        Term -> weight mapping.
    response : sequence
        Terms of a single response.

    Returns
    -------
    The summed weight, or np.nan when the sum is 0 (for example an empty
    response, or one whose terms are all missing from tf_dict).
    """
    # Terms absent from tf_dict contribute nothing instead of raising KeyError;
    # a vectorizer-built dictionary does not necessarily contain every token.
    tf_sum = sum(tf_dict.get(term, 0) for term in response)

    if tf_sum == 0:
        tf_sum = np.nan

    return tf_sum

In [None]:
def get_tf_sum_list(tf_dict, responses):
    """Return get_tf_sum(tf_dict, response) for every response, in input order."""
    return [get_tf_sum(tf_dict, single_response) for single_response in responses]

In [None]:
def get_originality_tf_only(df, stopwords_list, remove_sw, join_list):
    """Return the cleaned copy of df with an added 'tf_sum' column.

    The term weights come from get_tf_dict over the flattened processed
    responses; each row's 'tf_sum' is the summed weight of its own processed
    response.
    """
    scored_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)
    processed = scored_df['response_processed'].tolist()

    # flatten: every element of every processed response becomes one entry
    flat_terms = list(itertools.chain.from_iterable(processed))

    scored_df['tf_sum'] = get_tf_sum_list(get_tf_dict(flat_terms), processed)

    return scored_df

In [None]:
# term-frequency scores for the brick prompt; responses are kept as token lists
get_originality_tf_only(data_brick, stopwords_spacy, remove_sw=True, join_list=False)

## Originality Algo 2
### tf-idf scikit-learn
### participant as document

In [None]:
def tfidf_scikit_learn(cleaned_responses):
    """Fit TF-IDF on the given documents and show the result.

    Displays the dense document-by-term table as a DataFrame and prints the
    sparse TF-IDF matrix. Returns nothing.

    Parameters
    ----------
    cleaned_responses : list of str
        One string per document.
    """
    tfidf_vectorizer = TfidfVectorizer(use_idf=True)
    tfidf = tfidf_vectorizer.fit_transform(cleaned_responses)

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on older releases that lack the replacement.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        feature_names = tfidf_vectorizer.get_feature_names()

    df = pd.DataFrame(tfidf.toarray(), columns=feature_names)
    display(df)

    print(tfidf)

In [None]:
def get_originality_tfidf_scikit_learn(df, stopwords_list, remove_sw, join_list):
    """Run tfidf_scikit_learn with one document per participant.

    A participant's document is their processed responses joined by spaces;
    participants are taken in the order returned by get_id_list. The list of
    documents is printed before the TF-IDF step.
    """
    cleaned_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)

    documents = [
        ' '.join(
            cleaned_df.loc[cleaned_df['id'] == participant, 'response_processed'].tolist()
        )
        for participant in get_id_list(df)
    ]

    print(documents)

    tfidf_scikit_learn(documents)

In [None]:
# TF-IDF over the brick prompt; responses are joined into strings for the vectorizer
get_originality_tfidf_scikit_learn(data_brick, stopwords_spacy, remove_sw=True, join_list=True)

## Originality Algo 3
### tf-idf scikit-learn + clustering
### participant as document


### Potential for Flexibility Algo

In [None]:
def tfidf_scikit_learn_clustering(num_clusters, cleaned_responses):
    """Cluster documents with KMeans on their TF-IDF vectors and show the result.

    Prints the five highest-weighted centroid terms of every cluster, then
    displays a table with one row per cluster listing the documents assigned
    to it. Returns nothing.

    Parameters
    ----------
    num_clusters : int
        Number of KMeans clusters.
    cleaned_responses : list of str
        Documents to vectorize and cluster.
    """
    tfidf_vectorizer = TfidfVectorizer(use_idf=True)
    tfidf = tfidf_vectorizer.fit_transform(cleaned_responses)

    kmeans = KMeans(n_clusters=num_clusters).fit(tfidf)

    # per cluster, term indices ordered from the largest centroid weight down
    order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on older releases that lack the replacement.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        terms = tfidf_vectorizer.get_feature_names_out()
    else:
        terms = tfidf_vectorizer.get_feature_names()

    for i in range(num_clusters):
        top_terms = [terms[ind] for ind in order_centroids[i, :5]]
        print("Cluster {}: {}".format(i, ' '.join(top_terms)))

    results = pd.DataFrame()
    results['text'] = cleaned_responses
    results['category'] = kmeans.labels_

    # cluster label -> list of the documents assigned to that cluster
    results_dict = {k: g["text"].tolist() for k, g in results.groupby("category")}

    clusters_df = pd.DataFrame(list(results_dict.items()), columns=['category', 'responses'])
    display(clusters_df)

In [None]:
def get_originality_tfidf_scikit_learn_clustering(df, stopwords_list, num_clusters, remove_sw, join_list):
    """Clean the responses, display them, and cluster them with TF-IDF + KMeans.

    Every processed response is clustered as its own document. The earlier
    per-participant aggregation was built but never handed to the clustering
    step, so it has been removed.

    Parameters
    ----------
    df : DataFrame
        Needs a 'response' column.
    stopwords_list, remove_sw, join_list
        Forwarded to get_cleaned_responses.
    num_clusters : int
        Forwarded to tfidf_scikit_learn_clustering.
    """
    originality_rating_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)
    responses_split = originality_rating_df['response_processed'].tolist()

    display(originality_rating_df)

    tfidf_scikit_learn_clustering(num_clusters, responses_split)

In [None]:
# TF-IDF + KMeans over the brick prompt with 15 clusters
get_originality_tfidf_scikit_learn_clustering(
    data_brick, stopwords_spacy, num_clusters=15, remove_sw=True, join_list=True
)

## Originality Algo 4
### Counter Vectorizer + clustering

In [None]:
def get_cv_freqs_list(df):
    """Placeholder, still a work in progress: ignores df, sets one unused local and returns None."""
    z = 5
    # Work in Progress

In [None]:
def get_counts_vector(num_clusters, responses):
    """Cluster responses on their word-count vectors and return the clusters.

    The responses are vectorized with CountVectorizer, an elbow visualizer is
    fitted on the dense counts (its show() call is left disabled), and the
    responses are then clustered with nltk's KMeansClusterer using cosine
    distance.

    Returns a DataFrame with one row per cluster: 'category' is the cluster
    label and 'responses' the list of responses assigned to it.
    """
    # vectorize the phrases into word counts
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(responses)

    # elbow method, to help decide how many clusters to use
    elbow = KElbowVisualizer(KMeans(), k=(10,35), timings=False)
    elbow.fit(counts.toarray())

    # nltk kmeans with cosine distance
    clusterer = KMeansClusterer(
        num_clusters,
        distance=nltk.cluster.util.cosine_distance,
        repeats=25,
        avoid_empty_clusters=True,
    )
    labels = clusterer.cluster(counts.toarray(), assign_clusters=True)

    # pair each response with the label it was given
    labelled = pd.DataFrame()
    labelled['text'] = responses
    labelled['category'] = labels

    # cluster label -> list of the responses in that cluster
    grouped = {
        label: members["text"].tolist()
        for label, members in labelled.groupby("category")
    }

    return pd.DataFrame(list(grouped.items()), columns=['category', 'responses'])

In [None]:
def get_clustered_originality_score(originality_rating_df, column, num_clusters=22):
    """Score every processed response by how small its cluster is.

    The responses in 'response_processed' are clustered with
    get_counts_vector. A cluster's score is
    1 - (responses in the cluster / total number of rows), and each response
    receives the score of the cluster it was assigned to. The cluster table
    is displayed as a side effect.

    Parameters
    ----------
    originality_rating_df : DataFrame
        Needs a 'response_processed' column.
    column
        Unused; kept so existing callers keep working.
    num_clusters : int, default 22
        Number of clusters requested from get_counts_vector.

    Returns
    -------
    list of float, aligned with the rows of originality_rating_df.
    """
    phrases = originality_rating_df['response_processed'].tolist()
    clusters_df = get_counts_vector(num_clusters, phrases)
    total_responses = len(originality_rating_df.index)

    # phrase -> score of the cluster that contains it
    phrase_scores_dict = {}
    for members in clusters_df.responses:
        score = 1.0 - len(members) / total_responses
        for phrase in members:
            phrase_scores_dict[phrase] = score

    display(clusters_df)

    # one score per row, in the dataframe's own order
    return [phrase_scores_dict[phrase] for phrase in phrases]

In [None]:
def get_originality_count_vectorizer(df, stopwords_list, remove_sw, join_list):
    """Return the cleaned responses with a cluster-based originality score.

    Rows whose processed response is the empty string are dropped, and the
    remaining rows get a 'counts_vectorizer_freq' column from
    get_clustered_originality_score.

    Parameters
    ----------
    df : DataFrame
        Needs a 'response' column.
    stopwords_list, remove_sw, join_list
        Forwarded to get_cleaned_responses.
        NOTE(review): the vectorizer downstream presumably needs string
        responses, i.e. join_list=True -- confirm.
    """
    cleaned_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)
    responses = cleaned_df['response_processed'].tolist()

    # .copy() so the column assignment below writes to an independent frame
    # rather than to a filtered slice (avoids SettingWithCopyWarning)
    non_empty = cleaned_df['response_processed'].astype(str) != ''
    originality_rating_df = cleaned_df[non_empty].copy()

    originality_rating_df['counts_vectorizer_freq'] = get_clustered_originality_score(originality_rating_df, responses)

    return originality_rating_df

In [None]:
# cluster-based originality scores for the brick prompt
get_originality_count_vectorizer(data_brick, stopwords_spacy, remove_sw=True, join_list=True)

### Write Results into Excel Sheet

In [None]:
# prompt names and their dataframes, listed in the same order so that
# position i of one list corresponds to position i of the other
prompts_list = ['cup', 'key', 'rope', 'brick', 'chair', 'pencil', 'shoe']
data_list = [data_cup, data_key, data_rope, data_brick, data_chair, data_pencil, data_shoe]

In [None]:
def write_results_excel(method, stopwords, remove_sw, join_list,
                        output_path='originality_cv_freq_cluster_results.xlsx'):
    """Run a scoring method on every prompt and write one Excel sheet per prompt.

    Parameters
    ----------
    method : callable
        Called as method(df, stopwords, remove_sw, join_list); must return a
        DataFrame.
    stopwords, remove_sw, join_list
        Forwarded to method unchanged.
    output_path : str
        Workbook to write; change it when writing results for a new method.
    """
    # The context manager saves and closes the workbook even when a method
    # call raises; ExcelWriter.save() no longer exists in pandas 2.0.
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        for prompt, data in zip(prompts_list, data_list):
            df = method(data, stopwords, remove_sw, join_list)
            df.to_excel(writer, sheet_name=prompt, index=False)

In [None]:
# write_results_excel(get_originality_count_vectorizer, stopwords_spacy, True, True)

Algo Design Brainstorming:
* Algo Design w/o scikit-learn
    * TODO: fill in
    * TODO: fill in

* Algo Design w/ scikit-learn

To Do List
- [x] write basic algorithm, sum of term frequency
    - frequency based on all phrases
- [x] write tfidf clustering method 
    - repurpose for flexibility
- [x] write counter vectorizer clustering method
- [x] write method to score responses by frequency
    - score = 1 - (number of responses in the cluster / total number of responses)
    - subtracting from 1 reverses the mapping, so responses in smaller clusters score higher
- [ ] experiment with a greater cluster size
- [ ] update stop words list to include "use" and "thing"
- [ ] figure out how many times to run kmeans
    - cross validation
    - then averaging the results of all the iterations
