# Originality Algo

## Algorithm to Automate Originality Scoring

### Import Packages

In [162]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from functools import reduce
import openpyxl
import xlsxwriter

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.util import ngrams
from nltk import ngrams, FreqDist
from nltk.lm import NgramCounter
import string
import gensim
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

from spacy.lang.en.stop_words import STOP_WORDS

from collections import Counter
import itertools

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

### Put Data from Excel Sheet into Dataframes

In [2]:
# individual df's for each sheet

# Location of the AUT workbook (PC layout). Defined once so switching machines
# means editing a single line instead of seven.
AUT_DATA_PATH = "C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/autdata_test.xlsx"
AUT_SHEET_NAMES = ["Cup", "Key", "Rope", "Brick", "Chair", "Pencil", "Shoe"]

# Passing a list of sheet names makes read_excel parse the workbook once and
# return a {sheet name: DataFrame} dict, rather than re-reading the file for
# every sheet.
aut_sheets = pd.read_excel(AUT_DATA_PATH, sheet_name=AUT_SHEET_NAMES)

data_cup = aut_sheets["Cup"]
data_key = aut_sheets["Key"]
data_rope = aut_sheets["Rope"]
data_brick = aut_sheets["Brick"]
data_chair = aut_sheets["Chair"]
data_pencil = aut_sheets["Pencil"]
data_shoe = aut_sheets["Shoe"]

# when on mac
# data_cup = pd.read_excel("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/autdata_test.xlsx", sheet_name = "Cup", engine='openpyxl')
# data_key = pd.read_excel("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/autdata_test.xlsx", sheet_name = "Key", engine='openpyxl')
# data_rope = pd.read_excel("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/autdata_test.xlsx", sheet_name = "Rope", engine='openpyxl')
# data_brick = pd.read_excel("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/autdata_test.xlsx", sheet_name = "Brick", engine='openpyxl')
# data_chair = pd.read_excel("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/autdata_test.xlsx", sheet_name = "Chair", engine='openpyxl')
# data_pencil = pd.read_excel("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/autdata_test.xlsx", sheet_name = "Pencil", engine='openpyxl')
# data_shoe = pd.read_excel("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/autdata_test.xlsx", sheet_name = "Shoe", engine='openpyxl')

### Preprocessing

In [127]:
# Two alternative stop-word sources that can be handed to the cleaning functions.
# spaCy's English stop-word collection (imported as STOP_WORDS)
stopwords_spacy = STOP_WORDS
# English stop-word list from the NLTK stopwords corpus
stopwords_nltk = stopwords.words('english')

In [128]:
# method to clean the responses
def process_text(text, stopwords_list, remove_sw, join_list, stem = True):
    """Clean one free-text response into normalized tokens.

    Steps, in order: apply two hard-coded spelling fixes, replace "/" and "-"
    with spaces, strip ASCII punctuation, tokenize, lowercase, optionally
    remove stop words, optionally lemmatize, optionally re-join into a string.

    Parameters
    ----------
    text : str
        Raw response. ``None``/NaN (e.g. an empty spreadsheet cell) is treated
        as an empty response; any other non-string value is converted with
        ``str`` so numeric cells do not crash the regex calls.
    stopwords_list : collection of str
        Words to drop when ``remove_sw`` is true. Compared against the
        lowercased tokens.
    remove_sw : bool
        Whether to filter out the words in ``stopwords_list``.
    join_list : bool
        If true, return the tokens joined by single spaces; otherwise return
        the token list.
    stem : bool, default True
        If true, tokens are lemmatized with ``WordNetLemmatizer`` (the name is
        historical; no stemming is performed).

    Returns
    -------
    list of str, or str when ``join_list`` is true.
    """
    # Missing / non-text cells: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    elif not isinstance(text, str):
        text = str(text)

    # hardcoded for special situations
    # (case-insensitive so "Wedging" is normalized the same way as "wedging")
    text = re.sub("wedging", "wedge", text, flags=re.IGNORECASE)
    text = re.sub("exersizing", "exercising", text, flags=re.IGNORECASE)

    # treat slashes and hyphens as word separators, then drop remaining punctuation
    text = re.sub("/|-"," ", text)
    text = text.translate(str.maketrans('','',string.punctuation))

    # Lowercase BEFORE stop-word filtering: filtering the raw-case tokens
    # against a lowercase stop-word list let capitalized stop words
    # ("The", "A", ...) slip through.
    tokens = [w.lower() for w in word_tokenize(text)]

    if remove_sw:
        tokens = [word for word in tokens if word not in stopwords_list]

    if stem:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(t) for t in tokens]

    if join_list:
        tokens = ' '.join(tokens)

    return tokens

### General Functions

In [129]:
# method to get a list of participants
def get_id_list(df):
    """Return the distinct values of df's 'id' column as an ascending list."""
    return sorted(df['id'].unique())

In [130]:
# method to add a new column
# new column are cleaned responses
def get_cleaned_responses(df, stopwords_list, remove_sw, join_list):
    """Return a deep copy of df with an added 'response_processed' column.

    Each entry of the 'response' column is passed through process_text with
    the given stop-word settings (lemmatization always enabled); the input
    frame itself is left untouched.
    """
    df_processed = df.copy(deep=True)

    # clean every raw response, keeping row order
    df_processed['response_processed'] = [
        process_text(raw, stopwords_list, remove_sw, join_list, True)
        for raw in df['response'].tolist()
    ]

    return df_processed

### Word2Vec Models for Embeddings

In [7]:
# load pretrained model
# `api` is gensim.downloader; "glove-twitter-25" is the model identifier handed to it.
# NOTE(review): presumably 25-dimensional GloVe vectors trained on Twitter text,
# fetched/cached by the downloader on first use — confirm against the gensim-data catalogue.
word_model_twitter25 = api.load("glove-twitter-25")

# Disabled alternatives: load vectors from a local GoogleNews word2vec binary instead.
# on pc
# word_model_google = KeyedVectors.load_word2vec_format("C:/Users/jhec8/Documents/Northwestern_SROP/GoogleNews-vectors-negative300.bin", binary=True)

# on mac
# word_model = KeyedVectors.load_word2vec_format("/Users/johnhenrycruz/Desktop/Northwestern_SROP/GoogleNews-vectors-negative300.bin", binary=True)

In [8]:
# create dictionary of counts for each word in model
# The earlier version stored key_to_index[word], i.e. the word's row position
# (0 for the first vocabulary entry, largest for the last), which is not a
# count and runs in the opposite direction. In gensim 4 the per-word count is
# exposed through get_vecattr(word, "count").
# NOTE(review): for vectors loaded from a word2vec-format file without a vocab
# file, gensim is expected to fill "count" with rank-based pseudo-counts
# (earlier entries get larger values) — confirm for this model.
twitter25_dict = {
    word: word_model_twitter25.get_vecattr(word, "count")
    for word in word_model_twitter25.index_to_key
}

In [9]:
# get the frequency of each word in dictionary
# grand total of every value currently stored in the dictionary
total_words = sum(twitter25_dict.values())

# overwrite each value, in place, with its share of that total
for key in twitter25_dict:
    twitter25_dict[key] /= total_words

## Originality Algo 1
### tf-idf scikit-learn

In [194]:
def tfidf_scikit_learn(cleaned_responses, number_of_clusters=2, top_n_terms=5, random_state=None):
    """Cluster responses on their TF-IDF vectors and print each cluster's top terms.

    Parameters
    ----------
    cleaned_responses : iterable of str
        One cleaned response per item (strings, i.e. produced with
        ``join_list=True``), fed to ``TfidfVectorizer.fit_transform``.
    number_of_clusters : int, default 2
        Number of k-means clusters. Used both to fit the model and to decide
        how many clusters are printed (previously the fit hard-coded 2).
    top_n_terms : int, default 5
        How many of the highest-weighted centroid terms to print per cluster.
    random_state : int or None, default None
        Passed to ``KMeans``; supply an integer for reproducible clusters.
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    None. One line per cluster is printed as "Cluster <i>: <terms>".
    """
    tfIdfVectorizer = TfidfVectorizer(use_idf=True)
    tfIdf = tfIdfVectorizer.fit_transform(cleaned_responses)

    kmeans = KMeans(n_clusters=number_of_clusters, random_state=random_state).fit(tfIdf)

    # argsort is ascending; reversing each row puts the heaviest terms first
    order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older installs.
    if hasattr(tfIdfVectorizer, "get_feature_names_out"):
        terms = tfIdfVectorizer.get_feature_names_out()
    else:
        terms = tfIdfVectorizer.get_feature_names()

    for i in range(number_of_clusters):
        top_terms = [terms[ind] for ind in order_centroids[i, :top_n_terms]]
        print("Cluster {}: {}".format(i, ' '.join(top_terms)))


In [195]:
def get_originality_tfidf_scikit_learn(df, stopwords_list, remove_sw, join_list):
    """Clean df's responses, print them, then run the TF-IDF clustering on them."""
    cleaned_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)
    processed = cleaned_df['response_processed'].to_list()

    # echo the cleaned responses before clustering
    print(processed)

    tfidf_scikit_learn(processed)
    
   

In [196]:
# Run the TF-IDF clustering on the "Key" sheet: spaCy stop words removed,
# tokens re-joined into one string per response.
get_originality_tfidf_scikit_learn(
    data_key,
    stopwords_spacy,
    remove_sw=True,
    join_list=True,
)

['turn lock', 'cut box', 'pendant', 'open door', 'cut tape box', 'stab', 'scratch car', 'door opener', 'knife', 'letter opener', 'chime', 'gift', 'unlock door', 'open soup', 'start car', 'wear necklace', 'open door', 'use decoration', 'paperweight', 'scratch lottery ticket', 'open', 'unlock door', 'scratch scratch ticket', 'cut open plastic', 'scratch someone car dont like', 'unlock door', 'scratch lottery ticket', 'key car', 'decoration', 'necklace', 'lock', 'necklace', 'bookmark', 'decoration', 'weapon', 'unlock', 'cut', 'throw', 'lock unlock door', 'pendant', 'weapon', 'cutting utensil', 'unlock door', 'open box', 'dog attention', 'poke', 'unlock', 'use art piece', 'jingle amuse baby', 'unlock door', 'scratch', 'fidget', 'distract cat', 'unlock lock', 'cut', 'comb', 'break', 'open door', 'stab', 'pry open tab tough', 'carve art surface', 'slash someone tire', 'opening door', 'weapon', 'bottle opener', 'currency', 'opening chest', 'melt scrap metal', 'unlock door', 'start car', 'cut 

### Write Results into Excel Sheet

In [10]:
# Prompt names and their DataFrames, kept in the same order so that
# position i in one list corresponds to position i in the other.
prompts_list = [
    'cup',
    'key',
    'rope',
    'brick',
    'chair',
    'pencil',
    'shoe',
]
data_list = [
    data_cup,
    data_key,
    data_rope,
    data_brick,
    data_chair,
    data_pencil,
    data_shoe,
]

In [11]:
# write results df of each dataset for specific methods
def write_results_excel(method, stopwords, model, output_path='novelty_algo_2_results.xlsx'):
    """Run a scoring method on every prompt and write one Excel sheet per prompt.

    Parameters
    ----------
    method : callable
        Called as ``method(data, prompt, stopwords, model)`` for each pair
        from the module-level ``data_list`` / ``prompts_list``; its return
        value must provide ``to_excel`` (e.g. a DataFrame).
    stopwords
        Forwarded unchanged to ``method``.
    model
        Forwarded unchanged to ``method``.
    output_path : str, default 'novelty_algo_2_results.xlsx'
        Workbook to create; pass a different name when writing a new set of
        results instead of editing this function.

    Returns
    -------
    None. The workbook is written to ``output_path``.
    """
    # Score every prompt before opening the workbook, so a failure inside
    # `method` cannot leave a partially written file behind.
    results = [
        (prompt, method(data, prompt, stopwords, model))
        for prompt, data in zip(prompts_list, data_list)
    ]

    # The context manager saves and closes the workbook even on error;
    # ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0.
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        for prompt, df in results:
            df.to_excel(writer, sheet_name = prompt, index = False)

In [12]:
# write_results_excel(get_novelty_word2vec_avg, stopwords_spacy, word_model_twitter25)

In [13]:
# write_results_excel(get_novelty_word2vec_sif_cosinesim, stopwords_spacy, word_model_twitter25)

Algo Design Brainstorming:
* Algo Design w/o scikit-learn
    * TODO: fill in
    * TODO: fill in

* Algo Design w/ scikit-learn

* To Do List

