# Flexibility Algo

## Algorithm to Automate Flexibility Scoring

### Import Packages

In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from functools import reduce
import openpyxl
import xlsxwriter

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.util import ngrams
from nltk import ngrams, FreqDist
from nltk.lm import NgramCounter
import string
import gensim
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

from spacy.lang.en.stop_words import STOP_WORDS

from collections import Counter
import itertools

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer

from nltk.cluster.kmeans import KMeansClusterer

### Put Data from CSV Files into Dataframes

In [2]:
# individual df's for each prompt, read from the SemDis test CSV files

# Directory that holds the test CSVs. Switch machines by changing this one line.
# when on pc
TEST_DATA_DIR = "C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/test/semdis"
# when on mac
# NOTE(review): the mac paths used here before ended in ".csv.xlsx" rather than
# ".csv" — confirm the file names on that machine before switching.
# TEST_DATA_DIR = "/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/test/semdis"

data_test_cup = pd.read_csv(f"{TEST_DATA_DIR}/autdata_test_cup_semdis.csv")
data_test_key = pd.read_csv(f"{TEST_DATA_DIR}/autdata_test_key_semdis.csv")
data_test_rope = pd.read_csv(f"{TEST_DATA_DIR}/autdata_test_rope_semdis.csv")
data_test_brick = pd.read_csv(f"{TEST_DATA_DIR}/autdata_test_brick_semdis.csv")
data_test_chair = pd.read_csv(f"{TEST_DATA_DIR}/autdata_test_chair_semdis.csv")
data_test_pencil = pd.read_csv(f"{TEST_DATA_DIR}/autdata_test_pencil_semdis.csv")
data_test_shoe = pd.read_csv(f"{TEST_DATA_DIR}/autdata_test_shoe_semdis.csv")

### Preprocessing

In [3]:
# nltk corpus stop words (the 'english' list from nltk.corpus.stopwords)
stopwords_nltk = stopwords.words('english')
# spacy stop words (STOP_WORDS as imported from spacy.lang.en.stop_words)
stopwords_spacy = STOP_WORDS

In [4]:
# Working stop word list: the spaCy stop words followed by three extra filler words.
stopwords_edited = [*stopwords_spacy, "thing", "things", "use"]

In [5]:
# method to clean the responses
def process_text(text, stopwords_list, remove_sw, join_list):
    """Normalise one free-text response into lemmatized, lowercase tokens.

    Steps, in order: apply a few hard-coded word fixes, turn "/" and "-" into
    spaces, strip punctuation, tokenize, lowercase, optionally drop stop words,
    and lemmatize with WordNet.

    Parameters
    ----------
    text : str
        Raw response. A missing value (None/NaN) is treated as an empty
        response; any other non-string scalar is converted with ``str``.
    stopwords_list : iterable of str
        Words to remove when ``remove_sw`` is true.
    remove_sw : bool
        Whether to filter out the stop words.
    join_list : bool
        If true, return the tokens joined by single spaces instead of a list.

    Returns
    -------
    list of str, or a single str when ``join_list`` is true.
    """
    # Missing responses used to crash inside re.sub; treat them as empty text.
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)

    # hardcoded for special situations; matched case-insensitively because the
    # text has not been lowercased yet at this point
    special_cases = (
        ("wedging", "wedge"),
        ("exersizing", "exercising"),
        ("thrown", "throw"),
    )
    for pattern, replacement in special_cases:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    # split slash/hyphen compounds into separate words before dropping punctuation
    text = re.sub("/|-", " ", text)
    text = text.translate(str.maketrans('', '', string.punctuation))

    tokens = [w.lower() for w in word_tokenize(text)]

    if remove_sw:
        # a set gives constant-time membership tests instead of scanning a list per token
        stopword_set = set(stopwords_list)
        tokens = [word for word in tokens if word not in stopword_set]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]

    if join_list:
        tokens = ' '.join(tokens)

    return tokens

### General Functions

In [6]:
# method to get a list of participants
def get_id_list(df):
    """Return the distinct values of ``df['id']`` as a list in ascending order."""
    unique_ids = list(df['id'].unique())
    unique_ids.sort()
    return unique_ids

In [7]:
# method to add a new column
# new column are cleaned responses
def get_cleaned_responses(df, stopwords_list, remove_sw, join_list):
    """Return a deep copy of ``df`` with an added ``response_processed`` column.

    Each value of ``df['response']`` is passed through ``process_text`` with the
    given stop word and joining options; the input frame itself is not modified.
    """
    cleaned = [
        process_text(raw, stopwords_list, remove_sw, join_list)
        for raw in df['response'].tolist()
    ]

    result = df.copy(deep=True)
    result['response_processed'] = cleaned
    return result

### Word2Vec Models for Embeddings

In [8]:
# load pretrained model
# "glove-twitter-25" is requested by name through gensim.downloader (imported as `api`).
# NOTE(review): the first call presumably downloads the vectors and needs network access — confirm.
word_model_twitter25 = api.load("glove-twitter-25")

# on pc
# word_model_google = KeyedVectors.load_word2vec_format("C:/Users/jhec8/Documents/Northwestern_SROP/GoogleNews-vectors-negative300.bin", binary=True)

# on mac
# word_model = KeyedVectors.load_word2vec_format("/Users/johnhenrycruz/Desktop/Northwestern_SROP/GoogleNews-vectors-negative300.bin", binary=True)

In [9]:
# Map every vocabulary word of the model to the integer stored for it in key_to_index.
# NOTE(review): key_to_index looks like the word's position in the vocabulary rather
# than an occurrence count — confirm that this is the value wanted as a "count".
twitter25_dict = {
    word: word_model_twitter25.key_to_index[word]
    for word in word_model_twitter25.index_to_key[:len(word_model_twitter25)]
}

In [10]:
# Replace each stored value with its share of the grand total.
# Re-running this cell divides the already-normalised values again.
total_words = sum(twitter25_dict.values())

for key, value in twitter25_dict.items():
    twitter25_dict[key] = value / total_words

## Flexibility Algo 1
### tf-idf scikit-learn + clustering
### Each response as a document

In [249]:
def get_tfidf_vector(num_clusters, responses, seed=None):
    """Cluster responses by TF-IDF similarity and return the clusters as a frame.

    Each response is vectorised with scikit-learn's ``TfidfVectorizer`` and the
    dense vectors are grouped with NLTK's ``KMeansClusterer`` using cosine
    distance (25 repeats, empty clusters avoided).

    Parameters
    ----------
    num_clusters : int
        Number of k-means clusters to form.
    responses : list of str
        Cleaned responses; each one is treated as its own document.
    seed : int, optional
        Seed for the clusterer's random generator. With the default ``None`` the
        clusterer picks its own generator, so assignments can differ between runs.

    Returns
    -------
    pandas.DataFrame
        One row per cluster with columns ``category`` (cluster label) and
        ``responses`` (list of the responses assigned to that cluster).
    """
    import random  # local import: only needed to build the optional seeded generator

    # vectorize the phrases with TF-IDF weighting
    tfidf = TfidfVectorizer(use_idf=True).fit_transform(responses)

    # nltk kmeans with cosine distance (scikit-learn's KMeans is euclidean only)
    # NOTE(review): a response whose TF-IDF row is all zeros would make
    # cosine_distance divide by zero — confirm such rows cannot reach this point.
    rng = random.Random(seed) if seed is not None else None
    kmeans = KMeansClusterer(
        num_clusters,
        distance=nltk.cluster.util.cosine_distance,
        repeats=25,
        avoid_empty_clusters=True,
        rng=rng,
    )
    assigned_clusters = kmeans.cluster(tfidf.toarray(), assign_clusters=True)

    # pair every response with the cluster label it received
    results = pd.DataFrame()
    results['text'] = responses
    results['category'] = assigned_clusters

    # group the phrases under their cluster label
    results_dict = {k: g["text"].tolist() for k, g in results.groupby("category")}

    # one row per cluster: its label and the list of responses it contains
    clusters_df = pd.DataFrame(list(results_dict.items()), columns=['category', 'responses'])

    return clusters_df

In [250]:
def get_flexibility_score(flexibility_rating_df, num_clusters, responses):
    """Score each participant by how many distinct clusters their responses fall into.

    Parameters
    ----------
    flexibility_rating_df : pandas.DataFrame
        Frame with ``id`` and ``response_processed`` columns.
    num_clusters : int
        Number of clusters passed on to ``get_tfidf_vector``.
    responses : list of str
        The responses to cluster. Every non-empty ``response_processed`` value in
        ``flexibility_rating_df`` must appear here, otherwise the lookup below
        raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``flexibility``, one row per id in ascending order.
        Participants with no non-empty responses get a flexibility of 0.
    """
    clusters_df = get_tfidf_vector(num_clusters, responses)

    # phrase -> cluster label; a phrase listed more than once keeps the last label seen
    cluster_of_response = {
        phrase: label
        for label, phrases in zip(clusters_df.category, clusters_df.responses)
        for phrase in phrases
    }

    # every participant starts with an empty set of visited clusters
    clusters_by_participant = {pid: set() for pid in get_id_list(flexibility_rating_df)}

    # responses that are empty after cleaning were never clustered, so skip them
    non_empty = flexibility_rating_df[flexibility_rating_df.response_processed != '']
    for pid, phrase in zip(non_empty.id, non_empty.response_processed):
        clusters_by_participant[pid].add(cluster_of_response[phrase])

    # flexibility = number of distinct clusters the participant's responses landed in
    scores = [(pid, len(seen)) for pid, seen in clusters_by_participant.items()]
    flexibility_df = pd.DataFrame(scores, columns=['id', 'flexibility'])
    return flexibility_df

In [251]:
def get_flexibility_tfidf_scikit_learn_clustering(df, stopwords_list, num_clusters, remove_sw, join_list):
    """Run the full pipeline: clean the responses, cluster them, score participants.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``id`` and ``response`` columns.
    stopwords_list : iterable of str
        Stop words handed to the cleaning step.
    num_clusters : int
        Number of clusters to form over all responses.
    remove_sw : bool
        Whether the cleaning step removes stop words.
    join_list : bool
        Whether the cleaning step joins tokens back into one string.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``get_flexibility_score`` (columns ``id`` and
        ``flexibility``).
    """
    processed_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)

    # each individual response is one document; responses that are empty after
    # cleaning are left out of the clustering input
    non_empty_responses = [
        response
        for response in processed_df['response_processed'].tolist()
        if response != ''
    ]

    return get_flexibility_score(processed_df, num_clusters, non_empty_responses)

In [254]:
# Example run on the brick prompt with 15 clusters, stop words removed, tokens joined.
# NOTE(review): data_official_brick is assigned further down, in the
# "Comparing Algo Results with Human Ratings" section — that cell must run first.
get_flexibility_tfidf_scikit_learn_clustering(data_official_brick, stopwords_edited, 15, True, True)

Unnamed: 0,id,flexibility
0,1087,4
1,1093,4
2,1094,5
3,1102,2
4,1104,2
...,...,...
85,1603,3
86,1610,5
87,1614,2
88,1621,5


In [None]:
# Displays every raw brick response as a plain list (the full column, not a preview).
data_official_brick['response'].tolist()

## Comparing Algo Results with Human Ratings

In [43]:
# Workbook with the human flexibility ratings; it has one sheet per prompt.
# Switch machines by changing this one line.
# when on pc
OFFICIAL_FLEX_XLSX = "C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/flexibility/official/autdata_flex_results.xlsx"
# when on mac
# OFFICIAL_FLEX_XLSX = "/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/flexibility/official/autdata_flex_results.xlsx"

# A list of sheet names makes read_excel parse the workbook once and return a
# {sheet_name: DataFrame} dict, instead of re-reading the file for every sheet.
official_sheets = pd.read_excel(
    OFFICIAL_FLEX_XLSX,
    sheet_name=['Cup', 'Key', 'Rope', 'Brick', 'Chair', 'Pencil', 'Shoe', 'Box'],
)

data_official_cup = official_sheets['Cup']
data_official_key = official_sheets['Key']
data_official_rope = official_sheets['Rope']
data_official_brick = official_sheets['Brick']
data_official_chair = official_sheets['Chair']
data_official_pencil = official_sheets['Pencil']
data_official_shoe = official_sheets['Shoe']
data_official_box = official_sheets['Box']

In [44]:
# Prompt name -> human-rated frame, in the order the comparison loop walks them.
official_frames_by_prompt = {
    'box': data_official_box,
    'brick': data_official_brick,
    'chair': data_official_chair,
    'cup': data_official_cup,
    'key': data_official_key,
    'pencil': data_official_pencil,
    'rope': data_official_rope,
    'shoe': data_official_shoe,
}
prompts_list = list(official_frames_by_prompt.keys())
data_list = list(official_frames_by_prompt.values())

In [45]:
# Spot check: every box response given by participant 1094.
test = data_official_box.loc[data_official_box['id'] == 1094]
test

Unnamed: 0,id,stim,response,Category
0,1094,Box,to store things,1.0
1,1094,Box,to build a fort,4.0
2,1094,Box,as an umbrella,1.0
3,1094,Box,to catch a bunny,4.0


In [46]:
# Number of distinct Category values for that participant (a missing value counts as one).
test['Category'].nunique(dropna=False)

2

In [232]:
def print_flexibility_corrs(num_clusters=15):
    """Pair human and algorithmic flexibility scores for every prompt.

    Walks the module-level ``prompts_list`` and ``data_list`` together. For each
    prompt it prints the prompt name, counts the distinct ``Category`` values per
    participant (the human rating), runs the TF-IDF clustering method with the
    module-level ``stopwords_edited``, and inner-joins the two results on ``id``.

    Despite the name, no correlation is computed here; the merged frames are
    returned for the caller to analyse.

    Parameters
    ----------
    num_clusters : int, default 15
        Number of clusters used by the algorithmic method.

    Returns
    -------
    list of pandas.DataFrame
        One frame per prompt with columns ``id``, ``rating`` and ``flexibility``.
    """
    flexibility_df_list = []
    for prompt, data in zip(prompts_list, data_list):
        print(prompt)

        # human rating: number of distinct categories among a participant's responses
        # NOTE(review): a missing Category value is counted as its own category
        # by unique() — confirm that is intended.
        rater_counts = {
            participant: len(data[data.id == participant]['Category'].unique())
            for participant in get_id_list(data)
        }
        flexibility_df_rater = pd.DataFrame(list(rater_counts.items()), columns=['id', 'rating'])

        flexibility_df_method = get_flexibility_tfidf_scikit_learn_clustering(
            data, stopwords_edited, num_clusters, True, True
        )

        df_cd = pd.merge(flexibility_df_rater, flexibility_df_method, how='inner', on='id')
        flexibility_df_list.append(df_cd)

    return flexibility_df_list

In [235]:
# Runs the comparison for every prompt in prompts_list.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt during the
# first prompt, so flex_results_list was not assigned by that run.
flex_results_list = print_flexibility_corrs()

box


  return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))


KeyboardInterrupt: 

### Write Results into Excel Sheet

In [15]:
# NOTE: this rebinds the module-level prompts_list / data_list to the test CSV frames;
# print_flexibility_corrs reads these same globals, so calls made after this cell
# operate on the test data instead of the official ratings.
prompts_list = ['cup', 'key', 'rope', 'brick', 'chair', 'pencil', 'shoe']
data_list = [data_test_cup, data_test_key, data_test_rope, data_test_brick, data_test_chair, data_test_pencil, data_test_shoe]

In [20]:
# # write results df of each dataset for specific methods
# def write_results_excel(method, stopwords, remove_sw, join_list):
#     # change this when writing new sheet
#     writer = pd.ExcelWriter('originality_cv_freq_cluster_results.xlsx', engine='xlsxwriter')

#     for i in range(len(prompts_list)):
#         df = method(data_list[i], stopwords, remove_sw, join_list)
#         df.to_excel(writer, sheet_name = prompts_list[i], index = False)
#     writer.save()

In [17]:
# write_results_excel(get_originality_count_vectorizer, stopwords_spacy, True, True)

Algo Design Brainstorming:

To Do List
- [ ] brainstorm strategy
