# Fluency Algo

## Algorithm to Automate Fluency Scoring

### Import Packages

In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from functools import reduce
import openpyxl
import xlsxwriter

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.util import ngrams
from nltk import ngrams, FreqDist
from nltk.lm import NgramCounter
import string
import gensim
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

from spacy.lang.en.stop_words import STOP_WORDS

from collections import Counter
import itertools

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer

from nltk.cluster.kmeans import KMeansClusterer

from scipy import stats
from scipy.stats import pearsonr

### Load Data from CSV Files into DataFrames

In [2]:
# individual DataFrames, one per prompt CSV

# Folder that holds the per-prompt CSV files. Switching machines only requires
# changing which DATA_DIR line is active.
# when on pc
DATA_DIR = "C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis"
# when on mac
# DATA_DIR = "/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis"


def _read_official_csv(prompt, data_dir=DATA_DIR):
    """Read the CSV for one prompt from data_dir.

    The file is expected to be named "autdata_official_<prompt>_semdis.csv".
    """
    return pd.read_csv("{}/autdata_official_{}_semdis.csv".format(data_dir, prompt))


data_official_cup = _read_official_csv("cup")
data_official_key = _read_official_csv("key")
data_official_rope = _read_official_csv("rope")
data_official_brick = _read_official_csv("brick")
data_official_chair = _read_official_csv("chair")
data_official_pencil = _read_official_csv("pencil")
data_official_shoe = _read_official_csv("shoe")
data_official_box = _read_official_csv("box")

### Preprocessing

In [3]:
# Two candidate stop-word sources, kept side by side so either can be passed on.
# spaCy's English stop words
stopwords_spacy = STOP_WORDS
# NLTK's English stop-word corpus
stopwords_nltk = stopwords.words('english')

In [4]:
# spaCy stop words plus three extra words appended at the end of the list
stopwords_edited = list(stopwords_spacy) + ["thing", "things", "use"]

In [5]:
# method to clean the responses
def process_text(text, stopwords_list, remove_sw, join_list):
    """Normalize one free-text response into lowercase, lemmatized tokens.

    Steps, in order: apply the hard-coded word fix-ups, turn "/" and "-" into
    spaces, strip ASCII punctuation, tokenize, lowercase, optionally drop stop
    words, then lemmatize.

    Parameters
    ----------
    text : str
        Raw response. A missing value (None or float NaN, e.g. a blank cell in
        a loaded table) is treated as an empty response instead of raising.
        Any other non-string value is converted with str().
    stopwords_list : iterable of str
        Words to drop when remove_sw is true. They are matched against the
        lowercased tokens *before* lemmatization. Not read when remove_sw is
        false.
    remove_sw : bool
        Whether to remove stop words.
    join_list : bool
        If true, return the tokens joined by single spaces; otherwise return
        the token list.

    Returns
    -------
    str or list of str
        '' (or []) when nothing survives cleaning.
    """
    # Missing responses have nothing to clean; NaN is the only float for which
    # x != x, so this check needs no extra import.
    if text is None or (isinstance(text, float) and text != text):
        return '' if join_list else []
    text = str(text)

    # hard-coded for special situations; matched case-insensitively because
    # lowercasing only happens after tokenization
    special_cases = (
        ("wedging", "wedge"),
        ("exersizing", "exercising"),
        ("thrown", "throw"),
    )
    for pattern, replacement in special_cases:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    # treat slashes and hyphens as word separators, then drop remaining punctuation
    text = re.sub("/|-", " ", text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)

    tokens = [w.lower() for w in tokens]

    if remove_sw:
        # set membership avoids scanning the whole stop-word list per token
        stop_set = set(stopwords_list)
        tokens = [word for word in tokens if word not in stop_set]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]

    if join_list:
        tokens = ' '.join(tokens)

    return tokens

### General Functions

In [6]:
# method to get a list of participants
def get_id_list(df):
    """Return the distinct values of df['id'] as a list in ascending order."""
    return sorted(df['id'].unique())

In [7]:
# method to add a new column
# new column are cleaned responses
def get_cleaned_responses(df, stopwords_list, remove_sw, join_list):
    """Return a deep copy of df with an added 'response_processed' column.

    Each value of df['response'] is passed through process_text with the given
    stop-word settings; df itself is left untouched.
    """
    cleaned = [
        process_text(raw, stopwords_list, remove_sw, join_list)
        for raw in df['response'].tolist()
    ]

    result = df.copy(deep=True)
    result['response_processed'] = cleaned
    return result

## Fluency Algo 1
### counting rows belonging to a participant

In [8]:
def get_fluency_score(fluency_rating_df):
    """Count how many rows each participant has.

    Parameters
    ----------
    fluency_rating_df : pandas.DataFrame
        Must contain an 'id' column; every row counts as one response.

    Returns
    -------
    pandas.DataFrame
        Columns ['id', 'fluency'], one row per distinct id, sorted by id
        ascending, with a fresh 0..n-1 index.
    """
    # One grouped pass replaces filtering the whole frame once per participant.
    # Rows whose id is missing are left out by groupby.
    counts = fluency_rating_df.groupby('id', sort=True).size()
    return counts.reset_index(name='fluency')

In [9]:
def get_fluency(df, stopwords_list, remove_sw, join_list):
    """Compute a per-participant fluency score for one prompt's responses.

    Responses are cleaned with get_cleaned_responses, rows whose cleaned
    response is empty are dropped, and the remaining rows are counted per
    participant by get_fluency_score.

    Parameters
    ----------
    df : pandas.DataFrame
        Passed to get_cleaned_responses and get_fluency_score, which read its
        'response' and 'id' columns.
    stopwords_list, remove_sw, join_list
        Forwarded unchanged to get_cleaned_responses.

    Returns
    -------
    pandas.DataFrame
        The frame returned by get_fluency_score.
    """
    cleaned_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)

    # A cleaned response is empty when it is '' (join_list=True) or []
    # (join_list=False). A length test covers both; comparing str(value) to ''
    # never matches an empty list, because str([]) is '[]'.
    has_content = cleaned_df['response_processed'].map(len) > 0

    return get_fluency_score(cleaned_df[has_content])

In [10]:
# example: fluency per participant for the "box" prompt, stop words removed
get_fluency(
    data_official_box,
    stopwords_list=stopwords_edited,
    remove_sw=True,
    join_list=True,
)

Unnamed: 0,id,fluency
0,1087,4
1,1093,8
2,1094,4
3,1102,7
4,1104,2
...,...,...
84,1599,3
85,1603,5
86,1610,7
87,1614,3


## Comparing Algo Results with Human Ratings

In [11]:
# Parallel lists: data_list[i] is the DataFrame for prompts_list[i].
prompts_list = [
    'box',
    'brick',
    'chair',
    'cup',
    'key',
    'pencil',
    'rope',
    'shoe',
]
data_list = [
    data_official_box,
    data_official_brick,
    data_official_chair,
    data_official_cup,
    data_official_key,
    data_official_pencil,
    data_official_rope,
    data_official_shoe,
]

### Collect the Method Results

In [12]:
def save_fluency_scores():
    """Run get_fluency on the DataFrame of every prompt.

    Uses the module-level prompts_list, data_list and stopwords_edited, with
    stop-word removal and token joining both enabled.

    Returns a list of fluency DataFrames, one per entry of prompts_list and in
    the same order.
    """
    return [
        get_fluency(data_list[position], stopwords_edited, True, True)
        for position in range(len(prompts_list))
    ]

In [13]:
# per-prompt fluency tables, in prompts_list order
# (the variable keeps a "flex" prefix although it holds fluency results)
flex_results_list = save_fluency_scores()

In [14]:
# write out the fluency results
def write_fluency_results(flex_results_list):
    """Write one worksheet per prompt to 'fluency_methods_results_071421.xlsx'.

    Parameters
    ----------
    flex_results_list : list of pandas.DataFrame
        Result frames indexed in the same order as the module-level
        prompts_list; flex_results_list[i] is written to the sheet named
        prompts_list[i], starting at the top-left cell and without the index.
    """
    # The context manager saves and closes the workbook even if a sheet fails
    # to write; ExcelWriter.save() is no longer available in pandas 2.x.
    with pd.ExcelWriter('fluency_methods_results_071421.xlsx', engine='xlsxwriter') as writer:
        for i, prompt in enumerate(prompts_list):
            flex_results_list[i].to_excel(
                writer, sheet_name=prompt, startrow=0, startcol=0, index=False
            )

In [16]:
# write_fluency_results(flex_results_list)

Algo Design Brainstorming:

To Do List
- [ ] brainstorm strategy
