# Virtual Internships Project

## Import Modules and Data

First we will import all relevant modules. We will then import our csv as a pandas dataframe for easy use.

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib as plt

In [2]:
df = pd.read_csv('virtualInternshipData_ADS2001.csv', encoding= 'unicode_escape') #read the csv provided
df = df.drop("Unnamed: 0",axis=1) #drop the unnamed column
df

Unnamed: 0.1,Unnamed: 0,userIDs,implementation,Line_ID,ChatGroup,content,group_id,RoleName,roomName,m_experimental_testing,m_making_design_choices,m_asking_questions,j_customer_consultants_requests,j_performance_parameters_requirements,j_communication,OutcomeScore,wordCount
0,1,1,a,1,PRNLT,Hello team. Welcome to Nephrotex!,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,5
1,2,1,a,2,PRNLT,I'm Maria Williams. I'll be your design adviso...,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,11
2,3,1,a,3,PRNLT,I'm here to help if you have any questions.,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,9
3,4,1,a,4,PRNLT,Please introduce yourselves with the name you ...,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,1,0,0,4,51
4,5,1,a,5,PRNLT,I just want to make sure everyone has found th...,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19175,19176,392,o,19179,PESPVP,yes,6,Player,Reflection team discussion of first batch results,0,0,0,0,0,0,5,1
19176,19177,388,o,19180,PESPVP,sounds good,6,Player,Reflection team discussion of first batch results,0,0,0,0,0,0,8,2
19177,19178,367,o,19181,PESPVP,"Well, we are out of time for our meeting.",6,Mentor,Reflection team discussion of first batch results,0,0,0,0,0,0,4,9
19178,19179,393,o,19182,PESPVP,Precisely,6,Player,Reflection team discussion of first batch results,0,0,0,0,0,0,4,1


## Stop Words
When working with text or natural language, there are certain words that don't add any value to a sentence e.g. "this" and so we will need to remove these words. 

In [4]:
from nltk.corpus import stopwords 

df['content'] = df['content'].str.lower() #make all the letters lowercase for easy of use

stop = stopwords.words('english') #import english stopwords from nltk 
df['content_without_stopwords'] = df['content'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) #remove all stopwords in content column
print('With stop words: ' + df['content_without_stopwords'][1])
print('Without stop words: ' + df['content'][1])

With stop words: i'm maria williams. i'll design advisor internship.
Without stop words: i'm maria williams. i'll be your design advisor for your internship.


## Tokenizing

In natural language processing, each document or sentence can thought of as a bag of words in the form of a list where each element is a word...

In [5]:
from nltk.tokenize import word_tokenize
df['content_tokenized'] = df.apply(lambda row: word_tokenize(row['content_without_stopwords']), axis=1) #tokenize all the content

Splitting the document up like this is called <u>tokenizing</u>.

In [6]:
df['content_tokenized'][0]

['hello', 'team', '.', 'welcome', 'nephrotex', '!']

In [7]:
#all words found in the content
word_set = set().union(*df['content_tokenized'])
print(word_set)

sors', 'handy', 'maybe', 'mention', 'excpet', 'url', 'overcome', 'somewhere', 'gladly', 'cost/performance/etc', 'ahve', 'cause', 'reasonably', 'surprises', 'testing', 'to', 'beta-t', 'papers', 'or', '......', 'coming', 'impacted', 'nice', 'standard', 'guard', 'characteristics', 'been', 'finn', 'ramping', 'concise', 'andy', 'okie', 'sipes', 'cheaper', 'longer', 'takes', 'adjusted', '1:30pm-2:10pm', '12:30am', 'particular', 'checklist', 'needs', 'not', 'pamda', 'process', '65.56', 'tim', 'refers', 'join', 'inital', '75ng/ml', '10-17', 'listening', 'mins', 'dialyzers', 'validate', 'thirdnow', 'dropped', 'viewing', 'divison', 'witnesses', 'retrieve', "'t", 'motion', 'jared', 'faster', 'bud', '5.pmma', 'topics', 'recommended', 'simulations', 'boundaries', 'gave', 'profitability', 'especially', 'issues', 'pricing', 'posssible', 'homework', 'respect', 'intended', 'don', 'unnecessarily', 'funny', 'gang', '@', 'former', 'state', 'tehrani', 'lab', 'hearing', 'refreshed', 'mates', 'agreeing', 'co

In [8]:
#splitting each sublist into all content said by each user
user_dict = [] #creating an empty list to store all the sublist of all the words said by each user
for idx in df['userIDs'].unique(): #loop over every unique id
    lst = [word_tokenize(i) for i in df[df['userIDs'] == idx]['content_without_stopwords'].to_list()] #tokenize the contents of each row
    tokenized_sents = [item for sublist in lst for item in sublist] #re-formatting 
    user_dict.append(tokenized_sents) #append the sublist into the user_dict list

In [9]:
print(user_dict[0]) #printing all content that user 1 said

ion', 'clicking', '``', 'x', "''", 'deliverables', 'list', '.', 'cancel', 'submissions', 'alex', 'viewed', 'them', ',', 'make', 'sure', 'included', 'everything', 'required', 'submitting', '.', 'soon', 'submit', 'notebook', 'alex', 'review', 'send', 'feedback', 'soon', 'can', '.', 'waiting', ',', 'assist', 'teammates', 'submitted', 'deliverable', '.', 'team', 'able', 'move', 'next', 'task', 'team', 'members', "'", 'notebooks', 'witnessed', '.', 'back', 'undergrad', ',', 'interned', 'mechanical', 'engineering', 'company', 'designed', 'exoskeletons', 'rescue', 'workers', '.', 'working', 'different', 'project', ',', 'put', 'example', 'summary', 'used', 'previous', 'project', 'shared', 'space', '.', 'alex', 'pretty', 'similar', 'boss', 'internship', ',', 'welcome', 'use', 'template', 'write', 'own', '.', 'note', 'language', 'length', 'response', '.', 'please', 'pay', 'close', 'attention', 'citation', 'methods', 'used', 'example', '.', 'know', 'alex', 'stickler', 'following', 'correct', 'cit

In [10]:
#creating a list where each sublist is a dictionary that counts the frequency of all the words that are used by a user
word_dict = [] #create an empty list to store all the words used
for i in range(len(user_dict)):
    #creating dictionaries to keep count of the words
    temp_word_dict = dict.fromkeys(word_set, 0)
    word_dict.append(temp_word_dict) #append each dictionary to the word dictionary 
    
    #count the words in the bag of words for each user
    for word in user_dict[i]:
        word_dict[i][word] += 1

In [11]:
print(word_dict[0]) #printing the dictionary of user 1

: 0, 'suited': 0, 'clean': 0, 'helllo': 0, 'efore': 0, 'hohn': 0, 'opther': 0, 'conducted': 0, 'gina': 0, 'leaving': 0, '294/3hrs=': 0, 'series': 0, 'fared': 0, 'out': 0, 'misunderstood': 0, 'shuffled': 0, 'matter': 0, '3/5': 0, 'successful': 3, 'two': 0, 'sprices': 0, 'selling': 0, 'dean': 0, 'broad': 0, 'poorly-written': 0, 'write': 6, 'recently': 0, '^': 0, 'beta-thrombogloublin': 0, 'wording': 0, 'inc': 0, 'nothing': 0, 'devies': 0, 'thanks1': 0, 'patience': 0, 'later': 0, 'affest': 0, 'cap': 0, 'that': 0, 'hydrophilic,6': 0, 'tool': 0, 'acquire': 0, 'apa': 0, '//nephrotex.com': 0, 'isnstead': 0, 'benefit': 0, '14': 0, 'mg3/mg2': 0, 'cells': 0, 'si': 0, 'sells': 0, 'caused': 0, 'coast': 0, '.5': 0, 'explaining': 0, '17so': 0, 'yours': 0, 'discussing': 0, 'ignore': 0, '12': 0, 'guys': 0, 'evey': 0, 'definitively': 0, 'puns': 0, 'gong': 0, 'trouble': 0, 'insignificant': 0, 'yields': 0, 'someones': 0, 'dealings': 0, 'costliest': 0, 'nd': 0, 'sequel': 0, 'has': 0, 'fulfilling': 0, 'req

## TF-IDF

Rather than just counting, we can use <u>TF-IDF</u>, short for term frequency-inverse document frequency to rank a word on it's importance.

The <u>TF-IDF</u> score of a word $w$ is:
$$tf(w) * idf(w)$$
Where $tf(w) =$ frequency of word in a document / total number of words in the document

And where $idf(w) = log$(number of documents / number of documents that contain word $w$)

In [12]:
def computeTF(word_dict, user_dict):
    tf_dict = {}
    user_dict_count = len(user_dict)
    for word, count in word_dict.items():
        tf_dict[word] = count / float(user_dict_count)
    return tf_dict

def computeIDF(doc_list):
    import math
    idf_dict = {}
    n = len(doc_list)

    idf_dict = dict.fromkeys(doc_list[0].keys(),0)
    for doc in doc_list:
        for word, val in doc.items():
            if val > 0:
                idf_dict[word] += 1

    for word, val in idf_dict.items():
        idf_dict[word] = math.log(n / float(val), 10)

    return idf_dict

def computeTFIDF(tf_user_dict, idfs):
    tfidf = {}
    for word, val in tf_user_dict.items():
        tfidf[word] = val * idfs[word]
    return tfidf

### Example of TF-IDF

Suppose we have two documents as listed below. The calculation of <u>TF-IDF</u> for the term "hello" is performed as: 

The <u>TF</u>, is the frequency that the word "hello" appears in each document. In each document, the word appears once; but as document 1 (index 0) has more words, its relative frequency is smaller.

$$ tf('hello', doc1) = \frac{1}{6} \approx 0.166 $$
$$ tf('hello', doc2) = \frac{1}{3} = 0.333 $$

An <u>IDF</u> accounts for the ratio of documents that include the word "hello". In this case, we have a total of two documents and all of them include the word "hello".

$$ idf('hello', documents) = log(\frac{2}{2}) = 0 $$

So <u>TF-IDF</u> is 0 for the word "hello" implying that the word is not very informative as it appears in all documents.

$$ tfidf('hello', doc1, documents) = 0.166 * 0 = 0 $$
$$ tfidf('hello', doc2, documents) = 0.333 * 0 = 0 $$

Take the word "team", it occurs once only in document 1:

$$ tf('team', doc1) = \frac{1}{6} \approx 0.166 $$
$$ tf('team', doc2) = \frac{0}{3} = 0 $$
$$ idf('team', documents) = log(\frac{2}{1}) \approx 0.301 $$

Therefore,

$$ tfidf('team', doc1, documents) = tf('team', doc1) * idf('team', documents) = 0.166 * 0.301 \approx 0.05 $$
$$ tfidf('team', doc2, documents) = tf('team', doc2) * idf('team', documents) = 0 * 0.301 = 0 $$

In [13]:
doc1 = word_tokenize(df['content_without_stopwords'][0])
doc2 = word_tokenize(df['content_without_stopwords'][5])

word_set_example = set(doc1).union(set(doc2))
word_dict1 = dict.fromkeys(word_set_example, 0)
word_dict2 = dict.fromkeys(word_set_example, 0)

for word in doc1:
    word_dict1[word] += 1

for word in doc2:
    word_dict2[word] += 1

pd.DataFrame([word_dict1, word_dict2])

Unnamed: 0,team,brandon,!,hello,.,welcome,nephrotex
0,1,0,1,1,1,1,1
1,0,1,1,1,0,0,0


In [14]:
tf1_example = computeTF(word_dict1, doc1)
tf2_example = computeTF(word_dict2, doc2)

idf_example = computeIDF([word_dict1, word_dict2])

tfidf1_example = computeTFIDF(tf1_example, idf_example)
tfidf2_example = computeTFIDF(tf2_example, idf_example)

print("hello")
print("tf for document 1: " + str(tf1_example['hello']))
print("tf for document 2: " + str(tf2_example['hello']))
print("idf for documents: " + str(idf_example['hello']))
print("tfidf for document 1: " + str(tfidf1_example['hello']))
print("tfidf for document 2: " + str(tfidf2_example['hello']))
print("")
print("team")
print("tf for document 1: " + str(tf1_example['team']))
print("tf for document 2: " + str(tf2_example['team']))
print("idf for documents: " + str(idf_example['team']))
print("tfidf for document 1: " + str(tfidf1_example['team']))
print("tfidf for document 2: " + str(tfidf2_example['team']))

hello
tf for document 1: 0.16666666666666666
tf for document 2: 0.3333333333333333
idf for documents: 0.0
tfidf for document 1: 0.0
tfidf for document 2: 0.0

team
tf for document 1: 0.16666666666666666
tf for document 2: 0.0
idf for documents: 0.30102999566398114
tfidf for document 1: 0.05017166594399686
tfidf for document 2: 0.0


In [15]:
idfs = computeIDF(word_dict) #compute idf
tfidf = [] #create empty list to append tf-idf values
for i in range(len(user_dict)): 
    temp_tf_user_dict = computeTF(word_dict[i], user_dict[i]) #compute tf
    temp_tfidf_user_dict = computeTFIDF(temp_tf_user_dict, idfs) #compute tf-idf
    tfidf.append(temp_tfidf_user_dict) #append tf-idf values into list

In [18]:
df_tfidf = pd.DataFrame.from_records(tfidf) #make the matrix into a dataframe
df_tfidf

Unnamed: 0,another,collected,utilized,must,refection,feed,austins,directly,saves,holly,...,machines,reading,marketabiltiy,accepts,reliablitity,exceptional,ian,problemo,brendan,edited
0,0.001370,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.005369,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.004918,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,0.001164,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
389,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
390,0.000000,0.0,0.000000,0.003354,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
391,0.000000,0.0,0.007594,0.004020,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Analysis

In [19]:
outcome_score = df.drop_duplicates(subset=['userIDs'])['OutcomeScore'].to_numpy() #grab outcome score of each individual user
df_tfidf['OutcomeScore'] = outcome_score 
df_tfidf

Unnamed: 0,another,collected,utilized,must,refection,feed,austins,directly,saves,holly,...,reading,marketabiltiy,accepts,reliablitity,exceptional,ian,problemo,brendan,edited,OutcomeScore
0,0.001370,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
1,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2,0.005369,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
3,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
4,0.004918,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,0.001164,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
389,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
390,0.000000,0.0,0.000000,0.003354,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
391,0.000000,0.0,0.007594,0.004020,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
