# Virtual Internships Project

## Import Modules and Data

First we will import all relevant modules. We will then import our csv as a pandas dataframe for easy use.

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('virtualInternshipData_ADS2001.csv', encoding= 'unicode_escape') #read the csv provided
df = df.drop("Unnamed: 0",axis=1) #drop the unnamed column
df

Unnamed: 0,userIDs,implementation,Line_ID,ChatGroup,content,group_id,RoleName,roomName,m_experimental_testing,m_making_design_choices,m_asking_questions,j_customer_consultants_requests,j_performance_parameters_requirements,j_communication,OutcomeScore,wordCount
0,1,a,1,PRNLT,Hello team. Welcome to Nephrotex!,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,5
1,1,a,2,PRNLT,I'm Maria Williams. I'll be your design adviso...,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,11
2,1,a,3,PRNLT,I'm here to help if you have any questions.,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,9
3,1,a,4,PRNLT,Please introduce yourselves with the name you ...,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,1,0,0,4,51
4,1,a,5,PRNLT,I just want to make sure everyone has found th...,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19175,392,o,19179,PESPVP,yes,6,Player,Reflection team discussion of first batch results,0,0,0,0,0,0,5,1
19176,388,o,19180,PESPVP,sounds good,6,Player,Reflection team discussion of first batch results,0,0,0,0,0,0,8,2
19177,367,o,19181,PESPVP,"Well, we are out of time for our meeting.",6,Mentor,Reflection team discussion of first batch results,0,0,0,0,0,0,4,9
19178,393,o,19182,PESPVP,Precisely,6,Player,Reflection team discussion of first batch results,0,0,0,0,0,0,4,1


## Stop Words
When working with text or natural language, there are certain words that don't add any value to a sentence e.g. "this" and so we will need to remove these words. 

In [3]:
from nltk.corpus import stopwords 
import string

df['content'] = df['content'].str.lower() #make all the letters lowercase for easy of use

stop = stopwords.words('english') #import english stopwords from nltk 
additional_stopwords = ["i'm", "i'll"] #add any additional stop words not from nltk
stop = stop + additional_stopwords

df['content_without_stopwords'] = df['content'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) #remove all stopwords in content column

df['content_without_stopwords'] = df['content_without_stopwords'].apply(lambda x: x.translate(str.maketrans("", "", string.punctuation))) #remove all punctuation

df['content_without_stopwords'] = df['content_without_stopwords'].apply(lambda x: "".join([i for i in x if not i.isdigit()])) #remove all digits

print('With stop words: ' + df['content'][1])
print('Without stop words: ' + df['content_without_stopwords'][1])

With stop words: i'm maria williams. i'll be your design advisor for your internship.
Without stop words: maria williams design advisor internship


## Tokenizing

In natural language processing, each document or sentence can thought of as a bag of words in the form of a list where each element is a word...

In [4]:
from nltk.tokenize import word_tokenize
df['content_tokenized'] = df.apply(lambda row: word_tokenize(row['content_without_stopwords']), axis=1) #tokenize all the content

Splitting the document up like this is called <u>tokenizing</u>.

In [5]:
df['content_tokenized'][0]

['hello', 'team', 'welcome', 'nephrotex']

In [6]:
#all words found in the content
word_set = set().union(*df['content_tokenized'])
print(word_set)

deliver', 'multidimensional', 'bit', 'learn', 'pototypes', 'beast', 'reliabilty', 'vastly', 'design', 'necessary', 'claimed', 'developments', 'friend', 'dont', 'tab', 'proposed', 'wound', 'trivial', 'redo', 'separate', 'models', 'jamesrude', 'taht', 'hinderence', 'hmm', 'ooops', 'portal', 'satisfy', 'safe', 'surface', 'certainly', 'upper', 'sent', 'analyses', 'slots', 'woohoo', 'dj', 'reilability', 'wellwritten', 'mackenzi', 'summer', 'chemistry', 'succesful', 'about', 'accepts', 'kristins', 'wifi', 'reactability', 'cuz', 'assistance', 'samuel', 'commercially', 'formal', 'enabled', 'whereas', 'joes', 'earned', 'diffused', 'studying', 'accounted', 'turnaround', 'better', 'marc', 'generating', 'realatively', 'custom', 'ready', 'betathromogobulin', 'asoect', 'since', 'suractants', 'recommending', 'robert', 'battling', 'condense', 'treatments', 'assigment', 'cracking', 'plausible', 'reid', 'describing', 'chose', 'profiles', 'phases', 'higher', 'physics', 'writeups', 'perceived', 'brianwe',

In [7]:
#splitting each sublist into all content said by each user
user_dict = [] #creating an empty list to store all the sublist of all the words said by each user
for idx in df['userIDs'].unique(): #loop over every unique id
    lst = [word_tokenize(i) for i in df[df['userIDs'] == idx]['content_without_stopwords'].to_list()] #tokenize the contents of each row
    tokenized_sents = [item for sublist in lst for item in sublist] #re-formatting 
    user_dict.append(tokenized_sents) #append the sublist into the user_dict list

In [8]:
print(user_dict[0]) #printing all content that user 1 said

['hello', 'team', 'welcome', 'nephrotex', 'maria', 'williams', 'design', 'advisor', 'internship', 'help', 'questions', 'please', 'introduce', 'name', 'prefer', 'called', 'workpro', 'records', 'work', 'do', 'review', 'external', 'consultant', 'improve', 'quality', 'internship', 'program', 'ask', 'use', 'first', 'name', 'internship', 'protect', 'privacy', 'want', 'make', 'sure', 'everyone', 'found', 'chat', 'interface', 'please', 'send', 'chat', 'check', 'in', 'group', 'make', 'chat', 'window', 'bigger', 'clicking', 'icon', 'top', 'right', 'corner', 'already', 'please', 'check', 'email', 'throughout', 'time', 'nephrotex', 'receiving', 'emails', 'boss', 'alex', 'hell', 'send', 'instructions', 'work', 'throughout', 'course', 'internship', 'now', 'please', 'check', 'deliverable', 'list', 'bottom', 'right', 'corner', 'workpro', 'deliverables', 'assigned', 'show', 'list', 'able', 'check', 'see', 'original', 'email', 'instructions', 'alex', 'sent', 'you', 'whether', 'submitted', 'deliverable',

In [9]:
#creating a list where each sublist is a dictionary that counts the frequency of all the words that are used by a user
word_dict = [] #create an empty list to store all the words used
for i in range(len(user_dict)):
    #creating dictionaries to keep count of the words
    temp_word_dict = dict.fromkeys(word_set, 0)
    word_dict.append(temp_word_dict) #append each dictionary to the word dictionary 
    
    #count the words in the bag of words for each user
    for word in user_dict[i]:
        word_dict[i][word] += 1

In [10]:
print(word_dict[0]) #printing the dictionary of user 1

nefficient': 0, 'unanimous': 0, 'marketabuility': 0, 'boss': 6, 'selected': 0, 'discuss': 0, 'somereason': 0, 'desire': 0, 'haley': 0, 'redid': 0, 'difference': 0, 'evening': 0, 'reset': 0, 'far': 0, 'impact': 0, 'ahve': 0, 'tailored': 0, 'alter': 0, 'compensated': 0, 'somebody': 0, 'recommandations': 0, 'differnt': 0, 'exceeded': 0, 'polygon': 0, 'weighs': 0, 'straightforward': 0, 'decline': 0, 'listen': 0, 'various': 0, 'earlier': 6, 'chatting': 0, 'project': 6, 'wynters': 0, 'elses': 0, 'reach': 0, 'define': 0, 'feared': 0, 'additionally': 0, 'yas': 0, 'llooks': 0, 'mattered': 0, 'mckenna': 0, 'assumptions': 0, 'midpoint': 0, 'asks': 0, 'god': 0, 'outcome': 0, 'looks': 0, 'sum': 0, 'drawbacks': 0, 'efficiently': 0, 'amen': 0, 'roomate': 0, 'stable': 0, 'unequal': 0, 'block': 0, 'spotty': 0, 'feasible': 0, 'depisition': 0, 'reults': 0, 'informed': 0, 'xing': 0, 'deem': 0, 'inver': 0, 'worth': 0, 'lacked': 0, 'tht': 0, 'surfacant': 0, 'cellulose': 0, 'havent': 0, 'tremendously': 0, 'r

## TF-IDF

Rather than just counting, we can use <u>TF-IDF</u>, short for term frequency-inverse document frequency to rank a word on it's importance.

The <u>TF-IDF</u> score of a word $w$ is:
$$tf(w) * idf(w)$$
Where $tf(w) =$ frequency of word in a document / total number of words in the document

And where $idf(w) = log$(number of documents / number of documents that contain word $w$)

In [11]:
def computeTF(word_dict, user_dict):
    tf_dict = {}
    user_dict_count = len(user_dict)
    for word, count in word_dict.items():
        tf_dict[word] = count / float(user_dict_count)
    return tf_dict

def computeIDF(doc_list):
    import math
    idf_dict = {}
    n = len(doc_list)

    idf_dict = dict.fromkeys(doc_list[0].keys(),0)
    for doc in doc_list:
        for word, val in doc.items():
            if val > 0:
                idf_dict[word] += 1

    for word, val in idf_dict.items():
        idf_dict[word] = math.log(n / float(val), 10)

    return idf_dict

def computeTFIDF(tf_user_dict, idfs):
    tfidf = {}
    for word, val in tf_user_dict.items():
        tfidf[word] = val * idfs[word]
    return tfidf

### Example of TF-IDF

Suppose we have two documents as listed below. The calculation of <u>TF-IDF</u> for the term "hello" is performed as: 

The <u>TF</u>, is the frequency that the word "hello" appears in each document. In each document, the word appears once; but as document 1 (index 0) has more words, its relative frequency is smaller.

$$ tf('hello', doc1) = \frac{1}{4} = 0.25 $$
$$ tf('hello', doc2) = \frac{1}{2} = 0.5 $$

An <u>IDF</u> accounts for the ratio of documents that include the word "hello". In this case, we have a total of two documents and all of them include the word "hello".

$$ idf('hello', documents) = log(\frac{2}{2}) = 0 $$

So <u>TF-IDF</u> is 0 for the word "hello" implying that the word is not very informative as it appears in all documents.

$$ tfidf('hello', doc1, documents) = 0.25 * 0 = 0 $$
$$ tfidf('hello', doc2, documents) = 0.5 * 0 = 0 $$

Take the word "team", it occurs once only in document 1:

$$ tf('team', doc1) = \frac{1}{4} = 0.25 $$
$$ tf('team', doc2) = \frac{0}{2} = 0 $$
$$ idf('team', documents) = log(\frac{2}{1}) \approx 0.301 $$

Therefore,

$$ tfidf('team', doc1, documents) = tf('team', doc1) * idf('team', documents) = 0.25 * 0.301 \approx 0.075 $$
$$ tfidf('team', doc2, documents) = tf('team', doc2) * idf('team', documents) = 0 * 0.301 = 0 $$

In [12]:
doc1 = word_tokenize(df['content_without_stopwords'][0])
doc2 = word_tokenize(df['content_without_stopwords'][5])

word_set_example = set(doc1).union(set(doc2))
word_dict1 = dict.fromkeys(word_set_example, 0)
word_dict2 = dict.fromkeys(word_set_example, 0)

for word in doc1:
    word_dict1[word] += 1

for word in doc2:
    word_dict2[word] += 1

pd.DataFrame([word_dict1, word_dict2])

Unnamed: 0,hello,brandon,welcome,team,nephrotex
0,1,0,1,1,1
1,1,1,0,0,0


In [13]:
tf1_example = computeTF(word_dict1, doc1)
tf2_example = computeTF(word_dict2, doc2)

idf_example = computeIDF([word_dict1, word_dict2])

tfidf1_example = computeTFIDF(tf1_example, idf_example)
tfidf2_example = computeTFIDF(tf2_example, idf_example)

print("hello")
print("tf for document 1: " + str(tf1_example['hello']))
print("tf for document 2: " + str(tf2_example['hello']))
print("idf for documents: " + str(idf_example['hello']))
print("tfidf for document 1: " + str(tfidf1_example['hello']))
print("tfidf for document 2: " + str(tfidf2_example['hello']))
print("")
print("team")
print("tf for document 1: " + str(tf1_example['team']))
print("tf for document 2: " + str(tf2_example['team']))
print("idf for documents: " + str(idf_example['team']))
print("tfidf for document 1: " + str(tfidf1_example['team']))
print("tfidf for document 2: " + str(tfidf2_example['team']))

hello
tf for document 1: 0.25
tf for document 2: 0.5
idf for documents: 0.0
tfidf for document 1: 0.0
tfidf for document 2: 0.0

team
tf for document 1: 0.25
tf for document 2: 0.0
idf for documents: 0.30102999566398114
tfidf for document 1: 0.07525749891599529
tfidf for document 2: 0.0


In [14]:
idfs = computeIDF(word_dict) #compute idf
tf = [] #create empty list to append tf values
tfidf = [] #create empty list to append tf-idf values
for i in range(len(user_dict)): 
    temp_tf_user_dict = computeTF(word_dict[i], user_dict[i]) #compute tf
    temp_tfidf_user_dict = computeTFIDF(temp_tf_user_dict, idfs) #compute tf-idf
    tf.append(temp_tf_user_dict) #append tf values into list
    tfidf.append(temp_tfidf_user_dict) #append tf-idf values into list

In [15]:
df_tf = pd.DataFrame.from_records(tf)
df_tf.index = df_tf.index + 1 #shift index over by 1 to match the user id 
df_tf

Unnamed: 0,exceed,erics,hm,betathrom,exactky,dyjet,cohesiveness,amount,drops,recap,...,summarized,connecting,known,cold,posted,him,cancle,everthing,spoonfed,campus
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.009704,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.002933,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
5,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
390,0.004854,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
391,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
392,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [16]:
df_tfidf = pd.DataFrame.from_records(tfidf) #make the matrix into a dataframe
df_tfidf.index = df_tfidf.index + 1
df_tfidf

Unnamed: 0,exceed,erics,hm,betathrom,exactky,dyjet,cohesiveness,amount,drops,recap,...,summarized,connecting,known,cold,posted,him,cancle,everthing,spoonfed,campus
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.010314,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.003117,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
5,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
390,0.009672,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
391,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
392,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


## Analysis

In [17]:
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

In [18]:
outcome_score = df.drop_duplicates(subset=['userIDs'])['OutcomeScore'].to_numpy() #grab outcome score of each individual user

In [19]:
user_id = np.argmax(outcome_score)
print("User " + str(user_id) + " outcome score: " + str(outcome_score[user_id]) + '\n')

print(df_tf.loc[user_id].sort_values(ascending=False)[:10])

print()

print(df_tfidf.loc[user_id].sort_values(ascending=False)[:10])

User 376 outcome score: 8

yes           0.023747
negative      0.021108
prototypes    0.021108
one           0.018470
would         0.018470
agree         0.015831
charge        0.015831
flux          0.015831
steric        0.015831
best          0.013193
Name: 376, dtype: float64

zachary        0.015003
eliminating    0.012102
elizabeth      0.011173
raw            0.010002
yea            0.009471
taylor         0.008844
despite        0.008414
negative       0.006858
dialyzing      0.006845
categorizes    0.006845
Name: 376, dtype: float64


In [20]:
user_id = np.argmin(outcome_score)
print("User " + str(user_id) + " outcome score: " + str(outcome_score[user_id]) + '\n')

print(df_tf.loc[user_id].sort_values(ascending=False)[:10])

print()

print(df_tfidf.loc[user_id].sort_values(ascending=False)[:10])

User 14 outcome score: 0

lasted         0.046332
surfactant     0.046332
hydrophilic    0.034749
steric         0.030888
hindrance      0.023166
hours          0.023166
surfactants    0.019305
biological     0.015444
two            0.015444
think          0.015444
Name: 14, dtype: float64

lasted          0.120204
productivity    0.030051
dropping        0.026564
hours           0.024332
full            0.021955
hour            0.017148
hindrance       0.014609
times           0.013471
hydrophilic     0.012296
long            0.010577
Name: 14, dtype: float64


In [49]:
from sklearn.model_selection import train_test_split # for splitting the data into training and testing sets
from sklearn.linear_model import LogisticRegression # import the LogisticRegression model

X = df_tfidf
Y = outcome_score

# split the data into 80% training and 20% testing, random_state=0 ensures that the results are repeatable
X_train,X_test,y_train,y_test=train_test_split(X,Y,train_size=0.8,random_state=0)  

# instantiate the model (using the default parameters)
# penalty='none' implies no regularization and solver='lbfgs' is the default solver
# different solvers can be used, dependent on the type of penalties that are implemented
logreg = LogisticRegression(solver='lbfgs',penalty='none')
logreg.fit(X_train,y_train) # fit the training data to the model

y_preda = logreg.predict_proba(X_test) # calculate the probabilities for the test features
# print out the probability table with a header
print('Probability table for testing set is:')
print(y_preda)

Probability table for testing set is:
[[1.08955394e-278 5.37459014e-288 3.11746239e-177 1.09043578e-019
  1.10777551e-148 1.00000000e+000 4.00272349e-213 5.06558693e-193
  2.38244696e-105]
 [0.00000000e+000 0.00000000e+000 1.00000000e+000 7.81517685e-108
  0.00000000e+000 0.00000000e+000 6.39729629e-284 0.00000000e+000
  0.00000000e+000]
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
  1.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
  0.00000000e+000]
 [6.04696891e-134 0.00000000e+000 5.06923209e-127 1.00000000e+000
  1.48100592e-091 3.92674989e-226 5.85540688e-199 4.76293950e-230
  4.13893671e-109]
 [4.68672994e-101 0.00000000e+000 9.07076396e-299 1.57360207e-159
  1.00000000e+000 9.97792163e-149 7.46886532e-142 1.62087621e-237
  2.05209085e-119]
 [6.09382361e-126 9.76674834e-287 1.00522094e-124 1.10189557e-090
  7.47061997e-050 1.00000000e+000 8.76478543e-108 1.87927993e-159
  1.09323979e-052]
 [5.39703301e-276 1.38820251e-286 2.60795506e-156 1.0000

In [50]:
y_pred=logreg.predict(X_test) # calculate the predicted values of the model for the test features

from sklearn.metrics import accuracy_score, precision_score, recall_score # import the score functions 
print("Accuracy:",accuracy_score(y_test, y_pred)) # calculate and print the accuracy score

Accuracy: 0.26582278481012656


In [22]:
documents = [] 
for i in range(len(user_dict)): 
    doc = " ".join(user_dict[i]) 
    documents.append(doc) 

df_documents = pd.DataFrame()
df_documents["document"] = documents
df_documents["outcome_score"] = outcome_score
df_documents

Unnamed: 0,document,outcome_score
0,hello team welcome nephrotex maria williams de...,4
1,hello brandon imperative know answers question...,4
2,zelin hi mean two relative table pattern biolo...,4
3,jack flux time equals matter teh helpful showe...,4
4,hey rachel agree cost factor would rather bett...,2
...,...,...
388,hi royce yeah noticed thing cant right even fa...,7
389,carly okay kind lost find marketability cost t...,4
390,ellie used flux found steric hindering perform...,5
391,hi cyrus anyone else receive email incorrect v...,5
