In [37]:
# Import libraries

import pandas as pd

# Load data/Alex-Noah-Discord-Conversation.csv

# In this dataset, the message author is given in column 4 (the fifth column),
# and the message content is given in column 15 (the sixteenth column).
# The dataset contains all the direct messages between Alex (diruslupito in the data) and Noah (gamemaster618) on Discord before February 29, 2024.
data = pd.read_csv('data/Alex-Noah-Discord-Conversation.csv')

In [38]:
# Exploratory data analysis

# Preview the first five rows (head()'s default; this notebook treats the first
# row as the most recent message), keeping only the author column (position 4,
# shown on the left) and the message-content column (position 15, shown on the right)
previewRows = data.head()
display(previewRows.iloc[:, [4, 15]])

Unnamed: 0,author.username,content
0,diruslupito,"ok he said yes, ill make a gc"
1,diruslupito,I'll ask my roommate if you can join our 422 g...
2,gamemaster618,Yeah this class is gonna suck
3,diruslupito,Might as well do practice problems
4,gamemaster618,Up front near the middle


In [39]:
# Tally how many messages (rows) each author sent
authorColumn = data.iloc[:, 4]
messagesPerAuthor = authorColumn.value_counts()
display(messagesPerAuthor)

author.username
diruslupito      326
gamemaster618    199
Name: count, dtype: int64

In [40]:
# Take the message stored in the first row (the most recent one), show it,
# and report how many whitespace-separated words it contains
most_recent_message = data.iat[0, 15]
display(most_recent_message)
wordsInMostRecentMessage = most_recent_message.split()
print('Words in that message: ', len(wordsInMostRecentMessage))

'ok he said yes, ill make a gc'

Words in that message:  8


In [41]:
# Tally, per author, how many text messages they sent and how many
# whitespace-separated words those messages contain
sumAlex, sumNoah = 0, 0
numTextMessagesByAlex, numTextMessagesByNoah = 0, 0
for author, content in zip(data.iloc[:, 4], data.iloc[:, 15]):
    # Rows whose content is NaN are not counted as text messages
    if pd.isnull(content):
        continue
    wordsInMessage = len(content.split())
    if author == 'diruslupito':
        numTextMessagesByAlex += 1
        sumAlex += wordsInMessage
    else:
        # Every author other than 'diruslupito' is attributed to Noah
        numTextMessagesByNoah += 1
        sumNoah += wordsInMessage

# Combined totals used by the summary lines below
numTextMessages = numTextMessagesByAlex + numTextMessagesByNoah
totalWords = sumAlex + sumNoah

print('Number of text messages by Alex:', numTextMessagesByAlex)
print('Number of text messages by Noah:', numTextMessagesByNoah)
print('Number of text messages:', numTextMessages)
# len(data) also includes the rows skipped above for having no content
print('Total number of messages:', len(data))
print('Number of words written by Alex:', sumAlex)
print('Number of words written by Noah:', sumNoah)
print('Total number of words written:', totalWords)
print('Average per message by Alex:', sumAlex / numTextMessagesByAlex)
print('Average per message by Noah:', sumNoah / numTextMessagesByNoah)
print('Average per message:', totalWords / numTextMessages)

Number of text messages by Alex: 303
Number of text messages by Noah: 197
Number of text messages: 500
Total number of messages: 525
Number of words written by Alex: 2420
Number of words written by Noah: 1750
Total number of words written: 4170
Average per message by Alex: 7.986798679867987
Average per message by Noah: 8.883248730964468
Average per message: 8.34


In [42]:
# Build a map from each word of the most recent message to the number of
# times that word occurs in it
wordFrequency = {}
for token in most_recent_message.split():
    # Missing words start from 0, so first sight yields a count of 1
    wordFrequency[token] = wordFrequency.get(token, 0) + 1
print(wordFrequency)

{'ok': 1, 'he': 1, 'said': 1, 'yes,': 1, 'ill': 1, 'make': 1, 'a': 1, 'gc': 1}


In [43]:
# Look at all messages, and add all of their words to a map which maps words to their frequency,
# separated by author. A third map counts the words of both authors combined.
# Words are the whitespace-separated tokens of each message, so capitalisation and
# attached punctuation are kept (e.g. 'I' and 'i' are counted as different words).

wordFrequencyAlex = {}
wordFrequencyNoah = {}
wordFrequencyBoth = {}

for author, content in zip(data.iloc[:, 4], data.iloc[:, 15]):
    # Skip NaN values
    if pd.isnull(content):
        continue
    # Choose the author's map once per message rather than once per word;
    # every author other than 'diruslupito' is attributed to Noah.
    authorFrequency = wordFrequencyAlex if author == 'diruslupito' else wordFrequencyNoah
    for word in content.split():
        authorFrequency[word] = authorFrequency.get(word, 0) + 1
        wordFrequencyBoth[word] = wordFrequencyBoth.get(word, 0) + 1

print('Number of distinct words used by Alex:', len(wordFrequencyAlex))
print('Number of distinct words used by Noah:', len(wordFrequencyNoah))
# The combined map holds every word used by either author
print('Number of distinct words used by both:', len(wordFrequencyBoth))

# Add the words to a list, sorted in descending order of frequency
# (sorted() is stable, so equally frequent words keep their first-seen order)
wordListAlex = sorted(wordFrequencyAlex, key=wordFrequencyAlex.get, reverse=True)
wordListNoah = sorted(wordFrequencyNoah, key=wordFrequencyNoah.get, reverse=True)
wordListBoth = sorted(wordFrequencyBoth, key=wordFrequencyBoth.get, reverse=True)


# Print the most used words by each person, and the most used words by both
numWordsToPrint = 15


def _formatMostUsedWords(prefix, wordList, wordFrequency, numWords):
    """Build one report line listing the leading words of a frequency ranking.

    Parameters:
        prefix: text placed in front of the word list.
        wordList: words ordered from most to least frequent.
        wordFrequency: map from each word in wordList to its count.
        numWords: maximum number of words to include.

    Returns:
        prefix followed by 'word(N uses)' entries joined with ', ' and a
        closing '.'. If wordList holds fewer than numWords words, only the
        available ones are listed instead of raising IndexError.
    """
    entries = [f'{word}({wordFrequency[word]} uses)' for word in wordList[:numWords]]
    return prefix + ', '.join(entries) + '.'


# Alex's most used words
printstr = _formatMostUsedWords(
    f"Alex's most used {numWordsToPrint} words are: ",
    wordListAlex, wordFrequencyAlex, numWordsToPrint)
print(printstr)

# Noah's most used words
printstr = _formatMostUsedWords(
    f"Noah's most used {numWordsToPrint} words are: ",
    wordListNoah, wordFrequencyNoah, numWordsToPrint)
print(printstr)

# Most used words by both
printstr = _formatMostUsedWords(
    f'The most used {numWordsToPrint} words by both are: ',
    wordListBoth, wordFrequencyBoth, numWordsToPrint)
print(printstr)

Number of distinct words used by Alex: 893
Number of distinct words used by Noah: 776
Number of distinct words used by both: 1369
Alex's most used 15 words are: i(86 uses), the(72 uses), to(56 uses), it(55 uses), a(50 uses), in(31 uses), this(30 uses), for(29 uses), and(29 uses), of(26 uses), is(26 uses), like(24 uses), my(22 uses), on(20 uses), not(18 uses).
Noah's most used 15 words are: I(64 uses), the(56 uses), to(45 uses), it(44 uses), is(32 uses), a(31 uses), that(27 uses), was(23 uses), Yeah(19 uses), you(19 uses), in(18 uses), and(18 uses), on(17 uses), of(15 uses), for(15 uses).
The most used 15 words by both are: the(128 uses), to(101 uses), it(99 uses), i(86 uses), I(82 uses), a(81 uses), is(58 uses), in(49 uses), and(47 uses), for(44 uses), this(42 uses), of(41 uses), that(40 uses), on(37 uses), was(36 uses).


In [44]:
# Collect every text message with more or fewer than exactly one word into a
# list (the corpus), and remember which row of `data` each corpus entry came from.

corpus = []
originalMessageIndexMap = {}
for rowIndex, content in enumerate(data.iloc[:, 15]):
    # Leave out rows with no content (NaN) and messages made of a single word
    if pd.isnull(content) or len(content.split()) == 1:
        continue
    # The entry about to be appended lands at position len(corpus)
    originalMessageIndexMap[len(corpus)] = rowIndex
    corpus.append(content)

display(corpus)

['ok he said yes, ill make a gc',
 "I'll ask my roommate if you can join our 422 group",
 'Yeah this class is gonna suck',
 'Might as well do practice problems',
 'Up front near the middle',
 'Better than snoozing through 3 hours of lecture',
 "Didn't realize you were in this class too",
 'That sounds riviting lmao',
 'Meth 520 linear algebruh',
 'Doing meth',
 'Question is that you sitting in the corner in 422 rn?',
 'Maybe it just means taking both is an excess of units but who knows',
 'though, idk what that "Course provides excess units, Approved Exception" line means since MA 405 is the same number of credits as 305',
 "well wouldn't you look at that, it worked",
 'So true',
 'cant wait for this to somehow result in a colossal fuckup in their system that crashes everything and i get 1000 F grades',
 'Was hidden in a subfolder',
 'i think so anyways',
 'This what you need?',
 'post it',
 'Found it',
 'nothing to do with substitutions as far as i can find',
 'it just links to the ad

In [45]:
# Use TF-IDF to turn the corpus into a sparse matrix of per-message term weights

from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer with default settings; kept in `vectorizer` so that new text can
# later be transformed with the same fitted vocabulary
vectorizer = TfidfVectorizer()
# Fit on the corpus and transform it in one step
tfidf_matrix = vectorizer.fit_transform(corpus)
# Bare expression: shows the matrix summary (shape, stored elements) as the cell output
tfidf_matrix

<449x1059 sparse matrix of type '<class 'numpy.float64'>'
	with 3611 stored elements in Compressed Sparse Row format>

In [46]:
# Use a K-NN model to find the corpus messages most similar to a query text,
# using 7 neighbors

from sklearn.neighbors import NearestNeighbors

# Single source of truth for the neighbour count: the model, the header line and
# both printing loops below all follow this value.
numNeighbors = 7
sevenNbrs = NearestNeighbors(n_neighbors=numNeighbors).fit(tfidf_matrix)

# Find the most similar messages to the given message

# NOTE: the query is the literal text 'math', not the most recent message.
# The variable keeps its original name in case other cells reference it.
queryText = 'math'
mostRecentMessageTFIDF = vectorizer.transform([queryText])
# kneighbors returns one row of distances and one row of corpus indices per
# query; there is a single query here, hence the [0] below.
distances, indices = sevenNbrs.kneighbors(mostRecentMessageTFIDF)
print(f'Most similar messages using {numNeighbors} neighbors:')
# First pass: the text of each neighbouring message
for corpusIndex in indices[0]:
    print(corpus[corpusIndex])
# Second pass: author and content of the same messages, looked up in the
# original dataframe via the corpus-position -> data-row map
for corpusIndex in indices[0]:
    print(data.iloc[originalMessageIndexMap[corpusIndex], [4, 15]])
print(f'Distances: {distances[0]}')

Most similar messages using 7 neighbors:
its really similar to my math hws
312/320 here
I feel like I should be fine but claculating the HW grade is such a pain I dont want to bother yknow
What thier name lol dont wanna just walk up like a creep
Maybe it just means taking both is an excess of units but who knows
i need the one for this
like when i made a dfa that accepted a single string like "blah" when they wanted it to only accept "b", "l", "a" or "h" (something like that, i forgot exactly
I have no idea peobably todays lecture though 
Im annoyed I overslept
author.username                          diruslupito
content            its really similar to my math hws
Name: 232, dtype: object
author.username                                        gamemaster618
content            312/320 here\nI feel like I should be fine but...
Name: 56, dtype: object
author.username                                        gamemaster618
content            What thier name lol dont wanna just walk up li...
N