In [1]:
# Import the pandas library to help read the data

import pandas as pd
import os


In [2]:
# Explore the anonymized csv 

anonDf = pd.read_csv("data/anon.csv")

In [3]:
# Count the total number of messages sent by each person
display(anonDf['username'].value_counts())

# Also find the total number of messages sent overall
print("Total number of messages sent:", anonDf.shape[0])

username
author4     1851
author2     1340
author6      698
author5      492
author7      457
author13     328
author14     321
author16     270
author10     262
author1      123
author11      73
author15      49
author12      27
author25      19
author8       17
author17      16
author27      13
author18      12
author29      11
author28      11
author21      11
author9       10
author23       9
author3        6
author30       6
author20       3
author24       2
author22       1
author26       1
author19       1
author31       1
Name: count, dtype: int64

Total number of messages sent: 6441


In [4]:
# Find the total number of words sent by each person
anonDf['wordCount'] = anonDf['content'].apply(lambda x: len(x.split()))
display(anonDf.groupby('username')['wordCount'].sum())

# Also find the total overall word count
print("Total word count: ", anonDf['wordCount'].sum())

username
author1       941
author10     1406
author11      871
author12      226
author13     3586
author14     2850
author15      259
author16     2605
author17      189
author18      151
author19       10
author2     12415
author20       18
author21      128
author22       36
author23       73
author24        5
author25      131
author26        5
author27       84
author28       71
author29       64
author3        42
author30       55
author31        4
author4     18080
author5      5528
author6      5418
author7      3216
author8       138
author9        52
Name: wordCount, dtype: int64

Total word count:  58657


In [5]:
# Find the average number of words per message per person
anonDf['wordCountPerMessage'] = anonDf['wordCount']
display(anonDf.groupby('username')['wordCountPerMessage'].mean())

# Also find the average number of words per message overall
print("Average number of words per message: ", anonDf['wordCountPerMessage'].mean())

username
author1      7.650407
author10     5.366412
author11    11.931507
author12     8.370370
author13    10.932927
author14     8.878505
author15     5.285714
author16     9.648148
author17    11.812500
author18    12.583333
author19    10.000000
author2      9.264925
author20     6.000000
author21    11.636364
author22    36.000000
author23     8.111111
author24     2.500000
author25     6.894737
author26     5.000000
author27     6.461538
author28     6.454545
author29     5.818182
author3      7.000000
author30     9.166667
author31     4.000000
author4      9.767693
author5     11.235772
author6      7.762178
author7      7.037199
author8      8.117647
author9      5.200000
Name: wordCountPerMessage, dtype: float64

Average number of words per message:  9.106815711845986


In [6]:
# Look at all messages, count unique words

wordFrequency = {}

for i in range(len(anonDf)):
    # Skip NaN values
    if pd.isnull(anonDf.iloc[i, 1]):
        continue
    for word in anonDf.iloc[i, 1].split():
        if word in wordFrequency:
            wordFrequency[word] += 1
        else:
            wordFrequency[word] = 1

print('Number of distinct words used:', len(wordFrequency))

# Add the words to a list, sorted in descending order of frequency
wordList = sorted(wordFrequency, key=wordFrequency.get, reverse=True)


# Print the most used words by each person, and the most used words by both
numWordsToPrint = 15

# Most used words by both
printstr = 'The most used ' + str(numWordsToPrint) + ' words are: '
for i in range(numWordsToPrint):
    printstr += wordList[i]
    printstr += '(' + str(wordFrequency[wordList[i]]) + ' uses)'
    printstr += ', '
printstr = printstr[:-2]
printstr += '.'
print(printstr)

Number of distinct words used: 9813
The most used 15 words are: the(1609 uses), I(1544 uses), to(1348 uses), a(1323 uses), it(788 uses), is(771 uses), and(766 uses), you(761 uses), of(664 uses), that(594 uses), be(579 uses), in(576 uses), for(548 uses), have(507 uses), on(397 uses).


In [7]:
# Add each message to a corpus. Each message is separated by a comma.

corpus = []
originalMessageIndexMap = {}
for i in range(len(anonDf)):
    # Skip over NaN values
    if pd.isnull(anonDf.iloc[i, 1]):
        continue
    # Skip over messages with only one word
    if len(anonDf.iloc[i, 1].split()) == 1:
        continue
    corpus.append(anonDf.iloc[i, 1])
    originalMessageIndexMap[len(corpus) - 1] = i
    
display(corpus)

["or did my acid patch thing kill him off my turn? I don't think it did",
 'pretty sure it was will but double check',
 'I think will',
 'Whoever went before Oliver',
 "I think it was either me or fable, can't really remember",
 'Cause they attacked him after he already died',
 'not Oliver',
 '<@&981382306663120919> who landed the killing blow on Borgbreill?',
 'I would like to point out that William has inadvertently created the church of Hanzo',
 'Btw <@&981382306663120919> give me 1 emotion your character would be feeling',
 '0 17 -15 0 -2 14 -8',
 '14 17 0 -3',
 '18 21 18 4 15 13',
 'Write message in alphabetical code\r\nSubtract the values of a looping code word',
 '4 5 1 20 8',
 '20 18 21 5',
 '6 18 5 5 4 15 13',
 '<@&981382306663120919> 30 minutes',
 'Reminder <@&981382306663120919> Sunday at 5',
 'Aight then I can run it instead of Gnome on 4/16',
 'I currently have no preference for weekend',
 'Sunday usually preferable',
 "Then let's just move it to a weekend Wed was gonna be

In [8]:
# Use TF-IDF to find the most important words in the corpus

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
tfidf_matrix

<6441x6576 sparse matrix of type '<class 'numpy.float64'>'
	with 51229 stored elements in Compressed Sparse Row format>