In [27]:
# Import the pandas library to help read the data

import pandas as pd

# Read in the anonymized data from data/anon.csv
# This data can be assumed to already be cleaned

df = pd.read_csv("data/anon.csv")

In [28]:
# Split up the data into training and testing sets

X = df['content']
display(X)
Y = df['username']
display(Y)

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

0       or did my acid patch thing kill him off my tur...
1                pretty sure it was will but double check
2                                            I think will
3                              Whoever went before Oliver
4       I think it was either me or fable, can't reall...
                              ...                        
6436    Okay so basically I just found out that this A...
6437                                              i think
6438                         Is the 16 before racial mods
6439                                    Aight sounds good
6440          20 point buy can't put a base stat above 16
Name: content, Length: 6441, dtype: object

0       author1
1       author1
2       author2
3       author2
4       author3
         ...   
6436    author5
6437    author6
6438    author4
6439    author4
6440    author5
Name: username, Length: 6441, dtype: object

In [29]:
#Extract alternative features from the data

#Vector of num words
numWordsTrain = []
numWordsTest = []
#Vector of num chars
numCharTrain = []
numCharTest = []
#Vector of avg char/word
avgCharTrain = []
avgCharTest = []

for msg in X_train:
    print(msg)
    #NumWords
    words = msg.count(" ") + 1
    numWordsTrain.append(words)
    #NumChars
    char = len(msg)
    numCharTrain.append(char)
    #AvgChar
    avgCharTrain.append(char / words)
# Display the length of the vectors
print(len(numWordsTrain))
print(len(numCharTrain))
print(len(avgCharTrain))

for msg in X_test:
    #NumWords
    words = msg.count(" ") + 1
    numWordsTest.append(words)
    #NumChars
    char = len(msg)
    numCharTest.append(char)
    #AvgChar
    avgCharTest.append(char / words)

shops will be included
I'll make one tonight. I have yet to sit down and make my anathema and such yet, so it'll all work out and be done tonight. I'll dm a link to ya
You could just walk into room and they have to make perception to know it is you
How long was that bike ride
Goblin campaign, enough said
We should go through places on the way
I was thinking the same thing
It wasnt that he was actually living but that he was treated as being a living creature or something
Who‚Äôs ready for tornado bat
Yeah but idk if asmodeus was in on that bit
For a sec I thought that said "to host my insurance file"
lol lol
My longest has been a slayer in the aformention 1e campaign (Kingmaker)
Level 3 presumably
Yea but it's a steam key
Nvm I could 1 to 4
I'm guessing yes
noah when someone mispells his last name
maybe chance
Kobolds are like hamsters
New map? What?
I think he was referring to the orb having a %
Ah kobold bards must throw down then
I posted a summary of the overall story up to this po

In [30]:
# Add the new features to a matrix
import numpy as np

X_train_features = np.column_stack((numWordsTrain, numCharTrain, avgCharTrain))
display(X_train_features.shape)
X_test_features = np.column_stack((numWordsTest, numCharTest, avgCharTest))
display(X_test_features.shape)

(5152, 3)

(1289, 3)

In [31]:
# Fit a KNN model to the data
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

for i in range(1, 51):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train_features, Y_train)

    # Predict the usernames of the test set
    Y_pred = knn.predict(X_test_features)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(Y_test, Y_pred)
    print(accuracy)

0.15981380915438323
0.13886733902249807
0.13188518231186966
0.16602017067494182
0.18386346004654772
0.19937936384794414
0.1947245927075252
0.20015515903801395
0.20791311093871218
0.22032583397982933
0.22498060512024826
0.2342901474010861
0.24127230411171452
0.2482544608223429
0.24592707525213345
0.2482544608223429
0.26532195500387895
0.2629945694336695
0.26144297905352987
0.2637703646237393
0.26764934057408846
0.2707525213343677
0.27152831652443754
0.2692009309542281
0.2660977501939488
0.25989138867339023
0.2637703646237393
0.252133436772692
0.25678820791311097
0.24592707525213345
0.25368502715283164
0.2575640031031808
0.25446082234290146
0.2552366175329713
0.2660977501939488
0.25989138867339023
0.26532195500387895
0.2629945694336695
0.2591155934833204
0.26144297905352987
0.26532195500387895
0.26687354538401864
0.26687354538401864
0.2684251357641583
0.26687354538401864
0.26764934057408846
0.26687354538401864
0.2660977501939488
0.2660977501939488
0.2699767261442979


In [32]:
# See how the model predicts the usernames of a small set of test messages
# This first message is from author2, the model should get this right since this comes from the training set
print("Train")
message = "shops will be included"
words = message.count(" ") + 1
char = len(message)
avgChar = char / words
X_test_message = np.array([[words, char, avgChar]])
print(knn.predict(X_test_message))
# This message is from author13, the model should get this right since this comes from the training set
message = "I'll make one tonight. I have yet to sit down and make my anathema and such yet, so it'll all work out and be done tonight. I'll dm a link to ya"
words = message.count(" ") + 1
char = len(message)
avgChar = char / words
X_test_message = np.array([[words, char, avgChar]])
print(knn.predict(X_test_message))
# These next 10 messages are not in the training set, so the model will have to predict the author
# The authors of these messages are respectively authors 4, 2, 14, 4, 13, 2, 10, 4, 10, 13
messages = ["Also if you haven't played with me before you guys need to do some minor set up on your end",
 "Can‚Äôt have double ability to a stat",
 "Because that feels fair",
 "AC penalty instead of attack",
 "Im tempted to play an anadi but it'd be wierd af",
 "So in theory the players are genocidal maniacs",
 "randall taking a nap",
 "Like he has one Ya'll just don't know it",
 "character named Four T'nait",
 "Fantasy continent that tristan made. Sorta generic fantasy, towns and counties that specialize in certain themes."
]
print("Test")

for msg in messages:
    words = msg.count(" ") + 1
    char = len(msg)
    avgChar = char / words
    X_test_message = np.array([[words, char, avgChar]])
    print(knn.predict(X_test_message))
 

Train
['author4']
['author13']
Test
['author2']
['author4']
['author2']
['author2']
['author4']
['author4']
['author4']
['author4']
['author4']
['author4']


In [33]:
# See if the training and testing sets are balanced

display(Y_test.value_counts())

# Create a new dataset by removing all of the authors with less than 20 posts

dfTrimmed = df.groupby('username').filter(lambda x: len(x) >= 20)
display(dfTrimmed)

username
author4     383
author2     252
author6     135
author5     103
author7      92
author14     74
author13     73
author16     54
author10     43
author1      30
author15     11
author11     11
author8       5
author25      3
author12      3
author17      3
author18      2
author9       2
author30      2
author24      2
author29      1
author23      1
author27      1
author21      1
author3       1
author19      1
Name: count, dtype: int64

Unnamed: 0,username,content
0,author1,or did my acid patch thing kill him off my tur...
1,author1,pretty sure it was will but double check
2,author2,I think will
3,author2,Whoever went before Oliver
5,author2,Cause they attacked him after he already died
...,...,...
6436,author5,Okay so basically I just found out that this A...
6437,author6,i think
6438,author4,Is the 16 before racial mods
6439,author4,Aight sounds good


In [34]:
# Split up the data into training and testing sets

X = dfTrimmed['content']
display(X)
Y = dfTrimmed['username']
display(Y)

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

0       or did my acid patch thing kill him off my tur...
1                pretty sure it was will but double check
2                                            I think will
3                              Whoever went before Oliver
5           Cause they attacked him after he already died
                              ...                        
6436    Okay so basically I just found out that this A...
6437                                              i think
6438                         Is the 16 before racial mods
6439                                    Aight sounds good
6440          20 point buy can't put a base stat above 16
Name: content, Length: 6291, dtype: object

0       author1
1       author1
2       author2
3       author2
5       author2
         ...   
6436    author5
6437    author6
6438    author4
6439    author4
6440    author5
Name: username, Length: 6291, dtype: object

In [35]:
# Check the value counts to make sure the data is balanced

display(Y_test.value_counts())

# Drop from x and y test until every author has the same number of posts as the author with the least posts

minPosts = Y_test.value_counts().min()
display(minPosts)

Y_test = Y_test.groupby(Y_test).head(minPosts)
display(Y_test.value_counts())

X_test = X_test[Y_test.index]
display(len(Y_test))
display(len(X_test))

username
author4     373
author2     273
author6     154
author5      98
author7      82
author13     58
author16     56
author14     56
author10     53
author1      25
author11     15
author15     10
author12      6
Name: count, dtype: int64

6

username
author1     6
author16    6
author6     6
author5     6
author4     6
author2     6
author13    6
author7     6
author10    6
author11    6
author14    6
author12    6
author15    6
Name: count, dtype: int64

78

78

In [36]:
#Extract alternative features from the data

#Vector of num words
numWordsTrain = []
numWordsTest = []
#Vector of num chars
numCharTrain = []
numCharTest = []
#Vector of avg char/word
avgCharTrain = []
avgCharTest = []

for msg in X_train:
    print(msg)
    #NumWords
    words = msg.count(" ") + 1
    numWordsTrain.append(words)
    #NumChars
    char = len(msg)
    numCharTrain.append(char)
    #AvgChar
    avgCharTrain.append(char / words)
# Display the length of the vectors
print(len(numWordsTrain))
print(len(numCharTrain))
print(len(avgCharTrain))

for msg in X_test:
    #NumWords
    words = msg.count(" ") + 1
    numWordsTest.append(words)
    #NumChars
    char = len(msg)
    numCharTest.append(char)
    #AvgChar
    avgCharTest.append(char / words)

pizza hut stuffed crust ü§§
Oh fair I guess that could be it
i want more seasons
~~Figure out how to get that monk ability to kill yourself~~
Banish them
can do! ill rush up and finish the character
I actually have a coworker named tristan
but basically
oozemorph shifter?
I vote <@492768651389435909>  out
Apparently he played through the Choral stuff as well
We're allowed to take two archetypes at level 2, right?
Presumably to replace the one they are killing off
God I wish this was legal
Normal Van Karthy things
yeah so is resurrection
<@&828644080443326494> I guess tomorrow early time maybe possibly
sounds good
All 4 of them are cracking me up
Will get on that lmfao
i am good whenever
For the one with exemplar
There may be some pve type stuff but mainly pvp
Which I dont think any useful info would be gleaned from the human trait
Probably good for 5
I am not opposed to them doing stupid shit
an update about the remaster
More like dire tiger
oooooh ooooOO
I apologize
That's why I'm wo

In [37]:
# Add the new features to a matrix

X_train_features = np.column_stack((numWordsTrain, numCharTrain, avgCharTrain))
display(X_train_features.shape)
X_test_features = np.column_stack((numWordsTest, numCharTest, avgCharTest))
display(X_test_features.shape)

(5032, 3)

(78, 3)

In [38]:
# Fit a KNN model to the data
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

for i in range(1, 51):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train_features, Y_train)

    # Predict the usernames of the test set
    Y_pred = knn.predict(X_test_features)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(Y_test, Y_pred)
    print(accuracy)

0.16666666666666666
0.21794871794871795
0.21794871794871795
0.1794871794871795
0.14102564102564102
0.14102564102564102
0.16666666666666666
0.23076923076923078
0.19230769230769232
0.1794871794871795
0.15384615384615385
0.16666666666666666
0.14102564102564102
0.14102564102564102
0.14102564102564102
0.14102564102564102
0.1282051282051282
0.15384615384615385
0.14102564102564102
0.14102564102564102
0.1282051282051282
0.1282051282051282
0.1282051282051282
0.1282051282051282
0.15384615384615385
0.1282051282051282
0.14102564102564102
0.15384615384615385
0.14102564102564102
0.14102564102564102
0.14102564102564102
0.14102564102564102
0.1282051282051282
0.1282051282051282
0.15384615384615385
0.15384615384615385
0.1282051282051282
0.10256410256410256
0.10256410256410256
0.11538461538461539
0.10256410256410256
0.10256410256410256
0.10256410256410256
0.11538461538461539
0.1282051282051282
0.1282051282051282
0.11538461538461539
0.11538461538461539
0.1282051282051282
0.11538461538461539


In [39]:

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_features, Y_train)

# See how the model predicts the usernames of a small set of test messages
# This first message is from author13, the model should get this right since this comes from the training set
# Interestingly, the model gets this wrong
print("Train")
message = "Oh fair I guess that could be it"
words = message.count(" ") + 1
char = len(message)
avgChar = char / words
X_test_message = np.array([[words, char, avgChar]])
print(knn.predict(X_test_message))
# This message is from author14, the model should get this right since this comes from the training set
message = "We're allowed to take two archetypes at level 2, right?"
words = message.count(" ") + 1
char = len(message)
avgChar = char / words
X_test_message = np.array([[words, char, avgChar]])
print(knn.predict(X_test_message))
# These next 10 messages are not in the training set, so the model will have to predict the author
# The authors of these messages are respectively authors 4, 2, 14, 4, 13, 2, 10, 4, 10, 13
messages = ["new logo looks pretty pog",
 "And Thetis king was Ferguis?", 
 "Doing certain mythic paths where you ascend as a certain type of creature (angel/azata/demon/lich) vs more standard powers", 
 "I enjoyed when Gorgar Gorged all over them.",
 "Oh no the Pharaoh is using sacred geometry", 
 "fyi im busy from like 12:30 till like 2:45 the next 4 saturdays",
 "It was like the funnest alternative first person shooter I've ever played", 
 "It isn't actually severed. Just the general term for the effect. If it auto kills I'd prob have to make it be more vp than the others",
 "Hey how do yall feel about trying a house rule of confirming fumbles? Instead of an auto AOO?",
 "He thought there was no gnome because he thought I was busy when I was not busy"
]
print("Test")

for msg in messages:
    words = msg.count(" ") + 1
    char = len(msg)
    avgChar = char / words
    X_test_message = np.array([[words, char, avgChar]])
    print(knn.predict(X_test_message))

Train
['author4']
['author14']
Test
['author10']
['author4']
['author4']
['author4']
['author4']
['author5']
['author4']
['author13']
['author4']
['author16']
