In [1]:
# Import the pandas library to help read the data

import pandas as pd

# Read in the anonymized data from data/anon.csv
# This data can be assumed to already be cleaned

df = pd.read_csv("data/anon_merged_alts.csv")

In [2]:
# Split up the data into training and testing sets

X = df['content']
display(X)
Y = df['username']
display(Y)

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
for msg in X_train:
    print(msg)

0       or did my acid patch thing kill him off my tur...
1                pretty sure it was will but double check
2                                            I think will
3                              Whoever went before Oliver
4       I think it was either me or fable, can't reall...
                              ...                        
6436    Okay so basically I just found out that this A...
6437                                              i think
6438                         Is the 16 before racial mods
6439                                    Aight sounds good
6440          20 point buy can't put a base stat above 16
Name: content, Length: 6441, dtype: object

0       author1
1       author1
2       author2
3       author2
4       author3
         ...   
6436    author2
6437    author6
6438    author4
6439    author4
6440    author2
Name: username, Length: 6441, dtype: object

shops will be included
I'll make one tonight. I have yet to sit down and make my anathema and such yet, so it'll all work out and be done tonight. I'll dm a link to ya
You could just walk into room and they have to make perception to know it is you
How long was that bike ride
Goblin campaign, enough said
We should go through places on the way
I was thinking the same thing
It wasnt that he was actually living but that he was treated as being a living creature or something
Who‚Äôs ready for tornado bat
Yeah but idk if asmodeus was in on that bit
For a sec I thought that said "to host my insurance file"
lol lol
My longest has been a slayer in the aformention 1e campaign (Kingmaker)
Level 3 presumably
Yea but it's a steam key
Nvm I could 1 to 4
I'm guessing yes
noah when someone mispells his last name
maybe chance
Kobolds are like hamsters
New map? What?
I think he was referring to the orb having a %
Ah kobold bards must throw down then
I posted a summary of the overall story up to this po

In [3]:
# Create a pipeline and fit it to the data

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Figure out the TfIdfVectorizer's output dimensions
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
display(X_train_vectorized.shape)

KNN_pipeline = Pipeline([('tfidf', TfidfVectorizer(min_df=3)), ('knn', KNeighborsClassifier(n_neighbors=1))])
KNN_pipeline.fit(X_train, Y_train)
test_accuracy = accuracy_score(Y_test, KNN_pipeline.predict(X_test))
test_accuracy

(5152, 5810)

0.3110938712179985

In [4]:
# Use grid search to find the best hyperparameters

from sklearn.model_selection import GridSearchCV

for i in range(1, 51):
    param_grid = {
        "knn__n_neighbors": [i]
    } 

    KNN_tuned_pipeline = GridSearchCV(KNN_pipeline, param_grid)
    KNN_tuned_pipeline.fit(X_train,Y_train)


    print("Best parameter: {}, CV score = {:.3f}".format(KNN_tuned_pipeline.best_params_,KNN_tuned_pipeline.best_score_))


    print("The testing accuracy with the current parameter is: {:.3f}".format(accuracy_score(KNN_tuned_pipeline.predict(X_test), Y_test)))



Best parameter: {'knn__n_neighbors': 1}, CV score = 0.295
The testing accuracy with the current parameter is: 0.311




Best parameter: {'knn__n_neighbors': 2}, CV score = 0.145
The testing accuracy with the current parameter is: 0.119




Best parameter: {'knn__n_neighbors': 3}, CV score = 0.159
The testing accuracy with the current parameter is: 0.129




Best parameter: {'knn__n_neighbors': 4}, CV score = 0.161
The testing accuracy with the current parameter is: 0.130




Best parameter: {'knn__n_neighbors': 5}, CV score = 0.179
The testing accuracy with the current parameter is: 0.151




Best parameter: {'knn__n_neighbors': 6}, CV score = 0.209
The testing accuracy with the current parameter is: 0.179




Best parameter: {'knn__n_neighbors': 7}, CV score = 0.225
The testing accuracy with the current parameter is: 0.209




Best parameter: {'knn__n_neighbors': 8}, CV score = 0.204
The testing accuracy with the current parameter is: 0.192




Best parameter: {'knn__n_neighbors': 9}, CV score = 0.199
The testing accuracy with the current parameter is: 0.184




Best parameter: {'knn__n_neighbors': 10}, CV score = 0.193
The testing accuracy with the current parameter is: 0.174




Best parameter: {'knn__n_neighbors': 11}, CV score = 0.197
The testing accuracy with the current parameter is: 0.187




Best parameter: {'knn__n_neighbors': 12}, CV score = 0.202
The testing accuracy with the current parameter is: 0.233




Best parameter: {'knn__n_neighbors': 13}, CV score = 0.192
The testing accuracy with the current parameter is: 0.230




Best parameter: {'knn__n_neighbors': 14}, CV score = 0.186
The testing accuracy with the current parameter is: 0.232




Best parameter: {'knn__n_neighbors': 15}, CV score = 0.181
The testing accuracy with the current parameter is: 0.228




Best parameter: {'knn__n_neighbors': 16}, CV score = 0.181
The testing accuracy with the current parameter is: 0.233




Best parameter: {'knn__n_neighbors': 17}, CV score = 0.181
The testing accuracy with the current parameter is: 0.219




Best parameter: {'knn__n_neighbors': 18}, CV score = 0.180
The testing accuracy with the current parameter is: 0.233




Best parameter: {'knn__n_neighbors': 19}, CV score = 0.177
The testing accuracy with the current parameter is: 0.224




Best parameter: {'knn__n_neighbors': 20}, CV score = 0.173
The testing accuracy with the current parameter is: 0.189




Best parameter: {'knn__n_neighbors': 21}, CV score = 0.178
The testing accuracy with the current parameter is: 0.181




Best parameter: {'knn__n_neighbors': 22}, CV score = 0.177
The testing accuracy with the current parameter is: 0.175




Best parameter: {'knn__n_neighbors': 23}, CV score = 0.182
The testing accuracy with the current parameter is: 0.179




Best parameter: {'knn__n_neighbors': 24}, CV score = 0.192
The testing accuracy with the current parameter is: 0.177




Best parameter: {'knn__n_neighbors': 25}, CV score = 0.195
The testing accuracy with the current parameter is: 0.178




Best parameter: {'knn__n_neighbors': 26}, CV score = 0.200
The testing accuracy with the current parameter is: 0.172




Best parameter: {'knn__n_neighbors': 27}, CV score = 0.211
The testing accuracy with the current parameter is: 0.206




Best parameter: {'knn__n_neighbors': 28}, CV score = 0.237
The testing accuracy with the current parameter is: 0.191




Best parameter: {'knn__n_neighbors': 29}, CV score = 0.258
The testing accuracy with the current parameter is: 0.197




Best parameter: {'knn__n_neighbors': 30}, CV score = 0.267
The testing accuracy with the current parameter is: 0.198




Best parameter: {'knn__n_neighbors': 31}, CV score = 0.271
The testing accuracy with the current parameter is: 0.194




Best parameter: {'knn__n_neighbors': 32}, CV score = 0.284
The testing accuracy with the current parameter is: 0.194




Best parameter: {'knn__n_neighbors': 33}, CV score = 0.288
The testing accuracy with the current parameter is: 0.196




Best parameter: {'knn__n_neighbors': 34}, CV score = 0.286
The testing accuracy with the current parameter is: 0.202




Best parameter: {'knn__n_neighbors': 35}, CV score = 0.284
The testing accuracy with the current parameter is: 0.203




Best parameter: {'knn__n_neighbors': 36}, CV score = 0.286
The testing accuracy with the current parameter is: 0.202




Best parameter: {'knn__n_neighbors': 37}, CV score = 0.287
The testing accuracy with the current parameter is: 0.154




Best parameter: {'knn__n_neighbors': 38}, CV score = 0.279
The testing accuracy with the current parameter is: 0.190




Best parameter: {'knn__n_neighbors': 39}, CV score = 0.276
The testing accuracy with the current parameter is: 0.260




Best parameter: {'knn__n_neighbors': 40}, CV score = 0.279
The testing accuracy with the current parameter is: 0.262




Best parameter: {'knn__n_neighbors': 41}, CV score = 0.271
The testing accuracy with the current parameter is: 0.261




Best parameter: {'knn__n_neighbors': 42}, CV score = 0.276
The testing accuracy with the current parameter is: 0.276




Best parameter: {'knn__n_neighbors': 43}, CV score = 0.267
The testing accuracy with the current parameter is: 0.282




Best parameter: {'knn__n_neighbors': 44}, CV score = 0.266
The testing accuracy with the current parameter is: 0.282




Best parameter: {'knn__n_neighbors': 45}, CV score = 0.260
The testing accuracy with the current parameter is: 0.281




Best parameter: {'knn__n_neighbors': 46}, CV score = 0.255
The testing accuracy with the current parameter is: 0.285




Best parameter: {'knn__n_neighbors': 47}, CV score = 0.259
The testing accuracy with the current parameter is: 0.287




Best parameter: {'knn__n_neighbors': 48}, CV score = 0.255
The testing accuracy with the current parameter is: 0.287




Best parameter: {'knn__n_neighbors': 49}, CV score = 0.252
The testing accuracy with the current parameter is: 0.284




Best parameter: {'knn__n_neighbors': 50}, CV score = 0.253
The testing accuracy with the current parameter is: 0.284


In [5]:
# See if the training and testing sets are balanced

display(Y_test.value_counts())

# Create a new dataset by removing all of the authors with less than 20 posts

dfTrimmed = df.groupby('username').filter(lambda x: len(x) >= 20)
display(dfTrimmed)

username
author4     383
author2     355
author6     138
author7      92
author14     74
author13     73
author16     54
author10     43
author1      30
author11     11
author15     11
author8       5
author12      3
author17      3
author18      2
author30      2
author24      2
author9       2
author27      1
author3       1
author21      1
author29      1
author23      1
author19      1
Name: count, dtype: int64

Unnamed: 0,username,content
0,author1,or did my acid patch thing kill him off my tur...
1,author1,pretty sure it was will but double check
2,author2,I think will
3,author2,Whoever went before Oliver
5,author2,Cause they attacked him after he already died
...,...,...
6436,author2,Okay so basically I just found out that this A...
6437,author6,i think
6438,author4,Is the 16 before racial mods
6439,author4,Aight sounds good


In [6]:
# Split up the data into training and testing sets

X = dfTrimmed['content']
display(X)
Y = dfTrimmed['username']
display(Y)

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
for msg in X_test:
    print(msg)


0       or did my acid patch thing kill him off my tur...
1                pretty sure it was will but double check
2                                            I think will
3                              Whoever went before Oliver
5           Cause they attacked him after he already died
                              ...                        
6436    Okay so basically I just found out that this A...
6437                                              i think
6438                         Is the 16 before racial mods
6439                                    Aight sounds good
6440          20 point buy can't put a base stat above 16
Name: content, Length: 6310, dtype: object

0       author1
1       author1
2       author2
3       author2
5       author2
         ...   
6436    author2
6437    author6
6438    author4
6439    author4
6440    author2
Name: username, Length: 6310, dtype: object

I guess I should maybe not make this character in that case
Still trying to but unlikely
you are grandparents
Big agree
Because it gives them resistance greater than the damage you're dealing
Fable what you end up doing with tgat lab
I'll take a look
Oh I just realized Coral Armor is a thing
ask to read every book you come across >:)
Gotcha
Guess we'll just see how it goes
we each make wrestling characters and do a tournament
What was that
~~It's because I have no clothes~~
I just didn't do it yesterday
i just lose on this one ig
Summon spells?
Not Including today ofc
Just to make sure Ive preped correctly you guys are planning on finishing the forge this session?
Sell chainshirts, shortswords, longswords, and maybe +1 hide armor
oh yo sure!!
thats a heritage?
I go home next weekend
The issue might be beacuse you are on mobile
Mobile has funky ping issues
What is it
<@&828644080443326494> might not be a bad idea to divvy out some money for furnishing rooms
Bruh land speed effects acrob

In [7]:
# Check the value counts to make sure the data is balanced

display(Y_test.value_counts())

# Drop from x and y test until every author has the same number of posts as the smallest author

minPosts = Y_test.value_counts().min()
display(minPosts)

Y_test = Y_test.groupby(Y_test).head(minPosts)
display(Y_test.value_counts())

X_test = X_test[Y_test.index]
display(len(Y_test))
display(len(X_test))

username
author4     371
author2     361
author6     151
author7      86
author13     63
author14     61
author16     52
author10     51
author1      30
author11     15
author15     12
author12      9
Name: count, dtype: int64

9

username
author2     9
author4     9
author7     9
author15    9
author14    9
author10    9
author1     9
author16    9
author6     9
author13    9
author12    9
author11    9
Name: count, dtype: int64

108

108

In [8]:
# Use grid search to find the best hyperparameters

from sklearn.model_selection import GridSearchCV

for i in range(1, 51):
    param_grid = {
        "knn__n_neighbors": [i]
    } 

    KNN_tuned_pipeline = GridSearchCV(KNN_pipeline, param_grid)
    KNN_tuned_pipeline.fit(X_train,Y_train)


    print("Best parameter: {}, CV score = {:.3f}".format(KNN_tuned_pipeline.best_params_,KNN_tuned_pipeline.best_score_))


    print("The testing accuracy with the current parameter is: {:.3f}".format(accuracy_score(KNN_tuned_pipeline.predict(X_test), Y_test)))

Best parameter: {'knn__n_neighbors': 1}, CV score = 0.277
The testing accuracy with the current parameter is: 0.194
Best parameter: {'knn__n_neighbors': 2}, CV score = 0.257
The testing accuracy with the current parameter is: 0.213
Best parameter: {'knn__n_neighbors': 3}, CV score = 0.239
The testing accuracy with the current parameter is: 0.204
Best parameter: {'knn__n_neighbors': 4}, CV score = 0.252
The testing accuracy with the current parameter is: 0.194
Best parameter: {'knn__n_neighbors': 5}, CV score = 0.256
The testing accuracy with the current parameter is: 0.176
Best parameter: {'knn__n_neighbors': 6}, CV score = 0.291
The testing accuracy with the current parameter is: 0.185
Best parameter: {'knn__n_neighbors': 7}, CV score = 0.284
The testing accuracy with the current parameter is: 0.157
Best parameter: {'knn__n_neighbors': 8}, CV score = 0.269
The testing accuracy with the current parameter is: 0.194
Best parameter: {'knn__n_neighbors': 9}, CV score = 0.242
The testing ac

In [9]:
KNN_tuned_pipeline = Pipeline([('tfidf', TfidfVectorizer(min_df=3)), ('knn', KNeighborsClassifier(n_neighbors=1))])
KNN_tuned_pipeline.fit(X_train, Y_train)
# See how the model predicts the usernames of a small set of test messages
# This first message is from author4, the model should get this right since this comes from the training set
print("Train")
message = "Foundry is up if ya'll need to adjust anything"
print(KNN_tuned_pipeline.predict([message]))
# This message is from author6, the model should get this right since this comes from the training set
message = "He thought there was no gnome because he thought I was busy when I was not busy"
print(KNN_tuned_pipeline.predict([message]))
# These next 10 messages are not in the training set, so the model will have to predict the author
# The authors of these messages are written to the right of the message
messages = ["I guess I should maybe not make this character in that case", # 2
 "instead of me having to keep up with DMs, head canons, and docs", # 2
 "Which is why I'm confused at this site I've never seen before lol", # 13
 "fyi if theres pathfinder i cant do it today", # 6
 "Gonna be a little late was studying in the library with sime friends and 1 physics problem took 2 hours", # 4
 "He has spirit on his base attack too though so should be fine", # 4
 "If we go to the capital the buy price may be high enough for fervent though", # 4
 "nah humans dont have a limit to their amount of levels", # 6
 "tldr; I have a giant crab and I hit slightly harder than I did before", # 14
 "Cuz I swear at the end of whatever it was  you were like oh yeah you leveled up" # 7
]
print("Test")

for msg in messages:
    print(KNN_tuned_pipeline.predict([msg]))

Train
['author4']
['author6']
Test
['author2']
['author2']
['author2']
['author7']
['author2']
['author2']
['author4']
['author2']
['author2']
['author2']
