In [1]:
# Import the pandas library to help read the data

import pandas as pd

# Read in the anonymized data from data/anon.csv
# This data can be assumed to already be cleaned

df = pd.read_csv("data/anon_merged_alts.csv")

In [2]:
# Split up the data into training and testing sets

X = df['content']
display(X)
Y = df['username']
display(Y)

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

0       or did my acid patch thing kill him off my tur...
1                pretty sure it was will but double check
2                                            I think will
3                              Whoever went before Oliver
4       I think it was either me or fable, can't reall...
                              ...                        
6436    Okay so basically I just found out that this A...
6437                                              i think
6438                         Is the 16 before racial mods
6439                                    Aight sounds good
6440          20 point buy can't put a base stat above 16
Name: content, Length: 6441, dtype: object

0       author1
1       author1
2       author2
3       author2
4       author3
         ...   
6436    author2
6437    author6
6438    author4
6439    author4
6440    author2
Name: username, Length: 6441, dtype: object

In [3]:
#Extract alternative features from the data

#Vector of num words
numWordsTrain = []
numWordsTest = []
#Vector of num chars
numCharTrain = []
numCharTest = []
#Vector of avg char/word
avgCharTrain = []
avgCharTest = []

for msg in X_train:
    print(msg)
    #NumWords
    words = msg.count(" ") + 1
    numWordsTrain.append(words)
    #NumChars
    char = len(msg)
    numCharTrain.append(char)
    #AvgChar
    avgCharTrain.append(char / words)
# Display the length of the vectors
print(len(numWordsTrain))
print(len(numCharTrain))
print(len(avgCharTrain))

for msg in X_test:
    #NumWords
    words = msg.count(" ") + 1
    numWordsTest.append(words)
    #NumChars
    char = len(msg)
    numCharTest.append(char)
    #AvgChar
    avgCharTest.append(char / words)

shops will be included
I'll make one tonight. I have yet to sit down and make my anathema and such yet, so it'll all work out and be done tonight. I'll dm a link to ya
You could just walk into room and they have to make perception to know it is you
How long was that bike ride
Goblin campaign, enough said
We should go through places on the way
I was thinking the same thing
It wasnt that he was actually living but that he was treated as being a living creature or something
Who‚Äôs ready for tornado bat
Yeah but idk if asmodeus was in on that bit
For a sec I thought that said "to host my insurance file"
lol lol
My longest has been a slayer in the aformention 1e campaign (Kingmaker)
Level 3 presumably
Yea but it's a steam key
Nvm I could 1 to 4
I'm guessing yes
noah when someone mispells his last name
maybe chance
Kobolds are like hamsters
New map? What?
I think he was referring to the orb having a %
Ah kobold bards must throw down then
I posted a summary of the overall story up to this po

In [4]:
# Add the features to a matrix
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Figure out the TfIdfVectorizer's output dimensions
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

display(X_train_vectorized.shape)
display(len(numWordsTrain))

X_train_features = np.column_stack((numWordsTrain, numCharTrain, avgCharTrain, X_train_vectorized.toarray()))
display(X_train_features.shape)
X_test_features = np.column_stack((numWordsTest, numCharTest, avgCharTest, X_test_vectorized.toarray()))
display(X_test_features.shape)

(5152, 5810)

5152

(5152, 5813)

(1289, 5813)

In [5]:
# Fit a KNN model to the data
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

for i in range(1, 51):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train_features, Y_train)

    # Predict the usernames of the test set
    Y_pred = knn.predict(X_test_features)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(Y_test, Y_pred)
    print(accuracy)

0.2156710628394104
0.19860356865787432
0.20791311093871218
0.21799844840961985
0.24049650892164467
0.24359968968192397
0.26066718386346005
0.2583397982932506
0.2637703646237393
0.26764934057408846
0.26764934057408846
0.27773467804499613
0.2761830876648565
0.28549262994569435
0.29014740108611325
0.2924747866563227
0.28626842513576417
0.2808378588052754
0.2816136539953452
0.2761830876648565
0.2839410395655547
0.2808378588052754
0.28937160589604344
0.2940263770364624
0.2885958107059736
0.2924747866563227
0.2878200155159038
0.28471683475562454
0.27928626842513576
0.2816136539953452
0.273855702094647
0.2800620636152056
0.2730799069045772
0.2761830876648565
0.27851047323506595
0.26764934057408846
0.2699767261442979
0.2637703646237393
0.273855702094647
0.27773467804499613
0.273855702094647
0.2769588828549263
0.27928626842513576
0.27928626842513576
0.2761830876648565
0.27230411171450736
0.27230411171450736
0.2684251357641583
0.27230411171450736
0.2746314972847168


In [6]:
# See if the training and testing sets are balanced

display(Y_test.value_counts())

# Create a new dataset by removing all of the authors with less than 20 posts

dfTrimmed = df.groupby('username').filter(lambda x: len(x) >= 20)
display(dfTrimmed)

username
author4     383
author2     355
author6     138
author7      92
author14     74
author13     73
author16     54
author10     43
author1      30
author11     11
author15     11
author8       5
author12      3
author17      3
author18      2
author30      2
author24      2
author9       2
author27      1
author3       1
author21      1
author29      1
author23      1
author19      1
Name: count, dtype: int64

Unnamed: 0,username,content
0,author1,or did my acid patch thing kill him off my tur...
1,author1,pretty sure it was will but double check
2,author2,I think will
3,author2,Whoever went before Oliver
5,author2,Cause they attacked him after he already died
...,...,...
6436,author2,Okay so basically I just found out that this A...
6437,author6,i think
6438,author4,Is the 16 before racial mods
6439,author4,Aight sounds good


In [7]:
# Split up the data into training and testing sets

X = dfTrimmed['content']
display(X)
Y = dfTrimmed['username']
display(Y)

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

0       or did my acid patch thing kill him off my tur...
1                pretty sure it was will but double check
2                                            I think will
3                              Whoever went before Oliver
5           Cause they attacked him after he already died
                              ...                        
6436    Okay so basically I just found out that this A...
6437                                              i think
6438                         Is the 16 before racial mods
6439                                    Aight sounds good
6440          20 point buy can't put a base stat above 16
Name: content, Length: 6310, dtype: object

0       author1
1       author1
2       author2
3       author2
5       author2
         ...   
6436    author2
6437    author6
6438    author4
6439    author4
6440    author2
Name: username, Length: 6310, dtype: object

In [8]:
# Check the value counts to make sure the data is balanced

display(Y_test.value_counts())

# Drop from x and y test until every author has the same number of posts as the author with the least posts

minPosts = Y_test.value_counts().min()
display(minPosts)

Y_test = Y_test.groupby(Y_test).head(minPosts)
display(Y_test.value_counts())

X_test = X_test[Y_test.index]
display(len(Y_test))
display(len(X_test))

username
author4     371
author2     361
author6     151
author7      86
author13     63
author14     61
author16     52
author10     51
author1      30
author11     15
author15     12
author12      9
Name: count, dtype: int64

9

username
author2     9
author4     9
author7     9
author15    9
author14    9
author10    9
author1     9
author16    9
author6     9
author13    9
author12    9
author11    9
Name: count, dtype: int64

108

108

In [9]:
#Extract alternative features from the data

#Vector of num words
numWordsTrain = []
numWordsTest = []
#Vector of num chars
numCharTrain = []
numCharTest = []
#Vector of avg char/word
avgCharTrain = []
avgCharTest = []

for msg in X_train:
    print(msg)
    #NumWords
    words = msg.count(" ") + 1
    numWordsTrain.append(words)
    #NumChars
    char = len(msg)
    numCharTrain.append(char)
    #AvgChar
    avgCharTrain.append(char / words)
# Display the length of the vectors
print(len(numWordsTrain))
print(len(numCharTrain))
print(len(avgCharTrain))

for msg in X_test:
    #NumWords
    words = msg.count(" ") + 1
    numWordsTest.append(words)
    #NumChars
    char = len(msg)
    numCharTest.append(char)
    #AvgChar
    avgCharTest.append(char / words)

did that a few times last night
It bathroom one sec
Roll off then mfs
Fighters finess is a little nuts
foundry will be up overnight
Pretty much
they're legit
5000 for bag of holding IV (can also store stuff like the cauldron and the tears)
700 onyx for one eye, 75 onyx for fast zombies, 350 onyx for Atlas, and 200 for Carrion
200 to convert the portable hole into a botany/alchemist lab + crafting shop
200 for an altar to Asmodeus to buff these undead when I make them (+2 hp per hd)
Foundry is up if ya'll need to adjust anything
~~Your mother after I finished with her last night~~
Whatever war the found out about with the Goblin
Sending da orphan to college
Lich Phylactery shards and/or the orb
He thought there was no gnome because he thought I was busy when I was not busy
No, it is evolving
maps + non-custom monsters down
You can thank the star god for that
A lot of level 2s so I imagine more will come up soon
no crits
I think if the attack penalties only started at wounded and c

In [10]:
# Add the features to a matrix
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Figure out the TfIdfVectorizer's output dimensions
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

display(X_train_vectorized.shape)
display(len(numWordsTrain))

X_train_features = np.column_stack((numWordsTrain, numCharTrain, avgCharTrain, X_train_vectorized.toarray()))
display(X_train_features.shape)
X_test_features = np.column_stack((numWordsTest, numCharTest, avgCharTest, X_test_vectorized.toarray()))
display(X_test_features.shape)

(5048, 5815)

5048

(5048, 5818)

(108, 5818)

In [11]:
# Fit a KNN model to the data
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

for i in range(1, 51):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train_features, Y_train)

    # Predict the usernames of the test set
    Y_pred = knn.predict(X_test_features)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(Y_test, Y_pred)
    print(accuracy)

0.21296296296296297
0.2222222222222222
0.2037037037037037
0.19444444444444445
0.18518518518518517
0.18518518518518517
0.17592592592592593
0.17592592592592593
0.1574074074074074
0.17592592592592593
0.16666666666666666
0.18518518518518517
0.16666666666666666
0.17592592592592593
0.17592592592592593
0.16666666666666666
0.16666666666666666
0.1574074074074074
0.1574074074074074
0.16666666666666666
0.14814814814814814
0.14814814814814814
0.14814814814814814
0.1388888888888889
0.1388888888888889
0.12962962962962962
0.12962962962962962
0.14814814814814814
0.14814814814814814
0.14814814814814814
0.12962962962962962
0.1388888888888889
0.1574074074074074
0.14814814814814814
0.16666666666666666
0.14814814814814814
0.12962962962962962
0.12962962962962962
0.1388888888888889
0.1574074074074074
0.12962962962962962
0.1388888888888889
0.12962962962962962
0.14814814814814814
0.1574074074074074
0.1388888888888889
0.12962962962962962
0.14814814814814814
0.12037037037037036
0.1388888888888889
