In [1]:
# Import the pandas library to help read the data

import pandas as pd

# Read in the anonymized data from data/anon.csv
# This data can be assumed to already be cleaned

df = pd.read_csv("data/anon.csv")

In [2]:
# Split up the data into training and testing sets

X = df['content']
display(X)
Y = df['author.username']
display(Y)

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
display(X_train)
display(Y_train)
display(X_test)
display(Y_test)

0       or did my acid patch thing kill him off my tur...
1                pretty sure it was will but double check
2                                            I think will
3                              Whoever went before Oliver
4       I think it was either me or fable, can't reall...
                              ...                        
6450    Okay so basically I just found out that this A...
6451                                              i think
6452                         Is the 16 before racial mods
6453                                    Aight sounds good
6454          20 point buy can't put a base stat above 16
Name: content, Length: 6455, dtype: object

0       author1
1       author1
2       author2
3       author2
4       author3
         ...   
6450    author5
6451    author6
6452    author4
6453    author4
6454    author5
Name: author.username, Length: 6455, dtype: object

4553                            Westmarch terraria server
2513    2nd thing I have no car + my mom will be at wo...
3049    I would like to acquire a stash of Barbarian chew
1186                                   THAT'S WHAT I SAID
2477                   I have 4 sets of dice for everyone
                              ...                        
3772                      Flavored herbalist or alchemist
5191    Tbf some of your encounter's have sucked ass\n...
5226                                       Summon spells?
5390                          tbf so do animal companions
860     <@&927426307514581042> Today at 6 or tomorrow ...
Name: content, Length: 5164, dtype: object

4553     author2
2513     author4
3049     author4
1186    author10
2477     author5
          ...   
3772     author2
5191     author4
5226     author4
5390     author2
860      author4
Name: author.username, Length: 5164, dtype: object

5030    Yeah the f4 ones been on there a whil I always...
2138    get melee fighter\nteamwork feat fighter\nrang...
2476                         Blue, Green, Red, and Purple
1801                        I don't see anything about it
5738                                In either of the ap's
                              ...                        
1501    And archons do stand in exact opposition to th...
3532                                nah we are sick nasty
5910            Thats much more in line with the minotaur
1703    I'll pick this back up tomorrow when the crowd...
3121    I still think that the feat is better for a lo...
Name: content, Length: 1291, dtype: object

5030     author4
2138     author6
2476     author5
1801    author16
5738     author6
          ...   
1501    author14
3532     author2
5910     author4
1703     author2
3121     author4
Name: author.username, Length: 1291, dtype: object

In [3]:
# Create a pipeline and fit it to the data

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Figure out the TfIdfVectorizer's output dimensions
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
display(X_train_vectorized.shape)

KNN_pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('knn', KNeighborsClassifier(n_neighbors=14))])
KNN_pipeline.fit(X_train, Y_train)
test_accuracy = accuracy_score(Y_test, KNN_pipeline.predict(X_test))
test_accuracy

(5164, 5814)

0.13168086754453912

In [4]:
# Use grid search to find the best hyperparameters

from sklearn.model_selection import GridSearchCV

for i in range(1, 50):
    param_grid = {
        "knn__n_neighbors": [i]
    } 

    KNN_tuned_pipeline = GridSearchCV(KNN_pipeline, param_grid)
    KNN_tuned_pipeline.fit(X_train,Y_train)


    print("Best parameter: {}, CV score = {:.3f}".format(KNN_tuned_pipeline.best_params_,KNN_tuned_pipeline.best_score_))


    print("The testing accuracy with the current parameter is: {:.3f}".format(accuracy_score(KNN_tuned_pipeline.predict(X_test), Y_test)))



Best parameter: {'knn__n_neighbors': 1}, CV score = 0.132
The testing accuracy with the current parameter is: 0.151




Best parameter: {'knn__n_neighbors': 2}, CV score = 0.123
The testing accuracy with the current parameter is: 0.104




Best parameter: {'knn__n_neighbors': 3}, CV score = 0.120
The testing accuracy with the current parameter is: 0.112




Best parameter: {'knn__n_neighbors': 4}, CV score = 0.113
The testing accuracy with the current parameter is: 0.116




Best parameter: {'knn__n_neighbors': 5}, CV score = 0.114
The testing accuracy with the current parameter is: 0.126




Best parameter: {'knn__n_neighbors': 6}, CV score = 0.133
The testing accuracy with the current parameter is: 0.115




Best parameter: {'knn__n_neighbors': 7}, CV score = 0.109
The testing accuracy with the current parameter is: 0.108




Best parameter: {'knn__n_neighbors': 8}, CV score = 0.109
The testing accuracy with the current parameter is: 0.104




Best parameter: {'knn__n_neighbors': 9}, CV score = 0.116
The testing accuracy with the current parameter is: 0.101




Best parameter: {'knn__n_neighbors': 10}, CV score = 0.129
The testing accuracy with the current parameter is: 0.100




Best parameter: {'knn__n_neighbors': 11}, CV score = 0.144
The testing accuracy with the current parameter is: 0.101




Best parameter: {'knn__n_neighbors': 12}, CV score = 0.161
The testing accuracy with the current parameter is: 0.107




Best parameter: {'knn__n_neighbors': 13}, CV score = 0.179
The testing accuracy with the current parameter is: 0.119




Best parameter: {'knn__n_neighbors': 14}, CV score = 0.191
The testing accuracy with the current parameter is: 0.132




Best parameter: {'knn__n_neighbors': 15}, CV score = 0.206
The testing accuracy with the current parameter is: 0.157




Best parameter: {'knn__n_neighbors': 16}, CV score = 0.220
The testing accuracy with the current parameter is: 0.174




Best parameter: {'knn__n_neighbors': 17}, CV score = 0.232
The testing accuracy with the current parameter is: 0.191




Best parameter: {'knn__n_neighbors': 18}, CV score = 0.244
The testing accuracy with the current parameter is: 0.210




Best parameter: {'knn__n_neighbors': 19}, CV score = 0.252
The testing accuracy with the current parameter is: 0.229




Best parameter: {'knn__n_neighbors': 20}, CV score = 0.260
The testing accuracy with the current parameter is: 0.239




Best parameter: {'knn__n_neighbors': 21}, CV score = 0.270
The testing accuracy with the current parameter is: 0.253




Best parameter: {'knn__n_neighbors': 22}, CV score = 0.275
The testing accuracy with the current parameter is: 0.261




Best parameter: {'knn__n_neighbors': 23}, CV score = 0.280
The testing accuracy with the current parameter is: 0.273




Best parameter: {'knn__n_neighbors': 24}, CV score = 0.287
The testing accuracy with the current parameter is: 0.273




Best parameter: {'knn__n_neighbors': 25}, CV score = 0.289
The testing accuracy with the current parameter is: 0.286




Best parameter: {'knn__n_neighbors': 26}, CV score = 0.287
The testing accuracy with the current parameter is: 0.301




Best parameter: {'knn__n_neighbors': 27}, CV score = 0.292
The testing accuracy with the current parameter is: 0.302




Best parameter: {'knn__n_neighbors': 28}, CV score = 0.295
The testing accuracy with the current parameter is: 0.305




Best parameter: {'knn__n_neighbors': 29}, CV score = 0.297
The testing accuracy with the current parameter is: 0.304




Best parameter: {'knn__n_neighbors': 30}, CV score = 0.296
The testing accuracy with the current parameter is: 0.304




Best parameter: {'knn__n_neighbors': 31}, CV score = 0.298
The testing accuracy with the current parameter is: 0.305




Best parameter: {'knn__n_neighbors': 32}, CV score = 0.300
The testing accuracy with the current parameter is: 0.309




Best parameter: {'knn__n_neighbors': 33}, CV score = 0.307
The testing accuracy with the current parameter is: 0.312




Best parameter: {'knn__n_neighbors': 34}, CV score = 0.308
The testing accuracy with the current parameter is: 0.317




Best parameter: {'knn__n_neighbors': 35}, CV score = 0.307
The testing accuracy with the current parameter is: 0.317




Best parameter: {'knn__n_neighbors': 36}, CV score = 0.315
The testing accuracy with the current parameter is: 0.325




Best parameter: {'knn__n_neighbors': 37}, CV score = 0.313
The testing accuracy with the current parameter is: 0.330




Best parameter: {'knn__n_neighbors': 38}, CV score = 0.313
The testing accuracy with the current parameter is: 0.339




Best parameter: {'knn__n_neighbors': 39}, CV score = 0.314
The testing accuracy with the current parameter is: 0.338




Best parameter: {'knn__n_neighbors': 40}, CV score = 0.313
The testing accuracy with the current parameter is: 0.332




Best parameter: {'knn__n_neighbors': 41}, CV score = 0.312
The testing accuracy with the current parameter is: 0.338




Best parameter: {'knn__n_neighbors': 42}, CV score = 0.315
The testing accuracy with the current parameter is: 0.344




Best parameter: {'knn__n_neighbors': 43}, CV score = 0.319
The testing accuracy with the current parameter is: 0.338




Best parameter: {'knn__n_neighbors': 44}, CV score = 0.320
The testing accuracy with the current parameter is: 0.338




Best parameter: {'knn__n_neighbors': 45}, CV score = 0.320
The testing accuracy with the current parameter is: 0.333




Best parameter: {'knn__n_neighbors': 46}, CV score = 0.322
The testing accuracy with the current parameter is: 0.332




Best parameter: {'knn__n_neighbors': 47}, CV score = 0.321
The testing accuracy with the current parameter is: 0.335




Best parameter: {'knn__n_neighbors': 48}, CV score = 0.320
The testing accuracy with the current parameter is: 0.338




Best parameter: {'knn__n_neighbors': 49}, CV score = 0.318
The testing accuracy with the current parameter is: 0.337


In [5]:
# See if the training and testing sets are balanced

display(Y_test.value_counts())

# Create a new dataset by removing all of the authors with less than 20 posts

dfTrimmed = df.groupby('author.username').filter(lambda x: len(x) >= 20)
display(dfTrimmed)

author.username
author4            364
author2            269
author6            149
author5             99
author7             92
author14            74
author13            66
author16            65
author10            37
author1             19
author11            11
author17             7
author15             6
author12             5
author8              5
author25             5
author21             5
author27             3
author.username      2
author29             2
author18             2
author30             2
author28             1
author9              1
Name: count, dtype: int64

Unnamed: 0,author.username,content
0,author1,or did my acid patch thing kill him off my tur...
1,author1,pretty sure it was will but double check
2,author2,I think will
3,author2,Whoever went before Oliver
5,author2,Cause they attacked him after he already died
...,...,...
6450,author5,Okay so basically I just found out that this A...
6451,author6,i think
6452,author4,Is the 16 before racial mods
6453,author4,Aight sounds good


In [6]:
# Split up the data into training and testing sets

X = dfTrimmed['content']
display(X)
Y = dfTrimmed['author.username']
display(Y)

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
display(X_train)
display(Y_train)
display(X_test)
display(Y_test)


0       or did my acid patch thing kill him off my tur...
1                pretty sure it was will but double check
2                                            I think will
3                              Whoever went before Oliver
5           Cause they attacked him after he already died
                              ...                        
6450    Okay so basically I just found out that this A...
6451                                              i think
6452                         Is the 16 before racial mods
6453                                    Aight sounds good
6454          20 point buy can't put a base stat above 16
Name: content, Length: 6291, dtype: object

0       author1
1       author1
2       author2
3       author2
5       author2
         ...   
6450    author5
6451    author6
6452    author4
6453    author4
6454    author5
Name: author.username, Length: 6291, dtype: object

2495                            pizza hut stuffed crust ðŸ¤¤
1890                     Oh fair I guess that could be it
2308                                  i want more seasons
5420    ~~Figure out how to get that monk ability to k...
6250                                          Banish them
                              ...                        
3869     Finding better modules n shit, making maps, etc.
5335                                   nvm on dayjob then
5371      Still working on who I will be to fit the build
5538                           O fuck I forgot about this
884                                             I am good
Name: content, Length: 5032, dtype: object

2495    author10
1890    author13
2308    author10
5420     author4
6250     author2
          ...   
3869    author13
5335     author2
5371     author2
5538     author6
884      author6
Name: author.username, Length: 5032, dtype: object

650                                 it looks like mine is
1765                         With the exception of Legend
5766    Make him a cleric of Cayden Cailean\nbut use t...
5743                                   He'll take it then
2322                                                I see
                              ...                        
5825    idk bout that last one\nmy body is already a t...
529        Foundry is up if ya'll need to adjust anything
1927    Itâ€™s cause both always die in horrible ways an...
296     He thought there was no gnome because he thoug...
3416                                  This is enlightened
Name: content, Length: 1259, dtype: object

650      author1
1765    author16
5766     author6
5743     author5
2322     author4
          ...   
5825     author6
529      author4
1927     author2
296      author6
3416     author4
Name: author.username, Length: 1259, dtype: object

In [7]:
# Check the value counts to make sure the data is balanced

display(Y_test.value_counts())

# Drop from x and y test until every author has the same number of posts as the smallest author

minPosts = Y_test.value_counts().min()
display(minPosts)

Y_test = Y_test.groupby(Y_test).head(minPosts)
display(Y_test.value_counts())

X_test = X_test[Y_test.index]
display(len(Y_test))
display(len(X_test))

author.username
author4     373
author2     273
author6     154
author5      98
author7      82
author13     58
author16     56
author14     56
author10     53
author1      25
author11     15
author15     10
author12      6
Name: count, dtype: int64

6

author.username
author1     6
author16    6
author6     6
author5     6
author4     6
author2     6
author13    6
author7     6
author10    6
author11    6
author14    6
author12    6
author15    6
Name: count, dtype: int64

78

78

In [8]:
# Use grid search to find the best hyperparameters

from sklearn.model_selection import GridSearchCV

for i in range(1, 20):
    param_grid = {
        "knn__n_neighbors": [i]
    } 

    KNN_tuned_pipeline = GridSearchCV(KNN_pipeline, param_grid)
    KNN_tuned_pipeline.fit(X_train,Y_train)


    print("Best parameter: {}, CV score = {:.3f}".format(KNN_tuned_pipeline.best_params_,KNN_tuned_pipeline.best_score_))


    print("The testing accuracy with the current parameter is: {:.3f}".format(accuracy_score(KNN_tuned_pipeline.predict(X_test), Y_test)))

Best parameter: {'knn__n_neighbors': 1}, CV score = 0.141
The testing accuracy with the current parameter is: 0.218
Best parameter: {'knn__n_neighbors': 2}, CV score = 0.161
The testing accuracy with the current parameter is: 0.218
Best parameter: {'knn__n_neighbors': 3}, CV score = 0.119
The testing accuracy with the current parameter is: 0.205
Best parameter: {'knn__n_neighbors': 4}, CV score = 0.118
The testing accuracy with the current parameter is: 0.192
Best parameter: {'knn__n_neighbors': 5}, CV score = 0.111
The testing accuracy with the current parameter is: 0.205
Best parameter: {'knn__n_neighbors': 6}, CV score = 0.107
The testing accuracy with the current parameter is: 0.192
Best parameter: {'knn__n_neighbors': 7}, CV score = 0.103
The testing accuracy with the current parameter is: 0.205
Best parameter: {'knn__n_neighbors': 8}, CV score = 0.100
The testing accuracy with the current parameter is: 0.205
Best parameter: {'knn__n_neighbors': 9}, CV score = 0.098
The testing ac