In [9]:
# Import the pandas library to help read the data

import pandas as pd

# Read in the anonymized data from data/anon.csv
# This data can be assumed to already be cleaned
from google.colab import files
uploaded = files.upload()

df = pd.read_csv("anon.csv")

Saving anon.csv to anon.csv


In [11]:
 # Split up the data into training and testing sets

X = df['content']
display(X)
Y = df['username']
display(Y)

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
display(X_train)
display(Y_train)
display(X_test)
display(Y_test)

0        or did my acid patch thing kill him off my tur...
1                 pretty sure it was will but double check
2                                             I think will
3                               Whoever went before Oliver
4        I think it was either me or fable, can't reall...
                               ...                        
12906    Okay so basically I just found out that this A...
12907                                              i think
12908                         Is the 16 before racial mods
12909                                    Aight sounds good
12910          20 point buy can't put a base stat above 16
Name: content, Length: 12911, dtype: object

0        author1
1        author1
2        author2
3        author2
4        author3
          ...   
12906    author5
12907    author6
12908    author4
12909    author4
12910    author5
Name: username, Length: 12911, dtype: object

12486    Will just start at the first time without Lawr...
1145     I'd do +2 Dex +2 Cha cause they are already ge...
9537                                    but not a loophole
4387                                     and im all for it
9745                                             Yeah sure
                               ...                        
11964                                 Then it's 34 stealth
5191     Tbf some of your encounter's have sucked ass\r...
5390                           tbf so do animal companions
860      <@&927426307514581042> Today at 6 or tomorrow ...
7270     I school until 700 mon-thurs\r\nAnd have a mee...
Name: content, Length: 10328, dtype: object

12486     author2
1145      author5
9537      author4
4387     author16
9745      author4
           ...   
11964     author6
5191      author4
5390      author2
860       author4
7270      author6
Name: username, Length: 10328, dtype: object

6243                            bird seed and padded armor
8872     Could still do earlier time if we finish before 6
5577                             that was not present info
11068    Then I played BG3 and realized how much it was...
2910                     Randall misunderstood what i said
                               ...                        
1851                You could just make them word riddles.
4808     The group I’m camping with reaaaaly dragged th...
8201                                           Pvp is hard
3424                                        added context:
9926      unsure exactly how it works but we have this now
Name: content, Length: 2583, dtype: object

6243      author5
8872      author4
5577      author5
11068    author16
2910      author5
           ...   
1851     author18
4808     author14
8201     author13
3424      author1
9926      author2
Name: username, Length: 2583, dtype: object

In [12]:
# Create a pipeline and fit it to the data

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Figure out the TfIdfVectorizer's output dimensions
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
display(X_train_vectorized.shape)

KNN_pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('knn', KNeighborsClassifier(n_neighbors=14))])
KNN_pipeline.fit(X_train, Y_train)
test_accuracy = accuracy_score(Y_test, KNN_pipeline.predict(X_test))
test_accuracy

(10328, 6441)

0.10956252419667054

In [13]:
from sklearn.naive_bayes import MultinomialNB

multi_pipline = Pipeline([('tfidf', TfidfVectorizer()), ('MNB', MultinomialNB())])
multi_pipline.fit(X_train, Y_train)
test_accuracy = accuracy_score(Y_test, multi_pipline.predict(X_test))
test_accuracy


0.4099883855981417

In [5]:
# Use grid search to find the best hyperparameters

from sklearn.model_selection import GridSearchCV

for i in range(1, 50):
    param_grid = {
        "knn__n_neighbors": [i]
    }

    KNN_tuned_pipeline = GridSearchCV(KNN_pipeline, param_grid)
    KNN_tuned_pipeline.fit(X_train,Y_train)


    print("Best parameter: {}, CV score = {:.3f}".format(KNN_tuned_pipeline.best_params_,KNN_tuned_pipeline.best_score_))


    print("The testing accuracy with the current parameter is: {:.3f}".format(accuracy_score(KNN_tuned_pipeline.predict(X_test), Y_test)))



Best parameter: {'knn__n_neighbors': 1}, CV score = 0.713
The testing accuracy with the current parameter is: 0.818




Best parameter: {'knn__n_neighbors': 2}, CV score = 0.450
The testing accuracy with the current parameter is: 0.440




KeyboardInterrupt: 

In [16]:
# See if the training and testing sets are balanced

display(Y_test.value_counts())

# Create a new dataset by removing all of the authors with less than 20 posts

dfTrimmed = df.groupby('username').filter(lambda x: len(x) >= 20)

dfTrimmed = dfTrimmed[dfTrimmed.username != 'author5']
dfTrimmed = dfTrimmed[dfTrimmed.username != 'author25']
display(dfTrimmed)

username
author4            710
author2            553
author6            290
author5            215
author7            187
author14           126
author10           114
author13           112
author16           103
author1             55
author11            23
author15            15
author12            13
author17             7
author28             6
author27             6
author29             6
author8              6
author25             6
author9              6
author23             4
author21             4
author.username      4
author18             4
author20             2
author30             2
author24             1
author3              1
author19             1
author31             1
Name: count, dtype: int64

Unnamed: 0,username,content
0,author1,or did my acid patch thing kill him off my tur...
1,author1,pretty sure it was will but double check
2,author2,I think will
3,author2,Whoever went before Oliver
5,author2,Cause they attacked him after he already died
...,...,...
12903,author4,I still haven't even made my character sheet yet
12904,author4,Yeah np
12907,author6,i think
12908,author4,Is the 16 before racial mods


In [17]:
# Split up the data into training and testing sets

X = dfTrimmed['content']
display(X)
Y = dfTrimmed['username']
display(Y)

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
display(X_train)
display(Y_train)
display(X_test)
display(Y_test)


0        or did my acid patch thing kill him off my tur...
1                 pretty sure it was will but double check
2                                             I think will
3                               Whoever went before Oliver
5            Cause they attacked him after he already died
                               ...                        
12903     I still haven't even made my character sheet yet
12904                                              Yeah np
12907                                              i think
12908                         Is the 16 before racial mods
12909                                    Aight sounds good
Name: content, Length: 11829, dtype: object

0        author1
1        author1
2        author2
3        author2
5        author2
          ...   
12903    author4
12904    author4
12907    author6
12908    author4
12909    author4
Name: username, Length: 11829, dtype: object

1238                              Here's how we fix this
10378                         or would i even be allowed
5989        my character is dogwater at social encounter
9858                             If we ever get that far
10380              she could be Mr. T from resident evil
                              ...                       
12168                                   This spiked mace
5511       Best the worg could get is 14 stealth I think
5733              Unless you get it from a single source
876      Since I forgot to message yall about it earlier
7856                                   had to "nerf" her
Name: content, Length: 9463, dtype: object

1238      author4
10378    author13
5989     author10
9858     author13
10380    author16
           ...   
12168     author2
5511      author2
5733      author2
876       author4
7856      author2
Name: username, Length: 9463, dtype: object

8791                                       Pull the Dalton
916      ||also not permament, just what im working wit...
8690     I like to think he doesn't kill them because t...
8213                       2% battery charge ya damn phone
3027     I ended up going to asheville at 12 and just g...
                               ...                        
11768    ill go through the medium spell list, i forgot...
5221              Next Saturday the plan again? Or Sunday?
7220       There might be a chance I can't show up tonight
12507              Silicon based lifeforms are the problem
11358    <@&868889886839689217> react to let me know wh...
Name: content, Length: 2366, dtype: object

8791      author4
916       author7
8690      author4
8213     author13
3027      author7
           ...   
11768     author1
5221      author4
7220      author6
12507     author4
11358     author2
Name: username, Length: 2366, dtype: object

In [19]:
# Check the value counts to make sure the data is balanced

display(Y_test.value_counts())

# Drop from x and y test until every author has the same number of posts as the smallest author

minPosts = Y_test.value_counts().min()
display(minPosts)

Y_test = Y_test.groupby(Y_test).head(minPosts)
display(Y_test.value_counts())

X_test = X_test[Y_test.index]
display(len(Y_test))
display(len(X_test))

username
author4            3
author21           3
author18           3
author17           3
author12           3
author15           3
author29           3
author27           3
author.username    3
author9            3
author8            3
author7            3
author10           3
author11           3
author16           3
author6            3
author1            3
author14           3
author2            3
author13           3
author28           3
Name: count, dtype: int64

3

username
author4            3
author21           3
author18           3
author17           3
author12           3
author15           3
author29           3
author27           3
author.username    3
author9            3
author8            3
author7            3
author10           3
author11           3
author16           3
author6            3
author1            3
author14           3
author2            3
author13           3
author28           3
Name: count, dtype: int64

63

63

In [None]:
# Use grid search to find the best hyperparameters

from sklearn.model_selection import GridSearchCV

for i in range(1, 20):
    param_grid = {
        "knn__n_neighbors": [i]
    }

    KNN_tuned_pipeline = GridSearchCV(KNN_pipeline, param_grid)
    KNN_tuned_pipeline.fit(X_train,Y_train)


    print("Best parameter: {}, CV score = {:.3f}".format(KNN_tuned_pipeline.best_params_,KNN_tuned_pipeline.best_score_))


    print("The testing accuracy with the current parameter is: {:.3f}".format(accuracy_score(KNN_tuned_pipeline.predict(X_test), Y_test)))

Best parameter: {'knn__n_neighbors': 1}, CV score = 0.683
The testing accuracy with the current parameter is: 0.841
Best parameter: {'knn__n_neighbors': 2}, CV score = 0.630
The testing accuracy with the current parameter is: 0.762
Best parameter: {'knn__n_neighbors': 3}, CV score = 0.227
The testing accuracy with the current parameter is: 0.238
Best parameter: {'knn__n_neighbors': 4}, CV score = 0.174
The testing accuracy with the current parameter is: 0.206
Best parameter: {'knn__n_neighbors': 5}, CV score = 0.168
The testing accuracy with the current parameter is: 0.190
Best parameter: {'knn__n_neighbors': 6}, CV score = 0.176
The testing accuracy with the current parameter is: 0.190
Best parameter: {'knn__n_neighbors': 7}, CV score = 0.139
The testing accuracy with the current parameter is: 0.143
Best parameter: {'knn__n_neighbors': 8}, CV score = 0.136
The testing accuracy with the current parameter is: 0.143
Best parameter: {'knn__n_neighbors': 9}, CV score = 0.127
The testing ac