In [1]:
# Import libraries

import pandas as pd

# Load data/Alex-Noah-Discord-Conversation.csv

# In this dataset, the message author is given in column 4 (the fifth column),
# and the message content is given in column 15 (the sixteenth column).
# The dataset contains all the direct messages between Alex and Noah on Discord
# before February 29, 2024. Their usernames in the data are diruslupito and
# gamemaster618.
# NOTE(review): confirm which username belongs to which person.
data = pd.read_csv('data/Alex-Noah-Discord-Conversation.csv')

In [2]:
# Create a new dataframe with cleaned data (no NaN values, no messages with only one word)

# Built in one chain from `data`, so re-running this cell always gives the same result.
# The `.str` accessor counts whitespace-separated words per message; a non-string
# cell yields NaN, which fails the `> 1` test and is dropped instead of raising.
dataCleaned = (
    data
    .dropna(subset=['content'])
    .loc[lambda frame: frame['content'].str.split().str.len() > 1]
)

# Show author and message text before and after cleaning. The columns are selected
# by name (not by position 4 / 15) so the display matches the name-based access
# used everywhere else and does not depend on the CSV's column order.
display(data[['author.username', 'content']])
display(dataCleaned[['author.username', 'content']])

Unnamed: 0,author.username,content
0,diruslupito,"ok he said yes, ill make a gc"
1,diruslupito,I'll ask my roommate if you can join our 422 g...
2,gamemaster618,Yeah this class is gonna suck
3,diruslupito,Might as well do practice problems
4,gamemaster618,Up front near the middle
...,...,...
520,diruslupito,hope the actual test is more clear
521,diruslupito,i guess it must have been meaning for me to se...
522,diruslupito,"troll question: i thought, and am still prett..."
523,diruslupito,Csc


Unnamed: 0,author.username,content
0,diruslupito,"ok he said yes, ill make a gc"
1,diruslupito,I'll ask my roommate if you can join our 422 g...
2,gamemaster618,Yeah this class is gonna suck
3,diruslupito,Might as well do practice problems
4,gamemaster618,Up front near the middle
...,...,...
518,gamemaster618,Hw 5 Q2 mixing me up rn
519,gamemaster618,Yeah that's kinda scuffed
520,diruslupito,hope the actual test is more clear
521,diruslupito,i guess it must have been meaning for me to se...


In [6]:
# Separate the cleaned messages into features and labels, then hold out a test set

from sklearn.model_selection import train_test_split

# Features: the raw message text. Labels: the username of each message's author.
X = dataCleaned['content']
display(X)
Y = dataCleaned['author.username']
display(Y)

# Reserve 20% of the rows for testing; the fixed random_state makes the
# partition reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Show each partition: training text and labels, then test text and labels.
display(X_train, Y_train, X_test, Y_test)

0                          ok he said yes, ill make a gc
1      I'll ask my roommate if you can join our 422 g...
2                          Yeah this class is gonna suck
3                     Might as well do practice problems
4                               Up front near the middle
                             ...                        
518                              Hw 5 Q2 mixing me up rn
519                            Yeah that's kinda scuffed
520                   hope the actual test is more clear
521    i guess it must have been meaning for me to se...
522    troll question: i thought,  and am still prett...
Name: content, Length: 449, dtype: object

0        diruslupito
1        diruslupito
2      gamemaster618
3        diruslupito
4      gamemaster618
           ...      
518    gamemaster618
519    gamemaster618
520      diruslupito
521      diruslupito
522      diruslupito
Name: author.username, Length: 449, dtype: object

24                                    i think so anyways
85                  my live reaction to this information
218    I'm just gonna go to office hours and ask thou...
301         would be funny if its wrong and it trolls me
44     like when i made a dfa that accepted a single ...
                             ...                        
135                     You go to the lecture for today?
328                                            like what
411    For the test make a cheat sheet so you have a ...
508                    Starting state has an arrow to it
130    this one too since it doesnt say the UTM WONT ...
Name: content, Length: 359, dtype: object

24       diruslupito
85       diruslupito
218    gamemaster618
301      diruslupito
44       diruslupito
           ...      
135    gamemaster618
328      diruslupito
411    gamemaster618
508    gamemaster618
130      diruslupito
Name: author.username, Length: 359, dtype: object

344                                   i have finished it
355             Fair enough\nIt pretty straight forward?
149                                       So D P^n D^n A
409    ~~Assuming tests are still open note online lol~~
92                                     before you edited
                             ...                        
513    I think it means to pick the statment that def...
432                   they didnt comment their algorithm
71      oh i realized, i didnt actually use their string
502         wonder when we will get our test scores back
34     It would be funny if they let the cite go offline
Name: content, Length: 90, dtype: object

344      diruslupito
355    gamemaster618
149    gamemaster618
409    gamemaster618
92       diruslupito
           ...      
513    gamemaster618
432      diruslupito
71       diruslupito
502      diruslupito
34     gamemaster618
Name: author.username, Length: 90, dtype: object

In [8]:
# Build a TF-IDF + k-nearest-neighbours pipeline, train it, and score it on the test set

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Step names ('tfidf', 'knn') are what later hyperparameter searches refer to.
KNN_pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
    ]
)
KNN_pipeline.fit(X_train, Y_train)

# Predict authors for the held-out messages and compare with the true labels.
test_predictions = KNN_pipeline.predict(X_test)
test_accuracy = accuracy_score(Y_test, test_predictions)
test_accuracy

0.6

In [9]:
# Use grid search to find the best hyperparameters

from sklearn.model_selection import GridSearchCV

# Candidate neighbour counts. The "knn__" prefix addresses the pipeline step
# named 'knn' inside KNN_pipeline.
param_grid = {
    "knn__n_neighbors": [1, 3, 5, 7, 10]
}

# Cross-validate every candidate on the training data only; the test set is
# not touched until the final evaluation below.
KNN_tuned_pipeline = GridSearchCV(KNN_pipeline, param_grid)
KNN_tuned_pipeline.fit(X_train, Y_train)


print("Best parameter: {}, CV score = {}:".format(KNN_tuned_pipeline.best_params_, KNN_tuned_pipeline.best_score_))


# accuracy_score expects (y_true, y_pred); pass the true labels first so the
# call matches the metric's signature.
tuned_test_predictions = KNN_tuned_pipeline.predict(X_test)
print("The testing accuracy with the best parameter is: {}".format(accuracy_score(Y_test, tuned_test_predictions)))

Best parameter: {'knn__n_neighbors': 10}, CV score = 0.6322378716744914:
The testing accuracy with the best parameter is: 0.6111111111111112
