In [17]:
!pip install pandas



In [18]:
# import stuff
import os
import warnings

import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# %matplotlib inline
# import seaborn as sns

warnings.filterwarnings("ignore")

In [19]:
# Load the pcf-general CSV and the two spl-gen CSV parts.
pcf_gen = pd.read_csv("./Data/pcf-general.csv")
spl_gen_1, spl_gen_2 = (
    pd.read_csv(csv_path)
    for csv_path in ("./Data/spl-gen-1.csv", "./Data/spl-gen-2.csv")
)
# Stack the two spl-gen parts row-wise and renumber the index from 0.
spl_gen = pd.concat((spl_gen_1, spl_gen_2), axis=0, ignore_index=True)

In [20]:
# author IDs that occur in both the pcf-general and the spl-gen data
authorID = list(set(pcf_gen['AuthorID']) & set(spl_gen['AuthorID']))

In [21]:
# Training set: only the rows whose author also appears in spl_gen, with the
# Reactions and Attachments columns removed.
# AuthorID is the label and Content is the feature.
pcf_gen_common = (
    pcf_gen
    .loc[pcf_gen['AuthorID'].isin(authorID)]
    .drop(columns=['Reactions', 'Attachments'])
)

In [22]:
# Write one text file per shared author, holding all of that author's
# pcf_gen_common messages joined with single spaces.

# open() does not create missing directories, so make sure the target exists
os.makedirs('./Data/pcf_gen_common/', exist_ok=True)

for author_id in authorID:
    message = pcf_gen_common[pcf_gen_common['AuthorID'] == author_id]['Content'].str.cat(sep=' ')
    # The file is opened with the platform default encoding; the with-block
    # closes it on exit, so no explicit close() is needed.
    with open('./Data/pcf_gen_common/'+str(author_id)+'.txt', 'w') as f:
        try:
            f.write(message)
        except UnicodeEncodeError:
            # Fallback when the message cannot be encoded: drop every
            # non-ASCII character and write what is left.
            message = message.encode('ascii', 'ignore').decode('ascii')
            f.write(message)


In [23]:
# Write one text file per shared author, holding all of that author's
# spl_gen messages joined with single spaces.

# open() does not create missing directories, so make sure the target exists
os.makedirs('./Data/spl_gen/', exist_ok=True)

for author_id in authorID:
    message = spl_gen[spl_gen['AuthorID'] == author_id]['Content'].str.cat(sep=' ')
    # The file is opened with the platform default encoding; the with-block
    # closes it on exit, so no explicit close() is needed.
    with open('./Data/spl_gen/'+str(author_id)+'.txt', 'w') as f:
        try:
            f.write(message)
        except UnicodeEncodeError:
            # Fallback when the message cannot be encoded: drop every
            # non-ASCII character and write what is left.
            message = message.encode('ascii', 'ignore').decode('ascii')
            f.write(message)

In [24]:
# Train data: pcf_gen_common
# Test data: spl_gen

# import stuff
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# %matplotlib inline
# import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import os
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


In [25]:
# read the training data

# os.listdir() returns entries in arbitrary order and may include stray
# non-text entries (e.g. .DS_Store); keep only the .txt files and sort them so
# the row order of the dataframe is the same on every run.
files = sorted(file for file in os.listdir('./Data/pcf_gen_common/') if file.endswith('.txt'))
# the part of each file name before the first '.' is used as the author ID
file_names = [file.split('.')[0] for file in files]
# path of each file relative to the notebook
file_paths = ['./Data/pcf_gen_common/'+file for file in files]

# one row per author file
df = pd.DataFrame({'AuthorID': file_names, 'File_Path': file_paths})

# read the content of each file; the with-block closes the file on exit
content = []
for file in file_paths:
    with open(file, 'r') as f:
        content.append(f.read())

# add the content to the dataframe
df['Content'] = content


In [26]:
# read the test data

# os.listdir() returns entries in arbitrary order and may include stray
# non-text entries (e.g. .DS_Store); keep only the .txt files and sort them so
# the row order of the dataframe is the same on every run.
files = sorted(file for file in os.listdir('./Data/spl_gen/') if file.endswith('.txt'))
# the part of each file name before the first '.' is used as the author ID
file_names = [file.split('.')[0] for file in files]
# path of each file relative to the notebook
file_paths = ['./Data/spl_gen/'+file for file in files]

# one row per author file
df_test = pd.DataFrame({'AuthorID': file_names, 'File_Path': file_paths})

# read the content of each file; the with-block closes the file on exit
content = []
for file in file_paths:
    with open(file, 'r') as f:
        content.append(f.read())

# add the content to the dataframe
df_test['Content'] = content



In [27]:
# Feature set: word n-grams
# Label set: AuthorID
# Model: Naive Bayes
# ML Task 1

# per-author row counts of the training data (class balance check);
# the value is discarded because this is not the last expression of the cell
df['AuthorID'].value_counts()


# distinct author labels of the training frame and of the test frame
authors, authors_test = (list(frame['AuthorID'].unique()) for frame in (df, df_test))

# # create the feature set for the training data
# feature_set = create_feature_set(df)
# # create the feature set for the test data
# feature_set_test = create_feature_set(df_test)

# # create the label set
# label_set = df['AuthorID']

# # create the vectorizer
# vectorizer = CountVectorizer()
# # fit the vectorizer to the training data
# vectorizer.fit(feature_set)
# # transform the training data
# X = vectorizer.transform(feature_set)
# # transform the test data
# X_test = vectorizer.transform(feature_set_test)

In [28]:
# #resample the training data to balance the classes
# df_balanced = pd.DataFrame(columns=['AuthorID', 'File_Path', 'Content'])
# for author in authors:
#     df_balanced = df_balanced.append(df[df['AuthorID'] == author].sample(n=1000, replace=True), ignore_index=True)



In [29]:
# #resample the test data to balance the classes
# df_test_balanced = pd.DataFrame(columns=['AuthorID', 'File_Path', 'Content'])
# for author in authors_test:
#     df_test_balanced = df_test_balanced.append(df_test[df_test['AuthorID'] == author].sample(n=1000, replace=True), ignore_index=True)


In [30]:
# summary statistics of the per-document character length in the training data
content_lengths = df['Content'].str.len()
content_lengths.describe()

count    6.700000e+01
mean     1.097180e+05
std      2.385872e+05
min      1.200000e+01
25%      3.640000e+02
50%      5.845000e+03
75%      8.639950e+04
max      1.246120e+06
Name: Content, dtype: float64

# Character n-grams

In [76]:
# Character-level n-gram counts (2- and 3-grams), with the vocabulary capped at
# the 1000 most frequent n-grams of the training documents.
c_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2,3), max_features=1000)
# learn the vocabulary from the training documents and vectorise them in one call
X = c_vectorizer.fit_transform(df['Content'])

# labels: the author ID of each training document
y = df['AuthorID']



In [77]:
# true labels of the test documents
y_test = df_test['AuthorID']
# test features: transform only, using the vocabulary fitted on the training data
X_test = c_vectorizer.transform(df_test['Content'])

In [78]:
# y = label_set
# y_test = df_test['AuthorID']
def MNB(X, y, X_test, y_test):
    """Fit a multinomial Naive Bayes model and score it on the test data.

    Parameters
    ----------
    X : training feature matrix, passed to ``MultinomialNB.fit``
    y : training labels
    X_test : test feature matrix, passed to ``MultinomialNB.predict``
    y_test : true labels of the test data

    Returns
    -------
    tuple
        ``(accuracy, f1, precision, recall)`` where accuracy is a percentage
        (0-100) and the other three scores use ``average='weighted'``.
    """
    # Imported here because the notebook never imports precision_score or
    # recall_score, and f1_score is only imported by later model cells;
    # without this the return statement raises NameError on a fresh kernel.
    from sklearn.metrics import f1_score, precision_score, recall_score

    # create the model and fit it to the training data
    model = MultinomialNB()
    model.fit(X, y)
    # predict the labels for the test data
    y_pred = model.predict(X_test)

    return (
        accuracy_score(y_test, y_pred)*100,
        f1_score(y_test, y_pred, average='weighted'),
        precision_score(y_test, y_pred, average='weighted'),
        recall_score(y_test, y_pred, average='weighted'),
    )



# # plot the confusion matrix
# plt.figure(figsize=(10, 10))
# sns.heatmap(cm, annot=True, fmt='d')
# plt.ylabel('Actual')
# plt.xlabel('Predicted')
# plt.show()


Accuracy:  34.32835820895522 %
F1 Score:  0.23712208189820128


In [67]:
#Model: KNN

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# NOTE(review): the training frame is built with one row per author file, so
# every label appears to occur once in y while the vote uses 5 neighbours -
# confirm that this k is intended.
# build the classifier and fit it on the training data (fit returns the estimator)
model = KNeighborsClassifier(n_neighbors=5).fit(X, y)
# predicted author for each test document
y_pred = model.predict(X_test)

# accuracy as a percentage
print('Accuracy: ', accuracy_score(y_test, y_pred)*100, '%')

# F1 score, weighted by class support
print('F1 Score: ', f1_score(y_test, y_pred, average='weighted'))

# Confusion matrix (kept in `cm`; the plotting code below is disabled)
cm = confusion_matrix(y_test, y_pred)

# # plot the confusion matrix
# plt.figure(figsize=(10, 10))
# sns.heatmap(cm, annot=True, fmt='d')
# plt.ylabel('Actual')
# plt.xlabel('Predicted')
# plt.show()

Accuracy:  2.9850746268656714 %
F1 Score:  0.01243781094527363


In [56]:
#Model: SVM

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# linear-kernel support vector classifier fitted on the training data
# (fit returns the estimator itself)
model = SVC(kernel='linear').fit(X, y)
# predicted author for each test document
y_pred = model.predict(X_test)

# accuracy as a percentage
print('Accuracy: ', accuracy_score(y_test, y_pred)*100, '%')

# F1 score, weighted by class support
print('F1 Score: ', f1_score(y_test, y_pred, average='weighted'))

# Confusion matrix (kept in `cm`; the plotting code below is disabled)
cm = confusion_matrix(y_test, y_pred)

# # plot the confusion matrix
# plt.figure(figsize=(10, 10))
# sns.heatmap(cm, annot=True, fmt='d')
# plt.ylabel('Actual')
# plt.xlabel('Predicted')
# plt.show()


Accuracy:  7.462686567164178 %
F1 Score:  0.06105834464043419


In [68]:
#Model: Decision Tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# create the model; random_state is fixed so that the fitted tree and the
# scores printed below are the same on every run
model = DecisionTreeClassifier(random_state=42)
# fit the model to the training data
model.fit(X, y)
# predict the labels for the test data
y_pred = model.predict(X_test)

# evaluate the model using accuracy percentage
print('Accuracy: ', accuracy_score(y_test, y_pred)*100, '%')

#evaluate the model using f1 score
from sklearn.metrics import f1_score
print('F1 Score: ', f1_score(y_test, y_pred, average='weighted'))

# Confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# # plot the confusion matrix
# plt.figure(figsize=(10, 10))
# sns.heatmap(cm, annot=True, fmt='d')
# plt.ylabel('Actual')
# plt.xlabel('Predicted')
# plt.show()


Accuracy:  4.477611940298507 %
F1 Score:  0.022174840085287847


In [74]:
#Model: Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# create the model; random_state is fixed so that the forest and the scores
# printed below are the same on every run
model = RandomForestClassifier(n_estimators=100, random_state=42)
# fit the model to the training data
model.fit(X, y)
# predict the labels for the test data
y_pred = model.predict(X_test)

# evaluate the model using accuracy percentage
print('Accuracy: ', accuracy_score(y_test, y_pred)*100, '%')

#evaluate the model using f1 score
from sklearn.metrics import f1_score
print('F1 Score: ', f1_score(y_test, y_pred, average='weighted'))

# Confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# # plot the confusion matrix
# plt.figure(figsize=(10, 10))
# sns.heatmap(cm, annot=True, fmt='d')
# plt.ylabel('Actual')
# plt.xlabel('Predicted')
# plt.show()


Accuracy:  16.417910447761194 %
F1 Score:  0.1189054726368159


In [75]:
#Model: Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# logistic regression with default settings, fitted on the training data
# (fit returns the estimator itself)
model = LogisticRegression().fit(X, y)
# predicted author for each test document
y_pred = model.predict(X_test)

# accuracy as a percentage
print('Accuracy: ', accuracy_score(y_test, y_pred)*100, '%')

# F1 score, weighted by class support
print('F1 Score: ', f1_score(y_test, y_pred, average='weighted'))

# Confusion matrix (kept in `cm`; the plotting code below is disabled)
cm = confusion_matrix(y_test, y_pred)

# # plot the confusion matrix
# plt.figure(figsize=(10, 10))
# sns.heatmap(cm, annot=True, fmt='d')
# plt.ylabel('Actual')
# plt.xlabel('Predicted')
# plt.show()

Accuracy:  22.388059701492537 %
F1 Score:  0.16823124198014452
