In [1]:
import pandas as pd

In [2]:
# Read dataframes
comments_df= pd.read_json("all_comments.json")
replies_df = pd.read_json("all_replies.json")


In [3]:
print(comments_df["translation"].head())
print(replies_df["translation"].head())

0    <b>Dear friends, I have already put the first ...
1    How to scream into the forest... what kind of ...
2                     EQS = Embarrassing Quality Shit.
3    Reminds me of a personal story I had with a ma...
4    I can understand you well. There is probably n...
Name: translation, dtype: object
0    Regarding donations, especially money... the c...
1    @MasterMind187 right Customer is customer, so ...
2    You should always start with something positiv...
3                  Good action - 100 € from me for it.
4    I will share the video on the intranet, curiou...
Name: translation, dtype: object


In [4]:
print(comments_df.shape)
print(replies_df.shape)

(562228, 13)
(430227, 12)


In [5]:
# Remove all rows where the translation string is less than 10 characters long
mask = (comments_df['translation'].str.len() > 10)
comments_df = comments_df.loc[mask]

mask = (replies_df['translation'].str.len() > 10)
replies_df = replies_df.loc[mask]

In [6]:
print(comments_df.shape)
print(replies_df.shape)

(550286, 13)
(417054, 12)


In [7]:
from sklearn import model_selection, svm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

In [8]:
labeled_dataset = pd.read_csv("spam_ham/spam_ham-UCI.csv")

In [9]:
print(labeled_dataset.head)

<bound method NDFrame.head of                                        COMMENT_ID                AUTHOR  \
0     LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU             Julius NM   
1     LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A           adam riyati   
2     LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8      Evgeny Murashkin   
3             z13jhp0bxqncu512g22wvzkasxmvvzjaz04       ElNino Melendez   
4             z13fwbwp1oujthgqj04chlngpvzmtt3r3dw                GsMega   
...                                           ...                   ...   
1951  _2viQ_Qnc6-bMSjqyL1NKj57ROicCSJV5SwTrw-RFFA          Katie Mettam   
1952  _2viQ_Qnc6-pY-1yR6K2FhmC5i48-WuNx5CumlHLDAI  Sabina Pearson-Smith   
1953  _2viQ_Qnc6_k_n_Bse9zVhJP8tJReZpo8uM2uZfnzDs         jeffrey jules   
1954  _2viQ_Qnc6_yBt8UGMWyg3vh0PulTqcqyQtdE7d4Fl0        Aishlin Maciel   
1955  _2viQ_Qnc685RPw1aSa1tfrIuHXRvAQ2rPT9R06KTqA           Latin Bosch   

                            DATE  \
0            2013-11-07T06:20:48 

In [10]:
content = labeled_dataset["CONTENT"]
label = labeled_dataset["CLASS"]

In [11]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(content,label,test_size=0.3)

In [12]:
# Transform categorial data of the string type into numerical values for the model
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [13]:
# Term frequency vectorizer summarizing how often a given word appears within a document
Tfidf_vect = TfidfVectorizer(max_features=4454)
Tfidf_vect.fit(content)
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [14]:
print(Tfidf_vect.vocabulary_)



In [15]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(precision_recall_fscore_support(predictions_SVM, Test_Y, average='micro'))

SVM Accuracy Score ->  96.42248722316864
(0.9642248722316865, 0.9642248722316865, 0.9642248722316865, None)


In [16]:
# Transform categorial data of the string type into numerical values for the model
comments_to_predict = Encoder.fit_transform(comments_df["translation"])
replies_to_predict = Encoder.fit_transform(replies_df["translation"])

In [17]:
# For comments: Term frequency vectorizer summarizing how often a given word appears within a document (4454 = max features of the training data)
Tfidf_vect = TfidfVectorizer(max_features=4454)
Comments_Tfidf = Tfidf_vect.fit_transform(comments_df["translation"])

print(Comments_Tfidf.shape)

(550286, 4454)


In [18]:
comments_SVM = SVM.predict(Comments_Tfidf)

print(len(comments_SVM))

550286


In [19]:
comments_SVM_df = pd.DataFrame(comments_SVM, columns = ['label'])

In [20]:
comments_SVM_df.loc[comments_SVM_df['label'] == 0, 'label'] = "ham"
comments_SVM_df.loc[comments_SVM_df['label'] == 1, 'label'] = "spam"

In [21]:
print(len(comments_df))
print(len(comments_SVM))

550286
550286


In [22]:
# For replies: Term frequency vectorizer summarizing how often a given word appears within a document (4454 = max features of the training data)
Tfidf_vect = TfidfVectorizer(max_features=4454)
Tfidf_vect.fit(replies_df["translation"])
Replies_Tfidf = Tfidf_vect.transform(replies_df["translation"])

In [23]:
replies_SVM = SVM.predict(Replies_Tfidf)

print(replies_SVM)

[0 0 1 ... 0 0 0]


In [24]:
replies_SVM_df = pd.DataFrame(replies_SVM, columns = ['label'])

print(replies_SVM_df.shape)

(417054, 1)


In [25]:
replies_SVM_df.loc[replies_SVM_df['label'] == 0, 'label'] = "ham"
replies_SVM_df.loc[replies_SVM_df['label'] == 1, 'label'] = "spam"

In [26]:
import re

In [27]:
# Remove HTML tags from translation string
CLEANR = re.compile('<.*?>') 

def cleanhtml(raw_html):
  cleantext = re.sub(CLEANR, '', raw_html)
  return cleantext

In [28]:
print(comments_df)
comments_df['translation'] = comments_df.apply(lambda x: cleanhtml(x['translation']), axis=1)
replies_df['translation'] = replies_df.apply(lambda x: cleanhtml(x['translation']), axis=1)

        table_id     video_id  job page       date          author  likes  \
0          10061  kjwiFMxRotQ   85      2022-03-08      Car Maniac    437   
1          10062  kjwiFMxRotQ   85      2022-03-08          Dennis      0   
2          10063  kjwiFMxRotQ   85      2022-03-08       Karl Napp      1   
3          10064  kjwiFMxRotQ   85      2022-03-08           Ed Sa      0   
4          10065  kjwiFMxRotQ   85      2022-03-08      MetalJaska      0   
...          ...          ...  ...  ...        ...             ...    ...   
562223    572284  W1MsL8Z8sIo  144      2022-03-15  Horst Bottesch      2   
562224    572285  W1MsL8Z8sIo  144      2022-03-15       Max Solar      3   
562225    572286  W1MsL8Z8sIo  144      2022-03-15    n8flight2403      2   
562226    572287  W1MsL8Z8sIo  144      2022-03-15       Tim Taler      2   
562227    572288  W1MsL8Z8sIo  144      2022-03-15    Rene Matthes      2   

         published     updated  reply_count                  comment_id  \


In [29]:
comments_SVM_list = comments_SVM_df.values.tolist()
replies_SVM_list = replies_SVM_df.values.tolist()

print(len(replies_SVM_list))
print(len(replies_df))


print(comments_SVM_list)
print(len(comments_df))

comments_df["label"] = comments_SVM_list
replies_df["label"] = replies_SVM_list

comments_df["label"] = comments_df["label"].str[0]
replies_df["label"] = replies_df["label"].str[0]


417054
417054
[['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['spam'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['spam'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['spam'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['spam'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['spam'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['spam'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['spam'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'], ['ham'

In [30]:
print(comments_df)
print(replies_df)

        table_id     video_id  job page       date          author  likes  \
0          10061  kjwiFMxRotQ   85      2022-03-08      Car Maniac    437   
1          10062  kjwiFMxRotQ   85      2022-03-08          Dennis      0   
2          10063  kjwiFMxRotQ   85      2022-03-08       Karl Napp      1   
3          10064  kjwiFMxRotQ   85      2022-03-08           Ed Sa      0   
4          10065  kjwiFMxRotQ   85      2022-03-08      MetalJaska      0   
...          ...          ...  ...  ...        ...             ...    ...   
562223    572284  W1MsL8Z8sIo  144      2022-03-15  Horst Bottesch      2   
562224    572285  W1MsL8Z8sIo  144      2022-03-15       Max Solar      3   
562225    572286  W1MsL8Z8sIo  144      2022-03-15    n8flight2403      2   
562226    572287  W1MsL8Z8sIo  144      2022-03-15       Tim Taler      2   
562227    572288  W1MsL8Z8sIo  144      2022-03-15    Rene Matthes      2   

         published     updated  reply_count                  comment_id  \


In [31]:
print(comments_df[comments_df['label'].isnull()])
print(replies_df[replies_df['label'].isnull()])

Empty DataFrame
Columns: [table_id, video_id, job, page, date, author, likes, published, updated, reply_count, comment_id, comment, translation, label]
Index: []
Empty DataFrame
Columns: [table_id, video_id, job, page, date, parent_id, author, likes, published, updated, comment, translation, label]
Index: []


In [33]:
comments_df.reset_index().to_json("comments_cleaned.json", orient='records')
replies_df.reset_index().to_json("replies_cleaned.json", orient='records')

In [35]:
ham_comments_df = comments_df.drop(comments_df[comments_df["label"] == "spam"].index)
ham_replies_df = replies_df.drop(replies_df[replies_df["label"] == "spam"].index)

In [37]:
print(len(comments_df))
print(len(replies_df))
print(len(ham_comments_df))
print(len(ham_replies_df))

550286
417054
531827
400533


In [38]:
ham_comments_df.reset_index().to_json("comments_cleaned-ham.json", orient='records')
ham_replies_df.reset_index().to_json("replies_cleaned-ham.json", orient='records')