<a href="https://colab.research.google.com/github/DavinciB/child_grooming_detector/blob/main/ProjectDemo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): the data and model paths used later in this notebook live
# under /content/drive/MyDrive, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
from sklearn import svm
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sklearn import metrics
import heapq
import operator
import numpy as np
from mpl_toolkits.mplot3d import axes3d, Axes3D
from sklearn.svm import LinearSVC
import xml.etree.ElementTree as ET
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
import pickle

In [27]:
def get_labels_dict(data_path):
    """Load the id -> label mapping stored in ``sci_labels.csv``.

    Args:
        data_path: Path prefix (including its trailing separator) that is
            concatenated with ``'sci_labels.csv'`` to locate the file.

    Returns:
        dict mapping the first column of every row to its second column.
        Both are kept as the strings read from the file.

    Raises:
        IndexError: if a non-empty row has fewer than two columns.
    """
    labels_dict = {}
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for file objects passed to csv.reader.
    with open(data_path + 'sci_labels.csv', 'r', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # Blank lines are parsed as [] and used to crash on row[0].
                continue
            labels_dict[row[0]] = row[1]
    return labels_dict

def get_features_labels(root, labels_dict):
    """Build one text document and one integer label per conversation.

    Args:
        root: Iterable of conversation elements. Each conversation is itself
            iterated to obtain message elements, and its ``'id'`` attribute is
            used to look up the label.
        labels_dict: Mapping from conversation id to a value accepted by
            ``int()``.

    Returns:
        ``(corpus, labels)``: two parallel lists. Each corpus entry starts
        with a single space and has every non-empty message text appended
        after a ``"\\r\\n"`` separator. Each label is the ``int`` of the
        conversation's entry in ``labels_dict``.

    Raises:
        KeyError: if a conversation id is not present in ``labels_dict``.
    """
    corpus = []
    labels = []
    for conversation in root:
        # Leading " " keeps the document prefix the original concatenation
        # produced (" " + "\r\n" + text + ...).
        parts = [" "]
        for message in conversation:
            text_node = message.find('text')
            # A message without a <text> child used to raise AttributeError;
            # treat it like a message whose text is empty.
            text = text_node.text if text_node is not None else None
            if text is not None:
                parts.append(text)
        corpus.append("\r\n".join(parts))
        labels.append(int(labels_dict[conversation.get('id')]))
    return corpus, labels

def get_conversation_id(root):
  """Return the ``'id'`` attribute of every child of ``root``, in order.

  Children without an ``'id'`` attribute contribute ``None``.
  """
  return [child.get('id') for child in root]

# Locations of the training corpus/labels, the test labels and the raw test
# corpus on the mounted Drive.
train_data_path = '/content/drive/MyDrive/online-grooming-detector-master/data/svm_training_data/'
training_xml = ET.parse(train_data_path + 'training_data.xml')
train_root = training_xml.getroot()
test_data_path = '/content/drive/MyDrive/online-grooming-detector-master/data/svm_test_data/'
test_data_src = '/content/drive/MyDrive/online-grooming-detector-master/data/pan12-sexual-predator-identification-test-corpus-2012-05-21/'
test_xml = ET.parse(test_data_src + 'pan12-sexual-predator-identification-test-corpus-2012-05-17.xml')
test_root = test_xml.getroot()

# One text document and one integer label per conversation, plus the test
# conversation ids in the same order as the test documents.
train_corpus, train_labels = get_features_labels(train_root, get_labels_dict(train_data_path))
test_corpus, test_labels = get_features_labels(test_root, get_labels_dict(test_data_path))
test_conversations = get_conversation_id(test_root)

# The TF-IDF vocabulary is fitted on the training corpus and then applied to
# the test corpus.
# NOTE(review): presumably this must reproduce the feature space the pickled
# model was trained on - confirm against the training notebook.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_corpus)
X_test = vectorizer.transform(test_corpus)
X_test = scipy.sparse.csr_matrix(X_test)
y_test = np.array(test_labels)

filename = '/content/drive/MyDrive/online-grooming-detector-master/models/GAI_SVM.sav'
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files from a trusted source.
# The context manager closes the file handle, which the bare open() leaked.
with open(filename, 'rb') as model_file:
    loaded_model_GAI = pickle.load(model_file)
pred_y = loaded_model_GAI.predict(X_test)
print(metrics.accuracy_score(y_test, pred_y))
occurrences = np.count_nonzero(pred_y == 1)
print(occurrences)
# Show at most the first 250 predictions; the bound prevents an IndexError
# when the test set holds fewer conversations.
for i in range(min(250, len(test_conversations))):
    print("{} {}".format(test_conversations[i], pred_y[i]))


0.9851026249290908
2698
affc2df0951b733d14ba92d19d9b7695 0
de15188e9fd515ed817a0b34546be902 1
17784c5a093477c1706b1a68cea7c802 0
d55846cb89c0fbf5e177b0b0d499023e 0
85f0abac6ef5a2a23814a2ced73b5fb7 0
80e3c3978ea07f46819f1f945cb04949 0
15a38614944f5aa8bbb59d4153ecc12a 0
e6287e57c448a16a1ae5e2e909695d09 0
375d3b23561d0c5efd7e2883b9646fd6 0
0b466b3b32d51ae11dbbfe0555101aae 0
db1b0c9e79572b647386c80ce71b4b5b 0
67bad431706277feac87a8df9d2ebcf9 0
031b0d1a9da5032805285b3515a8b2af 0
f251b1dd5ca388c3e7edc6e30750a358 0
d5e6465b5698114b451d11b1f4a1fb81 0
26635f6a6b1272dd14c138703e57134a 0
9b317a61e8822eabe2d9249fa1ad5eef 0
23aeaf705a589bde762bfeada5d25136 0
acc07071061f3dfd58d6f916749a534e 0
437ecde1407d29f2be6712381d17922a 0
f0fe735cd1ad50c431f87a4c88518ab6 0
fb40d59d5e80409915cd873f33d89d6f 0
ee0aba2365c16a30cb36d1057a390b16 0
ca1b963746e368a4cec38ab4be3fdcf7 0
665eeb427170b16ab6eb6e2c446d53a1 0
460631341561030a487180a948790387 0
942a096a173bed5064887bee8393747e 0
4d278fb90d5b6d648ff8a5750a932da

In [29]:
# List the id of every conversation predicted as class 1 among the first
# 10000 test conversations. The bound is clamped to the data actually
# available so a smaller test set no longer raises IndexError.
for i in range(min(10000, len(pred_y), len(test_conversations))):
  if pred_y[i] == 1:
    print("{} {}".format(test_conversations[i], pred_y[i]))

de15188e9fd515ed817a0b34546be902 1
a13aeec58a2b099de6ef50ffcd861334 1
3763edf61689c00884dba353dba5352e 1
5fd32399bbb0b3398f6d3aa619f53e33 1
43b0ed08eb9105ceb0271ae70f46c950 1
ebff860be53128280a73f171e184126f 1
990c846a48b78567f57f2efc1e1cbf61 1
3359a827513753359e43444affc28632 1
73655da9f71dc4049e2975d6b98ca06d 1
6eab795c5f6a9d822d25a2b153736799 1
7c0004d2d9aa198bc0f920a2ed397d6b 1
003ec8f1205bd22223642cbc421c24d2 1
2b26b41929eb8414ee81e1ac1b0c9b93 1
fcdf2c1308dadce5681a417c60473451 1
15425bb7dde8d4ec9e4c6146f5db195e 1
a0710c9450b83729cee263204c0aca34 1
42d17148243c2eaad95e52aa794433d4 1
e3e2daa1b3db76a3bd2c8710291584c5 1
54fd82ea8269ee146af424f0a94a9520 1
fa9875f8a2faaa458fe791931da766d9 1
0637d6abd4e2622c6e57df2d28aadd5d 1
ddcf0872cfa5af51d6cb4998cf164e40 1
71fd95e5779c0017dc104bd46d183ed9 1
14a4096fbfe7260e305c1b477a42171b 1
cc5fba01f752fae4846aed8f26731b7b 1
4406aad767720e999464dea17ff91826 1
7ff9c3cee046b95c554e895bfe866e21 1
23a3e3e4a64a28fa92a74c50b5b1e7ee 1
bfcf89e873d143b338dd

In [14]:
# Start this cell from empty containers so nothing left over from an earlier
# cell is reused; each name gets its own fresh list.
test_corpus, test_labels, pred_y = [], [], []
def get_susp_conv_dict(data_path):
    """Load the conversation-id -> label mapping stored in ``sci_labels.csv``.

    Args:
        data_path: Path prefix (including its trailing separator) that is
            concatenated with ``'sci_labels.csv'`` to locate the file.

    Returns:
        dict mapping the first column of every row to its second column, both
        kept as strings. The caller in this cell compares the values against
        the string ``'0'``.

    Raises:
        IndexError: if a non-empty row has fewer than two columns.
    """
    labels_dict = {}
    # newline='' is the mode the csv module documents for reader input.
    with open(data_path + 'sci_labels.csv', 'r', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # A blank line yields [] and previously crashed on row[0].
                continue
            labels_dict[row[0]] = row[1]
    return labels_dict

def get_predators_dict(file):
    """Read a predator id list into a membership dict.

    Args:
        file: Path of a text/CSV file whose first column holds one id per row.

    Returns:
        dict mapping each id found in the first column to ``1``. Callers only
        test membership with ``in``.
    """
    all_predators = {}
    with open(file, 'r', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # Blank or trailing empty lines parse as [] and used to raise
                # IndexError on row[0].
                continue
            all_predators[row[0]] = 1
    return all_predators
            
def get_features_labels(root, labels_dict, all_predators):
    """Build one document per (conversation, author) pair with a +1/-1 label.

    Conversations whose entry in ``labels_dict`` equals the string ``'0'`` are
    skipped. For every remaining conversation, the non-empty message texts of
    each author are joined with single spaces into one document.

    Args:
        root: Iterable of conversation elements carrying an ``'id'`` attribute;
            their children are messages with ``author`` and ``text`` children.
        labels_dict: Mapping from conversation id to a label string.
        all_predators: Container supporting ``in``; authors found in it are
            labelled ``1``, all others ``-1``.

    Returns:
        ``(corpus, labels, author_list)``: three parallel lists.
    """
    corpus, labels, author_list = [], [], []
    for conversation in root:
        if labels_dict[conversation.get('id')] != '0':
            # author -> list of that author's texts, in message order
            texts_by_author = {}
            for message in conversation:
                author = message.find('author').text
                text = message.find('text').text
                if text is None:
                    continue
                texts_by_author.setdefault(author, []).append(text)
            for author, pieces in texts_by_author.items():
                corpus.append(" ".join(pieces))
                author_list.append(author)
                labels.append(1 if author in all_predators else -1)
    return corpus, labels, author_list



# Locations of the training corpus, the test labels and the raw test corpus
# on the mounted Drive.
train_data_path = '/content/drive/MyDrive/online-grooming-detector-master/data/svm_training_data/'
training_xml = ET.parse(train_data_path + 'training_data.xml')
train_root = training_xml.getroot()

test_data_path = '/content/drive/MyDrive/online-grooming-detector-master/data/svm_test_data/'
test_data_src = '/content/drive/MyDrive/online-grooming-detector-master/data/pan12-sexual-predator-identification-test-corpus-2012-05-21/'
test_xml = ET.parse(test_data_src + 'pan12-sexual-predator-identification-test-corpus-2012-05-17.xml')
test_root = test_xml.getroot()

# Predator id lists used to label each author as 1 (listed) or -1 (not listed).
pred_train_file_path = '/content/drive/MyDrive/online-grooming-detector-master/data/pan12-sexual-predator-identification-training-corpus-2012-05-01/pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt'
pred_test_file_path = '/content/drive/MyDrive/online-grooming-detector-master/data/pan12-sexual-predator-identification-test-corpus-2012-05-21/pan12-sexual-predator-identification-groundtruth-problem1.txt'
train_corpus, train_labels, train_authors = get_features_labels(train_root, get_susp_conv_dict(train_data_path), get_predators_dict(pred_train_file_path))
test_corpus, test_labels, test_authors = get_features_labels(test_root, get_susp_conv_dict(test_data_path), get_predators_dict(pred_test_file_path))


# Unigram + bigram TF-IDF features, fitted on the training corpus and then
# applied to the test corpus.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
X_train = vectorizer.fit_transform(train_corpus)
X_test = vectorizer.transform(test_corpus)
X_test = scipy.sparse.csr_matrix(X_test)
y_test = np.array(test_labels)

filename = '/content/drive/MyDrive/online-grooming-detector-master/models/PI_SVM.sav'
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files from a trusted source.
# The context manager closes the file handle, which the bare open() leaked.
with open(filename, 'rb') as model_file:
    loaded_model_PI = pickle.load(model_file)
pred_y = loaded_model_PI.predict(X_test)
print(metrics.accuracy_score(y_test, pred_y))
occurrences = np.count_nonzero(pred_y == 1)
print(occurrences)
# NOTE(review): despite the header text, the loop below prints every author in
# the first quarter of the list together with the predicted label (1 or -1),
# not only the authors predicted as 1.
print("Suspicious authors are")
for i in range(len(test_authors)//4):
    print("{} {}".format(test_authors[i], pred_y[i]))

0.8505706134094151
3207
Suspicious authors are
b6fe182274453b707870b16e5d2ad562 -1
4a9332d7466b98d11c23e4447b26460a 1
a8e6e3985a82dfde8ee95b5f099ec606 -1
e03aa9707bd13f180c517ae1a47e9da2 -1
c05dab2bc28c161ccef69000520dc050 -1
c92808e9cfb0834a62a670e613637377 1
b7ef10a3deaa560a5d5dcad89a941e41 -1
60550136cc7137c7bce945ae1fd82967 1
8956fe7bf020de2f0e9593443b3d9d1e -1
941dd80263686adb071df2172badd426 1
fce23ce4bcc7bcdef65385dca0575523 -1
54b595f1920b5b1988e907ea693303b4 -1
c3e302119676d8e7ddd7ba5791d2876a 1
8c4078d55ba07096949e82f0993a423b -1
ed246e489407df944749dc0870274679 1
2eba3cbb71e6ea5af3ede4d7b898f99d 1
3e4a51f98397c7b41ea8eafa7d0f6a12 -1
fce23ce4bcc7bcdef65385dca0575523 1
b03efd14f0f503f604facbdb66aa8065 -1
3b134d48a7081997ca2f5a1246756362 -1
6be1cba45fefdf24824c32dafb09cdc1 -1
13396578cb61bd3ccd2b13c1650be421 1
f3623baecefb4518f4a96244666575a0 -1
d2cd98d625d8f8d91f78497efd39a74f 1
6e8c876c80a2ce6412b4ea28715c7ca1 -1
c938fccbd1690526f6045b28820c0a48 1
e2f00473c1d8bc8331b29ab36e99