In [1]:
import numpy as np
import re
from sklearn.datasets import load_files
import pickle
import pandas as pd
import spacy

In [2]:
# Read the labelled messages: the "text" column feeds the feature pipeline,
# the "flag" column is used as the classification target.
data = pd.read_csv("inappropriate.csv")
docs, y = data["text"].to_list(), data["flag"].to_list()

In [3]:
# Load the large English spaCy pipeline; the feature cell below only reads
# token lemmas from the documents it produces.
# CONSIDER: a smaller model can probably be used instead.
nlp = spacy.load("en_core_web_lg")

In [4]:
# Bag of lemmas: one row per document, one column per distinct lemma key,
# each cell holding how often that lemma occurs in the document.
#
# NOTE(review): `token.lemma` is spaCy's integer hash of the lemma, so the
# keys (and therefore the column labels / feature_names) are stringified
# hashes, as the displayed frame shows. `token.lemma_` would give readable
# strings, but the keys are left unchanged because feature_names is pickled
# with the classifier in the save cell — confirm with whoever loads that file
# before switching.
lemma_counts = []
for doc in nlp.pipe(docs):  # stream the texts through the pipeline in batches
    counts = {}
    for token in doc:
        key = str(token.lemma)
        counts[key] = counts.get(key, 0) + 1
    lemma_counts.append(counts)

# Build the frame once from the per-document dicts instead of growing it with
# pd.concat inside the loop (which re-copies the whole frame per document).
# Columns follow first-appearance order of the keys; lemmas absent from a
# document become 0, and everything is kept as float like before.
df_count = pd.DataFrame(lemma_counts).fillna(0).astype(float)

feature_names = df_count.columns.values

df_count

Unnamed: 0,3883960749573218104,3109722610398367559,2177690308569554941,14121509715367036122,3791531372978436496,6880656908171229526,7624161793554793053,18307573501153647118,14692702688101715474,17050719033203801181,...,17024108707146051813,6075629963142282131,16230405457545860715,12367442812140488931,12747335289542760454,13402777656386554723,80859674766354010,10937452440342651049,8373539507428251961,16043492340495565256
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
179,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
180,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0


In [5]:
# TF-IDF: re-weight the raw lemma counts, then densify the sparse result.
# NOTE(review): the transformer is fitted on every row before the train/test
# split below, so the IDF weights also see the test documents.
from sklearn.feature_extraction.text import TfidfTransformer

tfidfconverter = TfidfTransformer()
tfidf_sparse = tfidfconverter.fit_transform(df_count)
X = tfidf_sparse.toarray()

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [7]:
from sklearn.svm import SVC

# RBF-kernel support-vector classifier; no other hyperparameters are passed,
# so they stay at the library defaults.
classifier = SVC(kernel="rbf")
classifier.fit(X_train, y_train)

In [8]:
# Predict labels for the held-out test rows and display them.
y_pred = classifier.predict(X_test)
y_pred

array([ True,  True,  True, False, False, False, False, False,  True,
       False, False, False, False,  True, False, False, False, False,
       False, False, False,  True, False, False,  True, False, False,
       False,  True, False, False, False,  True, False, False, False,
        True])

In [9]:
# Evaluation on the held-out split: confusion matrix, per-class
# precision/recall/F1 report, then overall accuracy — printed in that order.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

[[20  4]
 [ 7  6]]
              precision    recall  f1-score   support

       False       0.74      0.83      0.78        24
        True       0.60      0.46      0.52        13

    accuracy                           0.70        37
   macro avg       0.67      0.65      0.65        37
weighted avg       0.69      0.70      0.69        37

0.7027027027027027


In [10]:
# SAVE
# Attach the lemma column labels to the fitted classifier so they are
# pickled together with it, then write the model to 'message_classification'.
# NOTE(review): only the classifier is dumped; the fitted TF-IDF transformer
# is not saved here — presumably a loader has to rebuild that step itself;
# confirm this is intended.
classifier.feature_names = feature_names
with open('message_classification', 'wb') as picklefile:
    pickle.dump(classifier,picklefile)

In [11]:
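# Reload check (kept commented out). The save cell writes the model to
# 'message_classification', so that is the file to read back; only unpickle
# files you trust.
# with open('message_classification', 'rb') as training_model:
#     model = pickle.load(training_model)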