# Importing Necessary Things

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk import wordpunct_tokenize, word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics
import emoji
import pickle

# Function for Cleaning Messages

In [75]:
sw = stopwords.words('english')
wn = WordNetLemmatizer()

def clean_data(text):
    text = emoji.demojize(text) # convert the emoji to it's textual meaning 
    text = text.lower() # coerce data to lower case
    tokens = wordpunct_tokenize(text) # tokenize individual words
    tokens = [tok for tok in tokens if tok.isalnum()] # removing punctuation
    tokens = [tok for tok in tokens if tok not in sw] # removing stop words
    tokens = [wn.lemmatize(tok) for tok in tokens] # lematizing lyrics - reducing to base words
    return " ".join(tokens)

# Function for creating training and testing. Takes as input:
    - the path to the file where the csv of data is stored
    - the column name for where the messages are stored
    - the column name for where the labels are stored
    - the path to where the training data should be stored
    - the path to where the testing data should be stored
# It returns:
    - a dataframe with testing data
    - a dataframe with training data
# It also: 
    - exports both training and testing data to CSVs

In [12]:
def make_train_test_datasets(filepath, message_column_label, label_column_label, training_data_path, testing_data_path):
    df = pd.read_csv(filepath)
    df[message_column_label] =df[message_column_label].astype(str)
    df = df.dropna()
    df = df[~df[label_column_label].isin(['10', '$0'])]
    df[label_column_label] = df[label_column_label].astype(int)
    df[message_column_label] = df[message_column_label].apply(lambda x: clean_data(x))
    df = df[df[message_column_label] != '']
    df = df[df[message_column_label] != '']
    
    bad_messages = df[df[label_column_label] == 1]
    bad_train = bad_messages.head(int(len(bad_messages)*(70/100)))
    bad_train = bad_train.reset_index(drop=True)
    bad_test = bad_messages.iloc[max(bad_train.index):]

    fine_messages = df[df[label_column_label] == 0]
    fine_train = fine_messages.head(3578)
    fine_train = fine_train.reset_index(drop=True)
    fine_messages = fine_messages.iloc[3579:]
    fine_messages = fine_messages.sample(frac=1).reset_index(drop=True)
    fine_test = fine_messages.head(1535)


    train = pd.concat([bad_train, fine_train], axis=0)
    test = pd.concat([bad_test, fine_test], axis=0)

    train = train.sample(frac=1).reset_index(drop=True)
    test = test.sample(frac=1).reset_index(drop=True)

    train.to_csv(training_data_path)
    test.to_csv(testing_data_path)
    
    return train, test

# A scoring function used to test the acccuracy of the two-pronged approach

In [48]:
def getScores(preds, labels):
    total_same = 0
    total_pos = 0
    total_neg = 0
    true_positives = 0
    false_negatives = 0
    false_positives = 0
    for i in range(len(preds)):
    #     print(preds[i])
    #     print(labels[i])
    #     print()
        if labels[i] == 1:
            total_pos += 1

        if labels[i] == 0:
            total_neg += 1

        if preds[i] == labels[i]:
            total_same += 1

        if preds[i] == 1 and labels[i] == 1:
            true_positives += 1

        if preds[i] == 0 and labels[i] == 1:
            false_negatives += 1

        if preds[i] == 1 and labels[i] == 0:
            false_positives += 1

    recall =  true_positives / (true_positives + false_negatives)
    precision = true_positives / (true_positives + false_positives)
    f1 = 2*((precision*recall)/(precision+recall))
    return total_same / len(preds), recall, precision, f1

# Predicion function used in the real version

In [64]:
def predict(bad_words_filepath, model, cv, messages):
    df = pd.read_csv(bad_words_filepath)
    df = list(df)

    my_dict = {}
    for i in range(len(df)):
        df[i] = df[i].strip()
    for i in df:
        my_dict[i] = 1

    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    
    predictions = []
    for message in messages:
        checked = False
        # check if any of the words automatically imply inappropriate
        for word in message.split():
            for char in word:
                if char in punc:
                    word = word.replace(char, "")
            if word.lower() in my_dict:
                predictions.append(1)
                checked = True
                break
        if checked:
            continue
                
        sample_text = clean_data(message)

        sample_text = [sample_text]
        sample_cv = cv.transform(sample_text)

        sample_df = pd.DataFrame(sample_cv.toarray(), columns = cv.get_feature_names())

        # predict on sample message
        val = model.predict(sample_df)[0]
        predictions.append(val)
    return predictions

# Function to train a random forest classifier
This function will train and test the model until it finds one with an acceptable accuracy. This is necessary because of the random nature of the random forest classifier. 

In [78]:
def train_model(training_data, testing_data, message_column_label, label_column_label, bad_words_filepath):
    best_cv = None
    best_classifier = None
    best_accuracy = 0
    while best_accuracy < 0.82:
        cv = CountVectorizer(max_features = 3000)
        X = cv.fit_transform(training_data[message_column_label]).toarray()
        y = training_data[label_column_label].values

        rf = RandomForestClassifier() 
        rf.fit(X, y)
        
        test_X = testing_data[message_column_label]
        test_y = testing_data[label_column_label].values

        y_hat = predict(bad_words_filepath, rf, cv, test_X)
        print(len(y_hat))
        accuracy, recall, precision, f1 = getScores(y_hat, test_y)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_classifier = rf
            best_cv = cv
    return best_classifier, best_cv, accuracy, recall, precision, f1

# Pickling the model, vectorizer, and dictionary of bad words to be put up on AWS

In [79]:
def pickle_model(model_filename, model, cv_filename, cv, dictionary_filename, dictionary):
    pickle.dump(model, open(model_filename, 'wb'))
    pickle.dump(cv, open(cv_filename, 'wb'))
    pickle.dump(dictionary, open(dictionary_filename, 'wb'))

# Example to get you started

In [40]:
training, testing = make_train_test_datasets("mutual_messages.csv", "Message", "Is Inappropriate", "training.csv", "testing.csv")

In [65]:
classifier, cv, accuracy, recall, precision, f1 = train_model(training, testing, "Message", "Is Inappropriate", "dictionary/bad_words.csv")

3070
3070
3070


In [66]:
print(accuracy)
print(recall)
print(precision)
print(f1)

0.8495114006514658
0.8175895765472313
0.8733472512178149
0.844549125168237


In [74]:
df = pd.read_csv("dictionary/bad_words.csv")
df = list(df)

my_dict = {}
for i in range(len(df)):
    df[i] = df[i].strip()
for i in df:
    my_dict[i] = 1
    
pickle_model("model.pkl", classifier, "cv.pkl", cv, "bad_words", my_dict)