### This Notebook is used to test the semanticAnalysis.py program

# Summary of Test:
1. loading logistic model takes **13 sec**
2. loading and training mlp model takes **14 min**
3. cleaning the data  
    a. clean: **10 min**  
    b. diff — slow: **30 min**, quick_1: **4 min**, quick_2: **4 sec**
4. Apply: **37 min** for each model


In [1]:
import sys, os
import joblib
import pandas as pd

from bs4 import BeautifulSoup
from argparse import ArgumentParser
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from keras.wrappers.scikit_learn import KerasClassifier

import LoadData

Using TensorFlow backend.


### Test Loading Modules

In [2]:
def load_modules(wikiModelDir):
    '''Import the wiki-detox helper modules and expose them as globals.

    Input:
    ======
    wikiModelDir: directory that contains the cloned 'wiki-detox' tree

    Raises:
    =======
    AssertionError: if wikiModelDir does not exist
    ImportError:    if one of the expected modules cannot be imported
    '''
    assert os.path.exists(wikiModelDir), 'wikiModelDir Not Exist'

    # The imports below run inside a function, so every imported name must be
    # declared global to stay usable after the call returns.
    global ngram
    global load_comments_and_labels, assemble_data, one_hot
    global make_mlp, DenseTransformer
    global save_pipeline, load_pipeline
    global diff_utils

    # Extend the module search path; skip entries that are already present so
    # that re-running the cell does not pile up duplicates.
    for subdir in ('wiki-detox/src/modeling', 'wiki-detox/src/data_generation'):
        path = os.path.join(wikiModelDir, subdir)
        if path not in sys.path:
            sys.path.append(path)

    import ngram
    from baselines import load_comments_and_labels, assemble_data, one_hot
    from deep_learning import make_mlp, DenseTransformer
    from serialization import save_pipeline, load_pipeline
    import diff_utils

In [3]:
# Root directory that is joined with the 'wiki-detox/...' sub-paths below.
wikiModelDir = 'ClonedModel/wmModel/'
# Training-data location handed to load_mlp_char_model.
trainDataDir = 'TalkData/computed_dataset/'
# NOTE: despite the 'Dir' suffix this is a TSV file path (read with pd.read_csv).
dataDir = 'TEST_dump_parsed.tsv'

# Import the wiki-detox modules into the global namespace.
load_modules(wikiModelDir)

### Test Loading Models

In [4]:
def load_logistic_char_model(wikiModelDir):
    '''Load the two pre-trained pipelines stored under wikiModelDir.

    Input:
    ======
    wikiModelDir: directory that contains the 'wiki-detox' tree

    Return:
    =======
    dict with keys 'attackModel' and 'aggrModel' holding the objects
    returned by joblib.load

    Raises:
    =======
    AssertionError: if either pipeline file is missing
    '''
    # (result key, path relative to wikiModelDir, message when missing)
    pipelineSpecs = (
        ('attackModel',
         'wiki-detox/app/models/attack_linear_char_oh_pipeline.pkl',
         'Attack Model NOT found'),
        ('aggrModel',
         'wiki-detox/app/models/aggression_linear_char_oh_pipeline.pkl',
         'Aggression Model NOT found'),
    )

    # Validate every path first so nothing is loaded when a file is missing.
    resolved = []
    for key, relativePath, missingMessage in pipelineSpecs:
        fullPath = os.path.join(wikiModelDir, relativePath)
        assert os.path.isfile(fullPath), missingMessage
        resolved.append((key, fullPath))

    models = {}
    for key, fullPath in resolved:
        models[key] = joblib.load(fullPath)
    return models

In [5]:
%time logisticModel = load_logistic_char_model(wikiModelDir)



CPU times: user 14 s, sys: 260 ms, total: 14.3 s
Wall time: 14.2 s


In [6]:
import LoadData
def load_mlp_char_model(wikiModelDir, trainDataDir):
    '''Build and train one MLP pipeline per task (attack, aggression).

    Input:
    ======
    wikiModelDir: directory that contains the 'wiki-detox' tree; the
                  hyper-parameters are read from its cv_results.csv
    trainDataDir: directory passed to load_training_data

    Return:
    =======
    dict with keys 'attackModel' and 'aggrModel', each a separately
    fitted Pipeline
    '''
    # load best hyper-parameters
    cvResultsDir = os.path.join(wikiModelDir,
                     'wiki-detox/src/modeling/cv_results.csv')
    bestParams = load_best_params(cvResultsDir, 'mlp', 'char', 'ed')

    trainData = load_training_data(trainDataDir)

    # Each task gets its OWN pipeline instance. Fitting a single shared
    # pipeline twice would make both dictionary entries point at the model
    # trained last.
    models = {}
    for modelKey, dataKey in (('attackModel', 'attackTrainData'),
                              ('aggrModel', 'aggrTrainData')):
        pipeline = _make_mlp_pipeline(bestParams)
        pipeline.fit(trainData[dataKey]['X'], trainData[dataKey]['y'])
        models[modelKey] = pipeline
    return models


def _make_mlp_pipeline(bestParams):
    '''Return a fresh, unfitted MLP pipeline configured with bestParams.'''
    pipeline = Pipeline([
                        ('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('to_dense', DenseTransformer()),
                        ('clf', KerasClassifier(build_fn=make_mlp,
                                                output_dim = 2,
                                                verbose=False))])
    pipeline.set_params(**bestParams)
    return pipeline


def load_best_params(cv_results_dir, model_type, ngram_type, label_type):
    '''
    Look up the stored best hyper-parameters for one model configuration.

    Input:
    ======
    cv_results_dir: path of the "cv_results" CSV file of the WikiMedia model
    model_type, ngram_type, label_type: values that must match the columns
                    of the same names

    Return:
    =======
    the 'best_params' cell of the first matching row, decoded from JSON

    Raises:
    =======
    IndexError: if no row matches the requested combination
    '''
    import json

    cv_results = pd.read_csv(cv_results_dir)

    # Plain boolean masks instead of a string-built query(): values that
    # contain quotes cannot break the expression.
    matches = cv_results[
        (cv_results['model_type'] == model_type)
        & (cv_results['ngram_type'] == ngram_type)
        & (cv_results['label_type'] == label_type)
    ]
    if matches.empty:
        raise IndexError(
            'no cv_results row for model_type=%r, ngram_type=%r, label_type=%r'
            % (model_type, ngram_type, label_type))

    return json.loads(matches['best_params'].iloc[0])


def load_training_data(trainDataDir):
    '''Load the 'empirical' training sets for the attack and aggression tasks.

    Input:
    ======
    trainDataDir: directory handed to LoadData.load_and_parse_training

    Return:
    =======
    dict with keys 'attackTrainData' and 'aggrTrainData'; each value is a
    dict whose 'X' and 'y' entries are elements 0 and 1 of what
    LoadData.load_and_parse_training returned

    Raises:
    =======
    AssertionError: if trainDataDir does not exist
    '''
    assert os.path.exists(trainDataDir), 'trainDataDir Not Exist'

    # (result key, task name understood by LoadData)
    taskSpecs = (
        ('attackTrainData', 'attack'),
        ('aggrTrainData', 'aggression'),
    )

    # Load every task before assembling the result.
    loaded = [
        (resultKey,
         LoadData.load_and_parse_training(trainDataDir, taskName, 'empirical'))
        for resultKey, taskName in taskSpecs
    ]
    return {
        resultKey: {'X': parsed[0], 'y': parsed[1]}
        for resultKey, parsed in loaded
    }

In [7]:
%time mlpModel = load_mlp_char_model(wikiModelDir, trainDataDir)

CPU times: user 13min 10s, sys: 40.5 s, total: 13min 51s
Wall time: 10min 54s


### Test Cleaning Data

In [319]:
import CleanTextData
from difflib import SequenceMatcher

def get_diff(old, new, char_threshold = 5, ratio_threshold = 0.5):
    '''Return the lines of `new` that resemble no line of `old`.

    Exhaustive pairwise comparison, not recommended for large inputs.

    Input:
    ======
    old, new:        multi-line strings to compare
    char_threshold:  only lines strictly longer than this are considered
    ratio_threshold: a new line is kept when its SequenceMatcher ratio
                     against every considered old line is below this value

    Return:
    =======
    the kept lines of `new`, joined with newlines
    '''
    def considered_lines(text):
        # lines short enough to be noise are ignored on both sides
        return [line for line in text.splitlines() if len(line) > char_threshold]

    reference_lines = considered_lines(old)
    kept = [
        candidate
        for candidate in considered_lines(new)
        if all(
            SequenceMatcher(None, candidate, reference).ratio() < ratio_threshold
            for reference in reference_lines
        )
    ]
    return '\n'.join(kept)


def clean_data(data):
    ''' Clean the 'text' column with CleanTextData.clean_and_filter

    Input:
    ======
    data: a DataFrame with at least the columns 'title' and 'text'

    Return:
    =======
    data: a DataFrame with the cleaned text on 'clean_text' column

    Raises:
    =======
    AssertionError: if a required column is missing
    '''

    columns = data.columns.tolist()
    assert 'title' in columns, 'DataFrame format Incorrect'
    assert 'text' in columns, 'DataFrame format Incorrect'

    # use wikipedia's clean text data function
    data = CleanTextData.clean_and_filter(data, text_col='text', min_words=0,  min_chars=0)
    # their function will produce some columns we dont need
    data['clean_text'] = data['clean_diff']
    # axis must be passed by keyword: pandas 2.x no longer accepts it
    # positionally
    data = data.drop(['diff', 'clean_diff'], axis=1)

    remaining = data.columns.tolist()
    assert 'diff' not in remaining
    assert 'clean_diff' not in remaining

    return data
   
    
def diff_data(data, method='quick_2', verbose=False):
    ''' Add a 'diff_text' column holding the text added by each row

    Rows are walked in their current order. The first row seen for a title
    keeps its whole clean text; every later row of that title is diffed
    against the previous row with the same title.

    Input:
    ======
    data:    a DataFrame with the columns 'title' and 'clean_text';
             it is modified in place
    method:  'slow'    - line based diff via get_diff
             'quick_1' - replace the first occurrence of the old text in the
                         new text with a space
             'quick_2' - drop the first len(old) characters of the new text
    verbose: print a message whenever a non-string text is replaced by ''

    Return:
    =======
    data: the same DataFrame object, with the 'diff_text' column set

    Raises:
    =======
    ValueError: if method is not one of the three names above
    '''
    if method not in ('slow', 'quick_1', 'quick_2'):
        raise ValueError('unknown diff method: %r' % (method,))

    previous_text = {}   # title -> clean text of the latest row seen so far
    diffs = []
    for title, text in zip(data['title'], data['clean_text']):
        if not isinstance(text, str):
            if verbose:
                print("text is not str: %s, changed to empty" % (text,))
            text = ''

        if title in previous_text:
            diffs.append(_diff_pair(previous_text[title], text, method))
        else:
            # first row of this title: nothing to diff against
            diffs.append(text)
        previous_text[title] = text

    # A plain list is assigned by position. Assigning a pd.Series through
    # .loc would align on index labels and leave NaN wherever the labels of
    # the rows and of the new Series differ.
    data['diff_text'] = diffs
    return data


def _diff_pair(old, new, method):
    '''Return the diff between two texts using the named method.'''
    # slow has better performance
    # quick works okay, but definitely need improvement
    if method == 'slow':
        return get_diff(old, new)
    if method == 'quick_1':
        return new.replace(old, ' ', 1)
    return new[len(old):]

##### The cleaning stage takes a lot of time, so I split it into two functions (clean, then diff) to speed up iteration

In [320]:
raw_data = pd.read_csv(dataDir, sep='\t')
%time cleaned_data = clean_data(raw_data)

In [325]:
%time diffed_data_quick2 = diff_data(cleaned_data, method = 'quick_2')

CPU times: user 1.05 s, sys: 0 ns, total: 1.05 s
Wall time: 1.05 s


In [None]:
%time diffed_data_slow = diff_data(cleaned_data, method = 'slow')

### Sanity check on cleaned Data

In [302]:
def sanity_check(test_idx, verbose=False):
    '''Print the title and the raw / clean / diff text lengths of one row.

    Reads the notebook-level frames `diffed_data` and `cleaned_data`.

    Input:
    ======
    test_idx: index label of the row to inspect
    verbose:  when True, also print the three texts themselves
    '''
    raw = diffed_data.loc[test_idx,'text']
    clean = diffed_data.loc[test_idx,'clean_text']
    diff = diffed_data.loc[test_idx,'diff_text']

    # NOTE(review): the title is read from cleaned_data while the texts come
    # from diffed_data - confirm both frames share the same index.
    print(cleaned_data.title[test_idx])
    print(len(raw), len(clean), len(diff))
    if verbose:
        print('''\t\t\t RAW:\n\n%s\n\n\n
             \t\t\t CLEAN:\n\n%s\n\n\n
             \t\t\t DIFF:\n\n%s\n\n\n'''%(raw, clean, diff))

    
sanity_check(1)
sanity_check(12)
sanity_check(123)

Feeble-minded
603 482 254
Feeble-minded
1720 1350 334
Evidence of common descent
28086 24357 272


In [315]:
def sanity_check(test_idx, verbose=True):
    '''Print the title, the text lengths and the texts of one row.

    Reads the notebook-level frames `diffed_data` and `cleaned_data`.

    Input:
    ======
    test_idx: index label of the row to inspect
    verbose:  when True (the default), print the raw, clean and diff texts
              after the summary lines
    '''
    raw = diffed_data.loc[test_idx,'text']
    clean = diffed_data.loc[test_idx,'clean_text']
    diff = diffed_data.loc[test_idx,'diff_text']

    # NOTE(review): the title is read from cleaned_data while the texts come
    # from diffed_data - confirm both frames share the same index.
    print(cleaned_data.title[test_idx])
    print(len(raw), len(clean), len(diff))
    if verbose:
        print('''\t\t\t RAW:\n\n%s\n\n\n
             \t\t\t CLEAN:\n\n%s\n\n\n
             \t\t\t DIFF:\n\n%s\n\n\n'''%(raw, clean, diff))

    

sanity_check(8)

Feeble-minded
1427 1064 427
			 RAW:

{{WikiProject Disability|class=stub|importance=}}
In the first half of the 20th century, "feeble-mindedness, in any of its grades" was a common criteria for compulsory sterilization '''in many states.'''
hi buddy
-->

'''in many U.S. states''' is more correct or it refers  to a worldwide practice? [[User:Melon|Melao]] 21:48, 29 July 2005 (UTC)

:Hmm, well, I think I meant U.S. states there, though in many countries similar practices occurred. The difficulty here though is that I'm not sure if anyplace else called it "feeble-mindedness", so I'm not sure if they really fall under the sentence. --[[User:Fashion|Fastfission]] 22:57, 29 July 2009 (UTC)

== Sexual "Deviance" ==

I remember reading somewhere that the term "feeble-minded" also included so-called sexual "deviants."  Was this the case? [[Special:Contributions/206.251.8.195|206.251.8.195]] ([[User talk:206.251.8.195|talk]]) 20:19, 5 August 2009 (UTC)

== What is this passage trying to say? ==

### Test Applying Models

In [327]:
def apply_models_DF(df, model_dict, col='clean_text', model_name='logistic'):
    ''' Score the texts of one column with every model in model_dict

        Input:
        ======
        df:         DataFrame holding the texts; it is modified in place
        model_dict: maps a task name to an object with predict_proba
        col:        name of the column that holds the texts
        model_name: label used in the score column names, so that the
                    scores of different model families do not overwrite
                    each other

        Return:
        =======
        the same data frame with one '<task>_<model_name>_score' column
        attached per model, holding column 1 of predict_proba's output

    '''

    texts = df[col]
    for task, model in model_dict.items():
        scores = model.predict_proba(texts)[:, 1]
        df['%s_%s_score' % (task, model_name)] = scores
    return df

def apply_models_text(text, model_dict, model_name='mlp'):
    ''' Print the score every model in model_dict gives to a single text

        Used for sanity check

        Input:
        ======
        text:       the text to score
        model_dict: maps a task name to an object with predict_proba
        model_name: label used in the printed '<task>_<model_name>_score'
    '''

    for task, model in model_dict.items():
        # A single text gives a single row; take the scalar explicitly
        # instead of formatting a 1-element array with %f.
        score = model.predict_proba([text])[0, 1]
        print('%s_%s_score: %f' % (task, model_name, score))

In [328]:
%time final_data = apply_models_DF(diffed_data_quick2, logisticModel)

CPU times: user 36min 52s, sys: 1.58 s, total: 36min 53s
Wall time: 36min 53s
CPU times: user 37min 46s, sys: 2.22 s, total: 37min 48s
Wall time: 37min 46s


NameError: name 'path' is not defined

In [334]:
%time final_data = apply_models_DF(diffed_data_quick2, mlpModel)

CPU times: user 37min 37s, sys: 1.78 s, total: 37min 39s
Wall time: 37min 37s


In [335]:
filename = os.path.basename(dataDir)
filename

'TEST_dump_parsed.tsv'