### Test function

In [1]:
import nltk
import spacy
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/leon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
sample_text = "What are you trying to do?  Why can't you just store the \"Range\"?"

In [3]:
nlp = spacy.load('en_core_web_lg')
doc = nlp(sample_text)
for token in doc:
    print(token.text, token.dep_, token.head.i, token.head.text, [child for child in token.children])

What dobj 5 do []
are aux 3 trying []
you nsubj 3 trying []
trying ROOT 3 trying [are, you, do, ?]
to aux 5 do []
do xcomp 3 trying [What, to]
? punct 3 trying [ ]
   6 ? []
Why advmod 13 store []
ca aux 13 store []
n't neg 13 store []
you nsubj 13 store []
just advmod 13 store []
store ROOT 13 store [Why, ca, n't, you, just, Range, ?]
the det 16 Range []
" punct 16 Range []
Range dobj 13 store [the, ", "]
" punct 16 Range []
? punct 13 store []


In [4]:
from spacy import displacy
displacy.render(doc, jupyter = True)

In [5]:
from politeness.scripts.format_input import *
from politeness.scripts.train_model import *

In [6]:
def get_parses(spacy_doc):
    parse = {'deps': [], 'sent': ""}
    try:
        parse_ls = []
        for token in spacy_doc:
            parse_str = ""
            cur_dep = token.dep_
            head_text = token.head.text
            cur_text = token.text
            if cur_dep == 'ROOT':
                cur_dep = cur_dep.lower()
                head_id = 0
                head_text = 'ROOT'
                cur_id = token.i + 1
            else:
                head_id = token.head.i + 1
                cur_id = token.i + 1
            parse_str += cur_dep + "(" + head_text + "-" + str(head_id) + ", " + cur_text + "-" + str(cur_id) + ")"
            parse_ls.append(parse_str)
        parse['deps'] = parse_ls
        parse['sent'] = spacy_doc.text
    except Exception as e:
        print(e)
    return parse

In [7]:
def format_doc(doc, score):
    sents = get_sentences(doc)
    raw_parses = []
    for sent in sents:
        nlp_sent = nlp(sent)
        raw_parses.append(get_parses(nlp_sent))
    parse_dict = {"text": doc,
                "sentences": [],
                "parses": [],
                "score": score}
    for raw in raw_parses:
        parse_dict['sentences'].append(clean_treeparse(raw['sent']))
        parse_dict['parses'].append(raw['deps'])
    return parse_dict

In [8]:
parse_dict = format_doc(sample_text, -0.7)

In [19]:
#clf = train_lgb(full_document, 500)

Splitting Testing and Training Docs...
Saving Testing Docs for Later...
Fitting...
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       217
         1.0       0.57      1.00      0.72       283

    accuracy                           0.57       500
   macro avg       0.28      0.50      0.36       500
weighted avg       0.32      0.57      0.41       500



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Prepration for training

In [23]:
# Load X, y vector
with open('X_full.pkl', 'rb') as f:
    X = pickle.load(f)
with open('y_full.pkl', 'rb') as f:
    y = pickle.load(f)

In [24]:
X.shape

(10956, 796)

In [26]:
type(y)

numpy.ndarray

In [31]:
import numpy as np
print("Count of label = 1: ", np.count_nonzero(y > 0))
print("Count of label = 0: ", (y == 0).sum()) 

Count of label = 1:  6036
Count of label = 0:  4920


#### Split to train and test set using stratified split

In [42]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 42)
X_sm, y_sm = sm.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(
        X_sm, y_sm, test_size=0.33, random_state=42, stratify = y_sm)

In [43]:
print("Count of label = 1 in train data: ", np.count_nonzero(y_train > 0))
print("Count of label = 0 in train data: ", (y_train == 0).sum()) 

Count of label = 1 in train data:  4044
Count of label = 0 in train data:  4044


In [44]:
print("Count of label = 1 in test data: ", np.count_nonzero(y_test > 0))
print("Count of label = 0 in test data: ", (y_test == 0).sum()) 

Count of label = 1 in test data:  1992
Count of label = 0 in test data:  1992


In [53]:
# standardize data
from sklearn.preprocessing import MinMaxScaler, StandardScaler
standardizer = StandardScaler(with_mean = False)
standardizer.fit(X_train)
X_train = standardizer.transform(X_train)
X_test = standardizer.transform(X_test)

In [99]:
# load model packages
import xgboost as xgb
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Load some metrics
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import classification_report

# Load other sklearn packages
from sklearn.model_selection import GridSearchCV

# Load other packages
from tqdm import tqdm

In [100]:
# #save and load model
# filename = 'finalized_model.sav'
# pickle.dump(models_to_train[2], open(filename, 'wb'))
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, y_test)

#### Try several models

In [105]:
def algorithm_pipeline(X_train_data, X_test_data, y_train_data, y_test_data, 
                       model, param_grid, save_name, cv=5, scoring_fit='accuracy', 
                       scoring_test=accuracy_score):
    
    gs = GridSearchCV(
        estimator=model,
        param_grid=param_grid, 
        cv=cv, 
        # n_jobs=4, 
        scoring=scoring_fit,
        verbose=3
    )    
    fitted_model = gs.fit(X_train_data, y_train_data)
    best_model = fitted_model.best_estimator_
    pred = fitted_model.predict(X_test_data)
    score = scoring_test(y_test_data, pred)
    pickle.dump(best_model, open(save_name, 'wb'))
    print('current best estimator is: ', best_model)
    print('current best prediction score is: ', score)
    # show current classification report
    print(classification_report(y_test_data, pred))
    return [best_model, pred, score]

In [106]:
grid_parameters = [
    {# svm
        'C': [0.02, 0.1],
        'kernel': ['linear', 'rbf']
    },
    {# xgb
        'n_estimators': [200, 220, 240, 260],
        'max_depth': [8, 9, 10, 11, 12],
        'subsample': [0.9, 1],
        'colsample_bytree': [0.9, 1],
        'reg_alpha':[0,0.01,0.015,0.02],
        'eta': [0.15, 0.2, 0.25]
    },
    {# random forest
        'n_estimators': [200, 220, 240, 260],
        'max_depth': [8, 9, 10, 11, 12],
        'max_features': [3, 4, 5, 6],
        'min_samples_leaf': [3, 4, 5],
        'min_samples_split': [3, 4, 5]
        
    },
    {# gradient boosting
        'n_estimators': [200, 220, 240, 260],
        'max_depth': [8, 9, 10, 11, 12],
        'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
        'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True),
        'max_features': list(range(1,35,3)),
        'learning_rate': [0.15,0.2,0.25] 
    },
    
]

In [107]:
models_to_train = [svm.SVC(random_state = 88),
                   xgb.XGBClassifier(eval_metric=['logloss','auc','error'],use_label_encoder=False,random_state = 88),        
                   RandomForestClassifier(random_state = 88),
                   GradientBoostingClassifier(random_state = 88)]
model_save_name = ['final_svm.sav', 'final_xgb.sav', 'final_rf.sav', 'final_gb.sav']

In [1]:
models_preds_scores = []
# loop through each model, 4 in this case
for i, model in tqdm(enumerate(models_to_train)):
    params = grid_parameters[i]
    
    result = algorithm_pipeline(X_train, X_test, y_train, y_test,
                                 model, params, model_save_name[i], cv=5)
    models_preds_scores.append(result)

NameError: name 'tqdm' is not defined

#### Load model from sav file

In [109]:
# Try a sample model
svm_model = pickle.load(open('final_svm.sav', 'rb'))
# Predict
svm_result = svm_model.predict(X_test)
# Pring classification report
print(classification_report(y_test, svm_result))

              precision    recall  f1-score   support

         0.0       0.62      0.70      0.65      1992
         1.0       0.65      0.57      0.61      1992

    accuracy                           0.63      3984
   macro avg       0.63      0.63      0.63      3984
weighted avg       0.63      0.63      0.63      3984



In [110]:
# Check xgb model
xgb_model = pickle.load(open('final_xgb.sav', 'rb'))
# Predict
xgb_result = xgb_model.predict(X_test)
# Pring classification report
print(classification_report(y_test, xgb_result))

              precision    recall  f1-score   support

         0.0       0.67      0.66      0.67      1992
         1.0       0.67      0.68      0.67      1992

    accuracy                           0.67      3984
   macro avg       0.67      0.67      0.67      3984
weighted avg       0.67      0.67      0.67      3984



In [111]:
# Check random forest model
rf_model = pickle.load(open('final_rf.sav', 'rb'))
# Predict
rf_result = rf_model.predict(X_test)
# Pring classification report
print(classification_report(y_test, rf_result))

              precision    recall  f1-score   support

         0.0       0.63      0.69      0.66      1992
         1.0       0.66      0.59      0.62      1992

    accuracy                           0.64      3984
   macro avg       0.64      0.64      0.64      3984
weighted avg       0.64      0.64      0.64      3984

