In [1]:
# importing packages
import pandas as pd
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# sklearn packages
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier
import xgboost as xgb

# nltk packages
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
nltk.download('punkt')
from string import punctuation
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from keras.preprocessing.text import Tokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [4]:
def sentences_to_tags(sentences, stype):
    stc_tokd = sentences['text'].map(nltk.word_tokenize)
    stc_tagd = stc_tokd.map(nltk.pos_tag)
    stc_as_tags = [[tup[1] for tup in sentence] for sentence in stc_tagd]
    stc = pd.DataFrame()
    stc['as_tags'] = stc_as_tags
    stc['type'] = stype
    return stc

In [3]:
def get_convert_and_tag(data, tags, stype):
    stc_tags = tags[tags['tag'] == stype]
    stc = data[data['id'].isin(stc_tags['id'])]
    stc = sentences_to_tags(stc, stype)
    return stc

In [5]:
data = pd.read_csv('eng_sentences.tsv', sep = '\t', names = ['id', 'lang', 'text'])

In [6]:
tags = pd.read_csv('tags.csv', sep = '\t', names = ['id', 'tag'])

In [7]:
stypes = ['present simple', 'present continuous', 'past simple', 'present perfect', 'present perfect continuous', 'future simple']

tags_and_type = pd.concat([get_convert_and_tag(data, tags, i) for i in stypes])

tags_and_type.to_csv('tnt.csv', index = False)

In [8]:
tokenizer = Tokenizer(num_words = 50, filters = '')
tokenizer.fit_on_texts(tags_and_type['as_tags'])
Xfs = tokenizer.texts_to_matrix(tags_and_type['as_tags'], mode = 'count')
yfs = tags_and_type['type']

In [29]:
yfs_with_numbers = np.copy(yfs)
for i in range(len(yfs_with_numbers)):
  if (yfs_with_numbers[i] == 'present simple'):
    yfs_with_numbers[i] = 0
  if (yfs_with_numbers[i] == 'present continuous'):
    yfs_with_numbers[i] = 1
  if (yfs_with_numbers[i] == 'past simple'):
    yfs_with_numbers[i] = 2
  if (yfs_with_numbers[i] == 'present perfect'):
    yfs_with_numbers[i] = 3
  if (yfs_with_numbers[i] == 'present perfect continuous'):
    yfs_with_numbers[i] = 4
  if (yfs_with_numbers[i] == 'future simple'):
    yfs_with_numbers[i] = 5

In [30]:
yfs_with_numbers

array([0, 0, 0, ..., 5, 5, 5], dtype=object)

In [32]:
train_X, test_X, train_y, test_y = train_test_split(Xfs, yfs_with_numbers, test_size=0.3, stratify=yfs_with_numbers, random_state=0)

In [35]:
clf = xgb.XGBClassifier(random_state=42, seed=2, colsample_bytree=0.6, subsample=0.7)

In [37]:
from sklearn.pipeline import Pipeline, FeatureUnion
pipe = Pipeline([('clf',clf)])

In [33]:
param_grid = {
     'clf__n_estimators': [50,100,300],
     
}

In [39]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator = pipe, param_grid = param_grid, 
                          cv = 3, n_jobs = 1, verbose = 0, return_train_score=True)

In [42]:
train_y.astype(int)

array([0, 0, 0, ..., 0, 0, 0])

In [43]:
grid_search.fit(train_X, train_y.astype(int))

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('clf',
                                        XGBClassifier(base_score=0.5,
                                                      booster='gbtree',
                                                      colsample_bylevel=1,
                                                      colsample_bynode=1,
                                                      colsample_bytree=0.6,
                                                      gamma=0,
                                                      learning_rate=0.1,
                                                      max_delta_step=0,
                                                      max_depth=3,
                                                      min_child_weight=1,
                                                      missing=None,
                                                      n_estimators=100,
                

In [45]:
grid_search.cv_results_['mean_train_score']

array([0.95349412, 0.95990517, 0.96580087])

In [46]:
grid_search.cv_results_['mean_test_score']

array([0.95213358, 0.95741084, 0.96013193])

In [44]:
grid_search.best_params_

{'clf__n_estimators': 300}

In [47]:
clf_test = grid_search.best_estimator_

In [48]:
preds = clf_test.predict(test_X)

In [49]:
def print_stats(preds, target, labels, sep='-', sep_len=40, fig_size=(10,8)):
    print('Accuracy = %.3f' % metrics.accuracy_score(target, preds))
    print(sep*sep_len)
    print('Classification report:')
    print(metrics.classification_report(target, preds))
    print(sep*sep_len)
    print('Confusion matrix')
    cm=metrics.confusion_matrix(target, preds)
    cm = cm / np.sum(cm, axis=1)[:,None]
    sns.set(rc={'figure.figsize':fig_size})
    sns.heatmap(cm, 
        xticklabels=labels,
        yticklabels=labels,
           annot=True, cmap = 'YlGnBu')
    plt.pause(0.05)

In [50]:
print_stats(test_y, preds, clf_test.classes_)

ValueError: ignored

In [63]:
headers = list(test_X)
# input_variable = pd.DataFrame(test_X[50],
#                                 columns=headers, 
#                                 dtype=object,
#                                 index=['input'])
prediction = clf_test.predict(test_X[50])
print("Prediction: ", prediction)

ValueError: ignored

In [59]:
from sklearn.utils.multiclass import type_of_target
type_of_target(test_y.astype(int))

'multiclass'

In [61]:
Xfs[5]

array([0., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])