In [0]:
import pandas as pd
import lxml.etree as et

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

from sklearn.model_selection import train_test_split as tts

In [0]:
def ngrams(string, N=5):
  ngrams = []
  for n in range(1, N + 1):
    for i in range(len(string)-n+1):
        ngrams.append(string[i:i+n])
  s = ""
  for w in ngrams:
    s += w + " ";
  return s.strip()

In [0]:
def makeNgramData(filename):
  lan = list()
  with open(filename) as f:
    for line in f:
      lan.append(line.strip())
  lan = pd.Series(lan).apply(ngrams)
  data = {"word": lan, "label": 1}
  return pd.DataFrame(data)

In [0]:
def trainModels(lanFile):
  filename = "./eng2.txt"
  eng = list()
  with open(filename) as f:
    for line in f:
      eng.append(line.strip())
  data = {"word":eng, "label": 0}
  data = pd.DataFrame(data)
  # print(data.head(20))

  data = data.append(makeNgramData(lanFile), ignore_index=True)
  
  # Shuffling rows
  data = data.sample(frac=1.0, random_state=5).reset_index(drop=True)

  
  data_x = data['word']
  data_y = data['label']
  
  # # Testing with 20% data
  # x_train, x_test, y_train, y_test = tts(data_x, data_y, test_size=.2, shuffle=True)

  # cv = CountVectorizer()
  # x_train = cv.fit_transform(x_train)
  # tt = TfidfTransformer()
  # x_train = TfidfTransformer().fit_transform(x_train)

  # models = [MultinomialNB(), LogisticRegression(), MLPClassifier(max_iter=10)]
  # f1_scores = []
  # for model in models:
  #   # print(model, "\n")
  #   model = model.fit(x_train, y_train)
  #   y_pred = model.predict(tt.transform(cv.transform(x_test)))
  #   f1_scores.append(metrics.f1_score(y_test, y_pred, labels=['eng', lanFile[:3]]))
  #   # print(metrics.classification_report(y_test, y_pred, target_names=['English', 'Telugu']))
  # print(lanFile[:3].upper(), "F-scores")
  # print(f1_scores)

  # Fitting on total data.
  cv = CountVectorizer()
  tt = TfidfTransformer()
  data_x = cv.fit_transform(data_x)
  data_x = tt.fit_transform(data_x)
  models = [MultinomialNB().fit(data_x, data_y)]#, LogisticRegression().fit(data_x, data_y), MLPClassifier(n_iter_no_change=5).fit(data_x, data_y)]
  return models, cv, tt #, f1_scores

In [108]:
%%time
bin_models = dict()
bin_models["te_en_"] = trainModels("telugu.txt")
bin_models["ml_en_"] = trainModels("malayalamW.txt")
bin_models["bn_en_"] = trainModels("bengaliW.txt")
bin_models["gu_en_"] = trainModels("gujaratiW.txt")
bin_models["hi_en_"] = trainModels("hindiW.txt")
bin_models["kn_en_"] = trainModels("kannadaW.txt")
bin_models["mr_en_"] = trainModels("maratiW.txt")
bin_models["ta_en_"] = trainModels("tamil.txt")

CPU times: user 9.4 s, sys: 69.8 ms, total: 9.47 s
Wall time: 9.49 s


Sentence level classification

In [0]:
def getC(path):
  '''
  Return sequences from the XML file, un-cleaned 
  '''
  tree = et.parse(path, parser=et.XMLParser(recover=True))
  root = tree.getroot()
  data = []
  for i in root.getchildren():
    data.append(i.text.strip())
  data = [i.split(' ') for i in data]
  return data

In [0]:
import string, numpy as np

langs = ['bn', 'en', 'gu', 'hi', 'kn', 'ml', 'mr', 'ta', 'te']

def getSeqNLbl(ipfile, annotfile):
  '''
  Returns sequences and corresponding Labels without redundant words from the given XML files
  '''

  punctuations = [i for i in string.punctuation]

  annot_ = getC(annotfile)
  ip_ = getC(ipfile)

  _seq= []
  _labels = []
  for i in range(len(annot_)):
    tl = [] # Temp list for labels
    ts = [] # Temp list for sequences

    if len(annot_[i]) == len(ip_[i]):
      for l, word in enumerate(ip_[i]):
        if annot_[i][l] != 'X' and np.all([p not in word for p in punctuations]):
          tl.append(annot_[i][l])
          ts.append(word)
      _labels.append(tl)
      _seq.append(' '.join(ts))

  # remove labels, sequences without language tags or with more than two language tags
  e = enumerate(_labels)
  for i, j in e:
    lan = set()
    for l in j:
      if l in langs:
        lan.add(l)
    if len(lan) < 1 or len(lan) > 2 or (len(lan) == 2 and 'en' not in lan):
      _labels.pop(i)
      _seq.pop(i)
  
  return _seq, _labels

def getLangLbl(lbls):
  lbl =[]
  for i in lbls:
    temp = set()
    for j in i:
      if j in langs:
        temp.add(j)
    temp.add('en')
    temp = list(temp)
    temp.remove('en')
    temp.append('en')
    lbl.append('_'.join(temp) + '_')
  return lbl


# Retrieving data
train_seq, train_labels = getSeqNLbl("InputTraining.txt", "AnnotationTraining.txt")
train_lang_lbls = getLangLbl(train_labels)

test_seq, test_labels = getSeqNLbl("InputTesting.txt", "AnnotationTesting.txt")
test_lang_lbls = getLangLbl(test_labels)

In [121]:
sent_labels = {j:i for i,j in enumerate(set(train_lang_lbls))} 

# Copy of training and testing data
X_train, y_train = train_seq[:], train_lang_lbls[:]
X_test, y_test = test_seq[:], test_lang_lbls[:]

# Transforming 
count_vect = CountVectorizer(ngram_range=(1, 5))
tf_idf_transformer = TfidfTransformer()

X_train_trans = tf_idf_transformer.fit_transform(count_vect.fit_transform(X_train))

# Training and testing MLP model
model = MLPClassifier(hidden_layer_sizes=(120)) 
print("MLP CLassification report\n")
model = model.fit(X_train_trans, y_train)
y_pred = model.predict(tf_idf_transformer.transform(count_vect.transform(X_test)))
print(metrics.classification_report(y_test, y_pred))

MLP CLassification report

              precision    recall  f1-score   support

      bn_en_       0.72      0.70      0.71       139
         en_       0.47      0.94      0.62       148
      gu_en_       0.12      0.06      0.08        16
      hi_en_       0.92      0.61      0.73       223
      kn_en_       0.89      0.65      0.75       101
      ml_en_       0.80      0.20      0.32        20
      mr_en_       0.73      0.93      0.82        29
      ta_en_       0.55      0.96      0.70        24
      te_en_       0.85      0.36      0.50        78

    accuracy                           0.67       778
   macro avg       0.67      0.60      0.58       778
weighted avg       0.75      0.67      0.67       778



In [0]:
num2snt_lbl = dict(enumerate(set(train_lang_lbls)))

X_test_seq = [i.split() for i in X_test]
y_test_lbls = test_labels[:]

y_pred_lbls = []
up = []
for i in range(len(y_pred)):
  snt_lbl = y_pred[i]
  if len(y_test_lbls[i]) == len(X_test_seq[i]):
    if snt_lbl in bin_models:
      modelI = bin_models[snt_lbl]
      model = modelI[0][0]
      pred_seq = model.predict(modelI[2].transform(modelI[1].transform([ngrams(w) for w in X_test_seq[i]])))
      pred_seq = [snt_lbl[:2] if i == 1 else 'en' for i in pred_seq]
    else:
      # For 'en_'
      pred_seq = ['en' for i in range(len(X_test_seq[i]))]
    y_pred_lbls.append(pred_seq)
  else:
    up.append(i)
for i in up: # len(up) is equal to 1
  y_test_lbls.pop(i)

In [123]:
lng2num = {j:i for i, j in enumerate(langs)}
num2lng = dict(enumerate(langs))

# Flattening for calculating metrics
y_test_lbls_flat = []
for lngs in y_test_lbls:
  y_test_lbls_flat += lngs

y_pred_lbls_flat = []
for lngs in y_pred_lbls:
  y_pred_lbls_flat += lngs

# print(metrics.f1_score(y_test_lbls_flat, y_pred_lbls_flat, average=None, labels=langs))

print(metrics.classification_report(y_test_lbls_flat, y_pred_lbls_flat, labels=langs))

              precision    recall  f1-score   support

          bn       0.84      0.76      0.80      1363
          en       0.66      0.96      0.79      3970
          gu       0.41      0.10      0.16       185
          hi       0.90      0.69      0.78      1583
          kn       0.87      0.76      0.82       595
          ml       0.94      0.46      0.62       231
          mr       0.82      0.93      0.87       453
          ta       0.91      0.65      0.76       543
          te       0.83      0.32      0.46       529

   micro avg       0.75      0.79      0.77      9452
   macro avg       0.80      0.63      0.67      9452
weighted avg       0.77      0.79      0.76      9452



In [0]:
!pip install sklearn_crfsuite
import sklearn_crfsuite

In [0]:
# Predicting labels using Binary classifiers for the Training sentences

X_train_seq = [i.split() for i in train_seq]
y_train_lbls = train_labels[:]

y_seq_lbls = []
up = []
for i in range(len(y_train)):
  snt_lbl = y_train[i]
  if len(y_train_lbls[i]) == len(X_train_seq[i]):
    if snt_lbl in bin_models:
      modelI = bin_models[snt_lbl]
      model = modelI[0][0]
      pred_seq = model.predict(modelI[2].transform(modelI[1].transform([ngrams(w) for w in X_train_seq[i]])))
      pred_seq = [snt_lbl[:2] if i == 1 else 'en' for i in pred_seq]
    else:
      # For 'en_'
      pred_seq = ['en' for i in range(len(X_train_seq[i]))]
    y_seq_lbls.append(pred_seq)
  else:
    up.append(i)
for i in up:
  y_train_lbls.pop(i)

In [0]:
def word2features(sent, i):
  word = sent[i]

  features = {
    'bias': 1.0,
    'word.lower()': word.lower(),
  }

  if i > 0:
    word1 = sent[i-1]
    features.update({
      '-1:word.lower()': word1.lower(),
    })
  else:
    features['BOS'] = True

  # if i > 1:
  #   word2 = sent[i-2]
  #   features.update({
  #     '-2:word.lower()': word2.lower(),
  #   })

  if i < len(sent)-1:
    word1 = sent[i+1]
    features.update({
      '+1:word.lower()': word1.lower(),
    })
  else:
    features['EOS'] = True

  # if i < len(sent)-2:
  #   word2 = sent[i+2]
  #   features.update({
  #     '+2:word.lower()': word2.lower()
  #   })

  return features

def sent2features(sent):
  return [word2features(sent, i) for i in range(len(sent))]

In [0]:
# Taking predicted sequences as X and actual sequences as y for crf model training
crfX_train = [sent2features(s) for s in y_seq_lbls]
crfy_train = [[i.encode('ascii', 'ignore') for i in j] for j in y_train_lbls]

crfX_test = [sent2features(s) for s in y_pred_lbls]
crfy_test = y_test_lbls[:]

In [126]:
crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100,all_possible_transitions=True)

crf.fit(crfX_train, crfy_train)

crf_pred = crf.predict(crfX_test)

from sklearn_crfsuite import metrics as m
print(m.flat_classification_report(crfy_test, crf_pred, digits=3, labels=langs))

              precision    recall  f1-score   support

          bn      0.809     0.824     0.816      1363
          en      0.687     0.917     0.785      3970
          gu      0.355     0.119     0.178       185
          hi      0.873     0.692     0.772      1583
          kn      0.877     0.745     0.805       595
          ml      0.926     0.489     0.640       231
          mr      0.828     0.927     0.875       453
          ta      0.799     0.818     0.808       543
          te      0.683     0.406     0.509       529

   micro avg      0.751     0.795     0.772      9452
   macro avg      0.760     0.660     0.688      9452
weighted avg      0.760     0.795     0.764      9452

