In [1]:
import pandas as pd
import numpy as np
import csv
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [2]:
from pathlib import Path
import re
import operator as op
from collections import Counter


def read_file(file_path, new_filename):
    file_path = Path(file_path)
    new_file = open(new_filename, 'w')
    raw_text = file_path.read_text().strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []

    filler = []
    pet = []
    ctr=1
    for doc in raw_docs:
        tokens = []
        tags = []
        repeat = 0
        for line in doc.split('\n'):
            if len(line.strip()) == 0:
                continue
            try: 
                token, tag = line.split('\t')
            except:
                continue
            if tag[tag.find('-')+1:] == 'filler_R':
                filler.append(token)
                tag = 'O'
            elif tag[tag.find('-')+1:] == 'pet_R':
                pet.append(token)
                tag = 'O'
            elif tag[tag.find('-')+1:] == 'repeat_R':
                tag = 'O'
                repeat = 1
            if repeat == 1 and tag[tag.find('-')+1:] == 'Alteration':
                tag = 'O'
            tokens.append(token)
            tags.append(tag)
            new_file.write("Sentence: "+str(ctr)+'\t'+token+'\t'+tag+'\n')

        token_docs.append(tokens)
        tag_docs.append(tags)
        ctr=ctr+1
        new_file.write('\n')

    print(len(filler), len(set(filler)), set(filler))
    print(len(pet), len(set(pet)), set(pet))

    return filler, pet

In [3]:
filler_train, pet_train = read_file('/path/to/train/data/', 'train.tsv')
dev_texts, pet_test = read_file('/path/to/dev/data/', 'dev.tsv')

673 43 {'హే', 'హహ...', 'ఎస్', 'విధంగా', 'హ..', 'నా', 'య', 'అదేవిధంగా', '...', 'సీ', ',', 'నో', 'ఆ.', 'హహహ', 'లాట్సాప్', 'ఇది', 'హ్మ్మ్', '..', 'ఏంటి', 'హహ', 'ఈ', '.', 'అంటే', 'సో', 'అసలు', 'ఆహ్', 'అది', 'హ.', 'ఓ', 'హహ.', 'యు', 'ఉహ్', 'ఒక', 'లైక్', 'ఏ', 'ఆ', 'ఉ', 'ప్చ్', '....', 'సొ', 'అదే', 'ఎ', 'హ'}
968 30 {'మరి', 'దిస్', 'మీకు', 'నొ', 'య', 'ఏంటంటే', ',', 'కెన్', 'నో', 'మీక', 'టే', 'సి', '..', '.', 'రైట్', 'దట్', 'సో', 'అసలు', 'సరే', 'అది', 'హ.', 'నేను', 'యు', 'ఐ', 'ఆ', 'చూసిగా', 'అం', 'మీన్', 'బట్', 'యా'}
62 4 {'హ్మ్మ్', 'ప్చ్', 'ఆహ్', 'ఉహ్'}
125 6 {'టే', 'మరి', 'సరే', 'అది', 'అం', 'అసలు'}


In [4]:
colnames=['Sentence #','Word', 'Tag']
train = pd.read_csv('train.tsv', delimiter='\t', encoding = "utf-8", quoting=csv.QUOTE_NONE, names=colnames, header=None)
dev = pd.read_csv('dev.tsv', delimiter='\t', encoding = "utf-8", quoting=csv.QUOTE_NONE, names=colnames, header=None)

In [5]:
train = train.fillna(method='ffill')
dev = dev.fillna(method='ffill')

In [6]:
frames = [train, dev]
df = pd.concat(frames)

In [7]:
df.groupby('Tag').size().reset_index(name='counts')

Unnamed: 0,Tag,counts
0,B-Alteration,1408
1,B-edit_R,314
2,B-false_R,345
3,B-repair_R,261
4,I-Alteration,180
5,I-edit_R,419
6,I-false_R,564
7,I-repair_R,859
8,O,52767


In [8]:
X = df.drop('Tag', axis=1)

In [9]:
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))

In [10]:
y = df.Tag.values

In [11]:
classes = np.unique(y)
classes = classes.tolist()
classes

['B-Alteration',
 'B-edit_R',
 'B-false_R',
 'B-repair_R',
 'I-Alteration',
 'I-edit_R',
 'I-false_R',
 'I-repair_R',
 'O']

In [12]:
X.shape, y.shape

((57117, 19026), (57117,))

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)

In [14]:
new_classes = classes.copy()
new_classes.remove('O')
new_classes

['B-Alteration',
 'B-edit_R',
 'B-false_R',
 'B-repair_R',
 'I-Alteration',
 'I-edit_R',
 'I-false_R',
 'I-repair_R']

In [15]:
nb = MultinomialNB(alpha=0.01)
nb.partial_fit(X_train, y_train, classes)

In [16]:
print(classification_report(y_pred=nb.predict(X_test), y_true=y_test, labels = new_classes))

              precision    recall  f1-score   support

B-Alteration       0.60      0.54      0.57       466
    B-edit_R       0.08      0.14      0.10       104
   B-false_R       0.00      0.00      0.00       127
  B-repair_R       0.00      0.00      0.00        91
I-Alteration       0.07      0.12      0.09        68
    I-edit_R       0.29      0.36      0.32       146
   I-false_R       0.09      0.16      0.12       170
  I-repair_R       0.48      0.55      0.52       285

   micro avg       0.29      0.35      0.32      1457
   macro avg       0.20      0.24      0.21      1457
weighted avg       0.34      0.35      0.34      1457



In [17]:
def find_repeated_sequences(s):
    match = re.findall(r'((\b.+?\b)(?:\s\2)+)', s)
    return [(m[1], int((len(m[0]) + 1) / (len(m[1]) + 1))) for m in match]

In [18]:
def read_test_file(file_path, new_filename):
    file_path = Path(file_path)
    new_file = open(new_filename, 'w')
    raw_text = file_path.read_text().strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = {}
    tag_docs = []
    ctr=1
    for doc in raw_docs:
        tokens = ''
        tags = []
        repeat = 0
        for line in doc.split('\n'):
            if len(line.strip()) == 0:
                continue
            tokens = tokens + line + ' '
            new_file.write("Sentence: "+str(ctr)+'\t'+line+'\n')
            if line == '.':
                
                #print(tokens)
                r = find_repeated_sequences(tokens)                
                tokens = ''
                token_docs["Sentence: "+str(ctr)] = r
                ctr=ctr+1
        new_file.write('\n')
        
    return token_docs
        

In [19]:
labs= read_test_file('/path/to/test/data/', 'test.tsv')

In [20]:
test = pd.read_csv('test.tsv', delimiter='\t', encoding = "utf-8", quoting=csv.QUOTE_NONE, names=['Sentence #','Word'], header=None)

In [21]:
test = test.fillna(method='ffill')

In [22]:
X_t = v.transform(test.to_dict('records'))

In [23]:
test.to_dict('records')

[{'Sentence #': 'Sentence: 1', 'Word': 'నీట్'},
 {'Sentence #': 'Sentence: 1', 'Word': 'జాతీయ'},
 {'Sentence #': 'Sentence: 1', 'Word': 'ర్యాంకుల్లో'},
 {'Sentence #': 'Sentence: 1', 'Word': 'తెలుగు'},
 {'Sentence #': 'Sentence: 1', 'Word': 'రాష్ట్రాల'},
 {'Sentence #': 'Sentence: 1', 'Word': 'విద్యార్థులు'},
 {'Sentence #': 'Sentence: 1', 'Word': 'సత్తా'},
 {'Sentence #': 'Sentence: 1', 'Word': 'చాటారు'},
 {'Sentence #': 'Sentence: 1', 'Word': ','},
 {'Sentence #': 'Sentence: 1', 'Word': 'తెలంగాణ'},
 {'Sentence #': 'Sentence: 1', 'Word': 'విద్యార్థిని'},
 {'Sentence #': 'Sentence: 1', 'Word': '""""'},
 {'Sentence #': 'Sentence: 1', 'Word': 'జి'},
 {'Sentence #': 'Sentence: 1', 'Word': 'మాధురి'},
 {'Sentence #': 'Sentence: 1', 'Word': 'రెడ్డి'},
 {'Sentence #': 'Sentence: 1', 'Word': '""""'},
 {'Sentence #': 'Sentence: 1', 'Word': 'ఆరు'},
 {'Sentence #': 'Sentence: 1', 'Word': 'వందల'},
 {'Sentence #': 'Sentence: 1', 'Word': 'తొంభై'},
 {'Sentence #': 'Sentence: 1', 'Word': 'ఐదు'},
 {'Se

In [26]:
result = nb.predict(X_t)
s = 'Sentence: 1'
done = []
new = open('result.tsv', 'w')
for i, record in enumerate(test.to_dict('records')):
    sent = record['Sentence #']
    word = record['Word']
    tag  = result[i]

    if word in filler_train:
        tag = 'B-filler_R'
    if word in pet_train:
        tag = 'B-pet_R'

        
    if word == '""':
        new.write('""'+'\n')
    else:
        new.write(word+'\t'+tag+'\n')
    #print(record['Word'], result[i])

In [25]:
len(test.to_dict('records'))

10239