# Basic System

This notebook provides code for implementing a very simple machine learning system for named entity recognition.
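It uses logistic regression and four features per token: the token itself, its part-of-speech tag, whether it starts with a capital letter, and whether it is treated as the first word of a sentence.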
Links to information about the packages are provided. Your job is to document the code and use it to train a system. You can then use your evaluation code to provide the first basic evaluation of your system.
In the next assignment, you can use this as a basis to experiment with more features and more machine learning methods.

In [89]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
import sys
import re
import string

# If you want to include other modules, you can add them here
# Please note the recommendations on using modules in the Programming General Guidelines

#recommended resource for examples:

#https://scikit-learn.org/stable/modules/feature_extraction.html

In [75]:
def is_capital(word):
    """Return 1 if ``word`` begins with an ASCII uppercase letter (A-Z), otherwise 0."""
    leading_upper = re.match("^[A-Z]", word)
    if leading_upper:
        return 1
    return 0

In [167]:
def extract_features_and_labels(trainingfile):
    """Read a whitespace-separated training file and build features and gold labels.

    Every non-empty line is split on whitespace: the first column is used as the
    token, the second as its part-of-speech tag and the last as the gold label.
    Lines containing only whitespace are skipped.

    Features produced per token:
      - ``token``: the token string
      - ``pos``: the part-of-speech tag
      - ``is_first_word``: True when the previous token was '.' labelled 'O'
        (or the token is the first one in the file) and the token contains at
        least one character other than underscore/punctuation
      - ``is_capital``: 1 if the token starts with A-Z, else 0

    :param trainingfile: path to the UTF-8 encoded training file
    :return: tuple ``(data, targets)``; ``data`` is a list of feature dicts and
        ``targets`` the list of gold labels in the same order
    :raises ValueError: if a non-empty line has fewer than two columns
    """
    # Raw string on purpose: '\W' inside a plain literal is an invalid escape
    # sequence. Compiled once because the pattern does not change per token.
    # Matches tokens made up solely of underscores and non-word characters.
    punctuation_only = re.compile(r'^[_\W]*?$')

    data = []
    targets = []
    with open(trainingfile, 'r', encoding='utf8') as infile:
        # Starts as True so the very first token of the file can qualify as
        # a first word.
        is_prev_word_period = True
        for line in infile:
            components = line.rstrip('\n').split()
            if not components:
                continue

            token, pos = components[0:2]
            # gold is in the last column
            gold_label = components[-1]

            is_first_word = is_prev_word_period and not punctuation_only.search(token)
            feature_dict = {'token': token, 'pos': pos, 'is_first_word': is_first_word,
                            'is_capital': is_capital(token)}
            data.append(feature_dict)
            targets.append(gold_label)

            # A '.' token only counts as a sentence end when it is labelled 'O'.
            is_prev_word_period = token == '.' and gold_label == 'O'

    return data, targets

In [168]:
def extract_features(inputfile):
    """Read a whitespace-separated file and build one feature dict per token.

    Every non-empty line is split on whitespace: the first column is used as the
    token and the second as its part-of-speech tag. Lines containing only
    whitespace are skipped. The feature dicts carry the keys ``token``, ``pos``,
    ``is_first_word`` and ``is_capital``.

    :param inputfile: path to the UTF-8 encoded file to extract features from
    :return: list of feature dicts, one per non-empty line, in file order
    :raises ValueError: if a non-empty line has fewer than two columns
    """
    # Raw string on purpose: '\W' inside a plain literal is an invalid escape
    # sequence. Compiled once because the pattern does not change per token.
    # Matches tokens made up solely of underscores and non-word characters.
    punctuation_only = re.compile(r'^[_\W]*?$')

    data = []
    with open(inputfile, 'r', encoding='utf8') as infile:
        # Starts as True so the very first token of the file can qualify as
        # a first word.
        is_prev_word_period = True
        for line in infile:
            components = line.rstrip('\n').split()
            if not components:
                continue

            token, pos = components[0:2]
            is_first_word = is_prev_word_period and not punctuation_only.search(token)
            feature_dict = {'token': token, 'pos': pos, 'is_first_word': is_first_word,
                            'is_capital': is_capital(token)}
            data.append(feature_dict)

            # NOTE(review): the last column is consulted here. If the input file
            # carries gold labels in that column, this feature depends on them at
            # prediction time - confirm that this is intended.
            is_prev_word_period = token == '.' and components[-1] == 'O'

    return data

In [169]:
def create_classifier(train_features, train_targets, max_iter=1000):
    """Vectorize the feature dicts and fit a logistic regression model on them.

    :param train_features: list of feature dicts, one per training token
    :param train_targets: list of gold labels aligned with ``train_features``
    :param max_iter: iteration limit handed to ``LogisticRegression``. The
        library default stopped at the iteration limit on this data, so a
        larger default is used here; raise it further if the warning persists.
    :return: tuple ``(model, vec)`` with the fitted classifier and the fitted
        ``DictVectorizer`` that must be reused to transform unseen data
    """
    vec = DictVectorizer()
    features_vectorized = vec.fit_transform(train_features)

    # Quick sanity check of the design matrix: (number of tokens, number of
    # feature columns) followed by the generated column names.
    print(features_vectorized.shape)
    print(vec.get_feature_names_out())

    logreg = LogisticRegression(max_iter=max_iter)
    model = logreg.fit(features_vectorized, train_targets)

    return model, vec

In [170]:
def classify_data(model, vec, inputdata, outputfile):
    """Predict a label for every token in ``inputdata`` and write it to ``outputfile``.

    Each non-empty input line is copied to the output with a tab and the
    predicted label appended. Lines containing only whitespace are not written.

    :param model: fitted classifier exposing ``predict``
    :param vec: fitted vectorizer exposing ``transform``
    :param inputdata: path to the UTF-8 encoded file to label
    :param outputfile: path of the file to write (overwritten if it exists)
    """
    features = extract_features(inputdata)
    features_vectorized = vec.transform(features)
    predictions = model.predict(features_vectorized)

    # Both files are handled by context managers so they are closed even when
    # writing fails, and both use utf8 to match how extract_features reads
    # the same input.
    with open(inputdata, 'r', encoding='utf8') as infile:
        with open(outputfile, 'w', encoding='utf8') as outfile:
            counter = 0
            for line in infile:
                stripped = line.rstrip('\n')
                # Same non-empty test as extract_features, which keeps
                # predictions[counter] aligned with the current line.
                if len(stripped.split()) > 0:
                    outfile.write(stripped + '\t' + predictions[counter] + '\n')
                    counter += 1

In [171]:
def main(argv=None):
    """Train the classifier and write predictions for the input file.

    :param argv: list shaped like ``sys.argv``:
        ``[program_name, trainingfile, inputfile, outputfile]``.
        Falls back to ``sys.argv`` when omitted.
    :raises IndexError: if fewer than three paths follow the program name
    """
    # a very basic way for picking up commandline arguments
    if argv is None:
        argv = sys.argv

    # Note 1: argv[0] is the name of the python program if you run your program as:
    #         python program1.py arg1 arg2 arg3
    # Note 2: sys.argv is simple, but gets messy for anything beyond a few arguments;
    #         consider argparse once more options are needed.
    if len(argv) < 4:
        raise IndexError('expected arguments: <trainingfile> <inputfile> <outputfile>')
    trainingfile, inputfile, outputfile = argv[1:4]

    training_features, gold_labels = extract_features_and_labels(trainingfile)

    ml_model, vec = create_classifier(training_features, gold_labels)
    # This step was previously commented out, which left inputfile and
    # outputfile unused and meant no predictions were ever written.
    classify_data(ml_model, vec, inputfile, outputfile)

# uncomment this when using this in a script    
    
#if __name__ == '__main__':
#    main()

In [172]:
# main() reads the file paths from positions 1-3 of this list. Position 0 is
# never used: it only stands in for the program name that sys.argv would
# hold when the arguments come from a real command line.
args = [
    'python',
    "../../data/conll2003.train.conll",  # training file
    "../../data/conll2003.dev.conll",    # input file
    "prediction.txt",                    # output file
]
main(args)

(203621, 23670)
['is_capital' 'is_first_word' 'pos="' ... 'token=zlotys' 'token=zone'
 'token=zvezda']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
