## Baseline

In [31]:
# Import statements
import numpy as np
import pandas as pd
import csv
import xml.etree.ElementTree as ET
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


### Read in Data

In [32]:
# Paths from the upstream extraction step. They are not read in this cell;
# kept because other cells may reference them.
input_xml = '../Data/train.xml'
output_csv = 'orth_context.csv'

# Load the pre-extracted context-window tables. Every column holds a token or a
# tag, so read everything as text and switch off pandas' default NA parsing:
# otherwise literal tokens such as "nan", "null" or "NA" become float NaN.
train_df = pd.read_csv('../Code/train_orth_context.csv', dtype=str, keep_default_na=False)
test_df = pd.read_csv('../Code/test_orth_context.csv', dtype=str, keep_default_na=False)

# Only the last expression of a cell is rendered automatically, so the training
# preview has to be displayed explicitly to show up next to the test preview.
display(train_df.head(n=50))
test_df.head(n=50)

Unnamed: 0,Prev3,Prev2,Prev1,Current,Next1,Next2,Next3,Class
0,_,_,_,rozumiem,",",że,olechowskiego,fin
1,_,_,rozumiem,",",że,olechowskiego,",",interp
2,_,rozumiem,",",że,olechowskiego,",",który,comp
3,rozumiem,",",że,olechowskiego,",",który,był,subst
4,",",że,olechowskiego,",",który,był,wtedy,interp
5,że,olechowskiego,",",który,był,wtedy,po,adj
6,olechowskiego,",",który,był,wtedy,po,zgoła,praet
7,",",który,był,wtedy,po,zgoła,innej,adv
8,który,był,wtedy,po,zgoła,innej,stronie,prep
9,był,wtedy,po,zgoła,innej,stronie,",",qub


### Training Function

In [33]:
def buildClassifiers(clf, X_train, X_test, y_train, y_test, average="micro"):
    """Fit ``clf`` on the training split and score its predictions on the test split.

    Any estimator exposing ``fit(X, y)`` and ``predict(X)`` can be passed, so the
    same routine serves every classifier being compared.

    Parameters
    ----------
    clf : estimator
        Model to train; it is fitted in place by this call.
    X_train, X_test : 2-D array-like
        Feature matrices: each row is a datapoint, each column a feature.
    y_train, y_test : 1-D array-like
        Labels; the nth value is the label for the nth row of the matching X.
    average : str or None, default="micro"
        Averaging mode forwarded to scikit-learn. With the default "micro" on
        single-label multiclass data, precision, recall and F1 all equal
        accuracy, so the three returned numbers are identical. Pass "macro" or
        "weighted" to obtain scores that can differ from each other.

    Returns
    -------
    tuple
        ``(f1, precision, recall)`` measured on the test split.
    """
    # Train the model, then predict labels for the unseen test rows.
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # One call yields all three metrics; scikit-learn returns them in the order
    # (precision, recall, f-score, support), hence the reordering on return.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average=average
    )

    return f1, precision, recall

In [34]:
# Separate the tag column ('Class') from the context-window columns.
# Features become NumPy arrays; labels stay as pandas Series.
X_train, y_train = train_df.drop('Class', axis=1).to_numpy(), train_df['Class']
X_test, y_test = test_df.drop('Class', axis=1).to_numpy(), test_df['Class']

### Convert to Numeric Values

In [35]:
from sklearn import preprocessing
import numpy as np


def _encode_windows(encoder, windows):
    """Translate a 2-D array of tokens into integer codes, keeping its row count."""
    # The encoder was fitted on a single column, so flatten to one token per row,
    # transform, then fold the codes back into the original number of rows.
    codes = encoder.transform(windows.reshape(-1, 1))
    return codes.reshape(windows.shape[0], -1).astype(int)


# An ordinal encoder gives every distinct value its own integer code.
# Values that were not present when fitting are mapped to -1 instead of raising.
le = preprocessing.OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')

# The vocabulary is learned from the training features only. All columns are
# pooled into one, so a given token gets the same code in every position.
le.fit(X_train.reshape(-1, 1))

# Numeric versions of the feature matrices, shaped like the originals.
X_train_enc = _encode_windows(le, X_train)
X_test_enc = _encode_windows(le, X_test)



In [36]:
# Sanity check: first raw training row, the same row after encoding,
# and the shape of the encoded training matrix (one item per output line).
print(X_train[0], X_train_enc[0], X_train_enc.shape, sep='\n')

['_' '_' '_' 'zabiję' 'cię' ',' 'jeśli']
[ 1600  1600  1600 85817  9188    19 23059]
(728070, 7)


### Run Classifiers

In [39]:
# Baseline models and their display names, aligned by position.
classifiers = [GaussianNB(),
               DecisionTreeClassifier(random_state=0)]
names = ['Naive_Bayes', 'Decision_Tree']

# Column headers for the summary table; each collected row follows this order.
result_columns = ["Classifier", "F1 Score", "Precision", "Recall"]
results = []

# Train and score each model in turn on the encoded features.
for idx, clf in enumerate(classifiers):
    name = names[idx]
    print(f'Now classifying {name}')

    f1, precision, recall = buildClassifiers(clf, X_train_enc, X_test_enc, y_train, y_test)
    results += [[name, f1, precision, recall]]

# Build the summary table; the bare name on the last line renders it.
results_df = pd.DataFrame(results, columns=result_columns)
results_df

Now classifying Naive_Bayes
Now classifying Decision_Tree


Unnamed: 0,Classifier,F1 Score,Precision,Recall
0,Naive_Bayes,0.358997,0.358997,0.358997
1,Decision_Tree,0.771593,0.771593,0.771593


### Per-Class Classification Report

In [38]:
from sklearn.metrics import classification_report, confusion_matrix

# Pick the model to report on explicitly instead of relying on whatever the
# training loop's variable happened to be bound to last. The final entry of
# `classifiers` was fitted by the training loop in the cell that defines it.
report_name, report_clf = names[-1], classifiers[-1]
print(f'Classification report for {report_name}')

# Per-class precision / recall / F1 on the encoded test features.
y_pred = report_clf.predict(X_test_enc)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         adj       0.70      0.65      0.67     25402
        adja       0.09      0.11      0.10        89
        adjc       0.36      0.50      0.42        18
        adjp       0.44      0.56      0.49       156
         adv       0.77      0.72      0.74      8395
        aglt       0.95      0.89      0.92      1599
      bedzie       0.97      0.94      0.95       582
        brev       0.79      0.78      0.79      2123
        burk       0.32      0.43      0.37        21
        comp       0.93      0.93      0.93      3554
        conj       0.90      0.80      0.85      9167
        depr       0.14      0.19      0.16        27
         fin       0.74      0.71      0.72     12048
         ger       0.43      0.35      0.39      2357
        imps       0.27      0.26      0.26       399
        impt       0.30      0.37      0.34       491
         inf       0.58      0.53      0.55      3864
      interj       0.28    