## Improved

In [45]:
# Import statements
import numpy as np
import pandas as pd
import csv
import xml.etree.ElementTree as ET
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


### Read in Data

In [46]:
input_xml = '../Data/train.xml'
output_csv = 'orth_context.csv'

# Call the function
train_df = pd.read_csv('../Code/improved_train_orth_context.csv')
train_df.head(n=50)

test_df = pd.read_csv('../Code/improved_test_orth_context.csv')
test_df.head(n=50)

Unnamed: 0,Prev2,Prev1,Current,Next1,Next2,Class,Base
0,_,_,rozumiem,",",że,fin,rozumieć
1,_,rozumiem,",",że,olechowskiego,interp,","
2,rozumiem,",",że,olechowskiego,",",comp,że
3,",",że,olechowskiego,",",który,subst,olechowski
4,że,olechowskiego,",",który,był,interp,","
5,olechowskiego,",",który,był,wtedy,adj,który
6,",",który,był,wtedy,po,praet,być
7,który,był,wtedy,po,zgoła,adv,wtedy
8,był,wtedy,po,zgoła,innej,prep,po
9,wtedy,po,zgoła,innej,stronie,qub,zgoła


### Training Function

In [47]:
# Define a function to take as input training and testing vectors and labels
# Allow this to be extensible to let multiple classifiers be used here
def buildClassifiers(clf, X_train, X_test, y_train, y_test):

    # The fit function trains the model (clf)
    # X_train is a 2d array of the features: each row represents a datapoint, each column represents a feature
    # y_train is a 1d array of labels. The nth value of the array is the label for the nth row in X_train
    clf.fit(X_train, y_train)
    
    # generating predictions for unseen data
    y_pred = clf.predict(X_test)

    # calculate the precision, recall and f1 scores to evaluate the classifiers performance on the test data
    f1 = f1_score(y_test, y_pred, average="micro")
    precision = precision_score(y_test, y_pred, average="micro")
    recall = recall_score(y_test, y_pred, average="micro")

    return f1, precision, recall

In [48]:
y_train = train_df['Class']
X_train = train_df.drop(columns=['Class']).values

y_test = test_df['Class']
X_test = test_df.drop(columns=['Class']).values

### Convert to Numeric Value 

In [49]:
from sklearn import preprocessing
import numpy as np

# Ordinal encoders assign unique integers to each unique value in your dataset
# It is important to tell an encoder how to handle an unknown value, 
le = preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# learn to assign numeric codes for the unique values in X_vals
le.fit(X_train.ravel().reshape(-1, 1))

# convert the words in our features into numeric codes
X_train_enc = le.transform(X_train.ravel().reshape(-1, 1))
X_test_enc = le.transform(X_test.ravel().reshape(-1, 1))

# reshape X (our numerically coded features) so it is the original 2d shape that we had before
X_train_enc = X_train_enc.squeeze().reshape(X_train.shape[0], -1).astype(int)
X_test_enc = X_test_enc.squeeze().reshape(X_test.shape[0], -1).astype(int)



  X_train_enc = X_train_enc.squeeze().reshape(X_train.shape[0], -1).astype(int)


In [50]:
print(X_train[0])
print(X_train_enc[0])
print(X_train_enc.shape)

['_' '_' 'zabiję' 'cię' ',' 'zabić']
[  1734   1734 101205  10906     22 101213]
(728070, 6)


### Run Classifiers

In [51]:
# Construct the classifiers at hand prior to folding the data through them
names = ['Naive_Bayes', 'Decision_Tree']
classifiers = [GaussianNB(), 
               DecisionTreeClassifier(random_state=0)]

# Initialize a list to store the results
results = []

# Try different classifiers
for name, clf in zip(names, classifiers):
    print(f'Now classifying {name}')
    f1, precision, recall = buildClassifiers(clf, X_train_enc, X_test_enc, y_train, y_test)
    
    # Store the results in the list
    results.append([name, f1, precision, recall])

# Convert the results into a pandas DataFrame for easy tabular display
results_df = pd.DataFrame(results, columns=["Classifier", "F1 Score", "Precision", "Recall"])

# Display the results as a table
results_df

Now classifying Naive_Bayes
Now classifying Decision_Tree


Unnamed: 0,Classifier,F1 Score,Precision,Recall
0,Naive_Bayes,0.380902,0.380902,0.380902
1,Decision_Tree,0.820714,0.820714,0.820714


### Classification Report

In [52]:
from sklearn.metrics import classification_report, confusion_matrix
y_pred = clf.predict(X_test_enc)
# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         adj       0.73      0.75      0.74     25402
        adja       0.19      0.26      0.22        89
        adjc       0.52      0.72      0.60        18
        adjp       0.50      0.68      0.58       156
         adv       0.77      0.77      0.77      8395
        aglt       0.98      0.97      0.98      1599
      bedzie       0.96      0.97      0.96       582
        brev       0.60      0.81      0.69      2123
        burk       0.70      0.67      0.68        21
        comp       0.93      0.94      0.94      3554
        conj       0.91      0.84      0.87      9167
        depr       0.12      0.15      0.14        27
         fin       0.79      0.77      0.78     12048
         ger       0.53      0.47      0.49      2357
        imps       0.32      0.29      0.31       399
        impt       0.32      0.44      0.37       491
         inf       0.64      0.57      0.61      3864
      interj       0.30    

## Feature Engineering


In [53]:
# Initialize a list to store results for each classifier
all_performance_diffs = []

# Loop through each classifier
for name, clf in zip(names, classifiers):
    
    # Store the baseline performance with all features for this classifier
    baseline_f1, baseline_precision, baseline_recall = buildClassifiers(clf, X_train_enc, X_test_enc, y_train, y_test)
    # Initialize a dictionary to hold performance differences for each feature
    performance_diff = { "Feature": [], "F1 Difference": [], "Precision Difference": [], "Recall Difference": [] }
    
    # Loop through each feature, remove it, and evaluate the model
    for i, feature in enumerate(feature_names):
        # Drop the ith feature from the training and testing sets
        if  i != 2: 
            X_train_loo = np.delete(X_train_enc, i, axis=1)
            X_test_loo = np.delete(X_test_enc, i, axis=1)
            
            # Re-evaluate the classifier without the ith feature
            f1, precision, recall = buildClassifiers(clf, X_train_loo, X_test_loo, y_train, y_test)

            # Calculate performance differences
            f1_diff = baseline_f1 - f1
            precision_diff = baseline_precision - precision
            recall_diff = baseline_recall - recall
            
            # Append the results
            performance_diff["Feature"].append(feature)
            performance_diff["F1 Difference"].append(f1_diff)
            performance_diff["Precision Difference"].append(precision_diff)
            performance_diff["Recall Difference"].append(recall_diff)
    
    # Convert to DataFrame and add classifier name for easy identification
    performance_diff_df = pd.DataFrame(performance_diff)
    performance_diff_df["Classifier"] = name
    all_performance_diffs.append(performance_diff_df)

# Concatenate results from all classifiers into a single DataFrame
final_performance_diff_df = pd.concat(all_performance_diffs, ignore_index=True)
final_performance_diff_df


Unnamed: 0,Feature,F1 Difference,Precision Difference,Recall Difference,Classifier
0,Prev2,0.001708,0.001708,0.001708,Naive_Bayes
1,Prev1,0.001894,0.001894,0.001894,Naive_Bayes
2,Next1,0.008151,0.008151,0.008151,Naive_Bayes
3,Next2,0.00042,0.00042,0.00042,Naive_Bayes
4,Prev2,-0.014408,-0.014408,-0.014408,Decision_Tree
5,Prev1,-0.026005,-0.026005,-0.026005,Decision_Tree
6,Next1,-0.055613,-0.055613,-0.055613,Decision_Tree
7,Next2,-0.012861,-0.012861,-0.012861,Decision_Tree
