# Summary

## Preparation

In [None]:
# import statements

import os

import matplotlib.pyplot as plt
import pandas as pd
import sklearn.metrics as metrics
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from functions.basic import read_file
from functions.summary import apply_scaling

In [None]:
# datasets
#
# The paths are assembled with os.path.join instead of hard-coded "\\"
# separators: on Windows this yields exactly the same strings as before,
# while on Linux/macOS the files are now found as well (a literal backslash
# is not a path separator there).
# NOTE(review): the files are .pkl; if read_file unpickles them, only load
# files from a trusted source -- confirm against functions.basic.read_file.

DATA_DIR = os.path.join("data", "3_sentiment_data")

j_all = read_file(os.path.join(DATA_DIR, "j_all_3.pkl"))
k_all = read_file(os.path.join(DATA_DIR, "k_all_3.pkl"))

## Dataset Cleanup & Preparation

In [None]:
# Remove the columns that are not needed any further, then let the derived
# sentiment columns ("rel_sentiws", "germansentiment_mapped") take over the
# plain names "sentiws" and "germansentiment".

j_all_prep = (
    j_all
    .drop(columns=["cit_num", "sentiws", "germansentiment"])
    .rename(columns={"rel_sentiws": "sentiws", "germansentiment_mapped": "germansentiment"})
)

k_all_prep = (
    k_all
    .drop(columns=["cit_num", "sentiws", "germansentiment"])
    .rename(columns={"rel_sentiws": "sentiws", "germansentiment_mapped": "germansentiment"})
)

# Columns that must be left out of scaling.
not_scaled = ["startpos", "endpos", "text", "passage_type", "sentiws", "germansentiment"]

# J: one frame holding the untouched columns, one holding everything else.
j_all_not_scaled_data = [j_all_prep[col_name] for col_name in not_scaled]
j_all_not_scaled = pd.concat(j_all_not_scaled_data, axis=1, keys=not_scaled)
j_all_scaling = j_all_prep.drop(columns=not_scaled)

# K: same split as for J.
k_all_not_scaled_data = [k_all_prep[col_name] for col_name in not_scaled]
k_all_not_scaled = pd.concat(k_all_not_scaled_data, axis=1, keys=not_scaled)
k_all_scaling = k_all_prep.drop(columns=not_scaled)



In [None]:
# scale all columns in []_all_scaling to [0,1]
#
# The previous loop assigned the result of apply_scaling only to its own loop
# variable. If apply_scaling returns a new frame instead of modifying its
# argument in place (not visible from this notebook), that result was thrown
# away and j_all_scaling / k_all_scaling stayed unscaled. Binding the returned
# frame back to the notebook-level names is correct in both cases.


def _scale_all_columns(frame):
    """Run apply_scaling in "zero_pos" mode over every column of `frame`.

    Returns the frame handed back by the last apply_scaling call (or `frame`
    itself when it has no columns).
    """
    for col in frame.columns:
        frame = apply_scaling(frame, col, "zero_pos")
    return frame


j_all_scaling = _scale_all_columns(j_all_scaling)
k_all_scaling = _scale_all_columns(k_all_scaling)

In [None]:
# Put the scaled columns and the untouched columns side by side again
# (column-wise concat), once for J and once for K.

j_all_scaled, k_all_scaled = (
    pd.concat([scaled_part, untouched_part], axis=1)
    for scaled_part, untouched_part in (
        (j_all_scaling, j_all_not_scaled),
        (k_all_scaling, k_all_not_scaled),
    )
)

j_all_scaled

In [None]:
# save data sets
# NOTE: both exports below are currently commented out, so running this cell
# writes nothing. Uncomment them to dump the scaled J and K frames as JSON
# (orient="index") into the working directory.

#j_all_scaled.to_json("j_all_4.json", orient="index")
#k_all_scaled.to_json("k_all_4.json", orient="index")

In [None]:
# Stack the J and K datasets into one frame with a fresh 0..n-1 index.
# NOTE(review): this uses the *_prep frames, i.e. the data before scaling,
# not j_all_scaled / k_all_scaled -- confirm that this is intended.

all_prep = pd.concat([j_all_prep, k_all_prep], ignore_index=True)

# Predictors: everything except positional/text columns, the target column
# and the two count columns.
non_predictor_cols = ["startpos", "endpos", "text", "passage_type", "token_count", "frequency"]
all_pred = all_prep.drop(columns=non_predictor_cols)

# Target column.
passage_type = all_prep["passage_type"]

In [None]:
# Display the predictor frame (all_prep without the dropped columns) for inspection.
all_pred

## Decision Tree

In [None]:
# Build the feature matrix and target, then hold out 25 % of the rows as a
# test set. random_state is fixed so the split is reproducible.

X, y = all_pred.to_numpy(), passage_type

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Fit a decision tree on the training split.
# max_depth is kept small (3) to avoid overfitting; random_state makes the
# fitted tree reproducible.

clf = tree.DecisionTreeClassifier(
    max_depth=3,
    random_state=0,
)
clf.fit(X_train, y_train)

In [None]:
# https://towardsdatascience.com/visualizing-decision-trees-with-python-scikit-learn-graphviz-matplotlib-1c50b4aa68dc
# visualize tree

fn = all_pred.columns
cn = ["cited", "not_cited"]

fig, axes = plt.subplots(nrows=1, ncols=1)

# feature_names is handed over as a plain list: some scikit-learn releases
# validate this argument strictly and reject a pandas Index.
# class_names are expected in ascending order of the fitted class labels
# (see the plot_tree documentation) -- presumably "cited" < "not_cited" matches
# clf.classes_; verify if the labels ever change.
# The axes created above is passed explicitly so the tree is drawn on `fig`
# regardless of which axes happens to be current.
vis = tree.plot_tree(
    clf,
    feature_names=list(fn),
    class_names=cn,
    rounded=True,
    ax=axes,
)

#fig.savefig('tree.pdf', format='pdf')

In [None]:
# Score the decision tree on the held-out test split.

y_pred = clf.predict(X_test)

# Confusion matrix is computed but not shown; uncomment the print to see it.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
#print(cm)

print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred,
        labels=["cited", "not_cited"],
    )
)

## KNN

In [None]:
# k-nearest-neighbours classifier with n_neighbors=12, fitted on the training
# split (fit returns the estimator itself, so it can be chained).
knn = KNeighborsClassifier(n_neighbors=12).fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred_knn = knn.predict(X_test)

# Evaluate: the confusion matrix is computed but not shown (note that this
# overwrites the `cm` of the decision tree); the per-class report is printed.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_knn)
#print(cm)

print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred_knn,
        labels=["cited", "not_cited"],
    )
)