In [31]:
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
train_data = pd.read_csv('./train.csv')
train_ans = pd.read_csv('./train_answers.csv')

In [5]:
train_ans = train_ans.iloc[:,1]
y = np.repeat(train_ans.to_numpy(), 4)

First, we'll test out classic stochastic gradient descent classification on the first ~40 features. These features are **tortuousity**, which describes the "twistedness" of the handwriting. 

In [98]:
train_tort_data = train_data.iloc[:, 4:44]

x = train_tort_data.to_numpy()

In [99]:
print(np.shape(x))
print(np.shape(y))

print(f'\nThere are {(y==0).sum()} male writers.')
print(f'\nThere are {(y==1).sum()} male writers.')

(1128, 40)
(1128,)

There are 572 male writers.

There are 556 male writers.


In [103]:
model = SGDClassifier()
model.fit(X=x, y=y)

SGDClassifier()

In [104]:
answers = model.predict(X=x)

In [105]:
print((answers != y).sum())
print(answers[:32])
print(y[:32])

439
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1 1]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Here, we get 61% accuracy (439 misclassifications over 1128 samples). We also print the first 32 predicted answers, and their corresponding ground truths. This result is alright, but we decided to try to use other features to try and achieve a better result. 

Now, we attempted to use the next 855 features, describing the **curviness** of the handwriting. We wanted to test which descriptor of handwriting was accurate when it came to accurately classifying the gender of the author.

In [106]:
train_curve_data = train_data.iloc[:, 54:900]
x = train_curve_data.to_numpy()

In [107]:
print(np.shape(x))
print(np.shape(y))

(1128, 846)
(1128,)


In [108]:
model.fit(X=x, y=y)

SGDClassifier()

In [109]:
answers = model.predict(X=x)

In [110]:
print((answers != y).sum())
print(answers[:32])
print(y[:32])

499
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Here, we got very similar results to our attempt of SGD with tortuousity features. We wanted to give one other set of features a try before writing off SGD entirely.

These next ~4000 features are called "chaincode". We weren't sure what that meant, but we inferred that it referred to how letters connected to each other.

In [111]:
train_chain_data = train_data.iloc[:, 901:5020]
x = train_chain_data.to_numpy()

In [112]:
print(np.shape(x))
print(np.shape(y))

(1128, 4119)
(1128,)


In [113]:
model.fit(X=x, y=y)

SGDClassifier()

In [114]:
answers = model.predict(X=x)

In [70]:
print((answers != y).sum())
print(model.score(X=x, y=y))
print(answers[:32])
print(y[:32])

392
0.648936170212766
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Here, our results were much worse than the previous two feature sets, however, there was a lot of variance every time we ran classification, so we chalked it up to SGD being a poor fit for this use case.

Now, we tried to use **Random Forest** classification. We felt like this would be a good use case because of its similarity to traditional Decision Tree methods. In addition, to keep track of potential overfitting, we also implemented cross validation on the training set, using a 25/75 split.

In [6]:
train_chain_data = train_data.iloc[:, 901:5020]
x = train_chain_data.to_numpy()

In [7]:
model = RandomForestClassifier(n_estimators=250, min_samples_leaf=3)
model.fit(X=x, y=y)
answers = model.predict(X=x)
print((answers != y).sum())

0


Here, with 0 misclassifications, we must make sure that we are not overfitting with this method.

In [8]:
test_idx = np.arange(3, 1128, 4)
train_idx = np.delete(np.arange(1128), test_idx)

x_test = x[test_idx, :]
x_train = x[train_idx, :]
y_train = np.repeat(train_ans.to_numpy(), 3)
y_test = train_ans

In [9]:
model.fit(X=x_train, y=y_train)

RandomForestClassifier(min_samples_leaf=3, n_estimators=250)

In [11]:
print(model.score(X=x_train, y=y_train))
print((model.predict(X=x_train) != y_train).sum())
print('\n')
print(model.score(X=x_test, y=y_test))
print((model.predict(X=x_test) != y_test).sum())

1.0
0


0.7127659574468085
81


This disparity in accuracy between the training accuracy and test accuracy confirms that we are overfitting, however our base accuracy of 70% is a bit better than our earlier accuracies of around 65%.

We also thought that the **Logistic Regression** method could be a good fit for this dataset, due to the form of the features a range of numbers rather than specific categories. We implemented it using the SGDClassifier interface from sk-learn with the 'log' loss function.

In [95]:
train_chain_data = train_data.iloc[:, 4:]
x = train_chain_data.to_numpy()

In [101]:
model = SGDClassifier(penalty='elasticnet', l1_ratio=0.3, alpha=10**-7,
                      loss='log', learning_rate='adaptive', eta0=1.25, average=True)
model.fit(X=x, y=y)
answers = model.predict(X=x)
print((answers != y).sum())
print(model.score(X=x, y=y))

306
0.7287234042553191


In [97]:
test_idx = np.arange(3, 1128, 4)
train_idx = np.delete(np.arange(1128), test_idx)

x_test = x[test_idx, :]
x_train = x[train_idx, :]
y_train = np.repeat(train_ans.to_numpy(), 3)
y_test = train_ans

In [98]:
model.fit(X=x_train, y=y_train)

SGDClassifier(alpha=1e-07, average=True, eta0=1.25, l1_ratio=0.3,
              learning_rate='adaptive', loss='log', penalty='elasticnet')

In [99]:
print(model.score(X=x_train, y=y_train))
print((model.predict(X=x_train) != y_train).sum())
print('\n')
print(model.score(X=x_test, y=y_test))
print((model.predict(X=x_test) != y_test).sum())

0.7269503546099291
231


0.7127659574468085
81
