In [1]:
##UTILITY FUNCTIONS

In [2]:
import numpy as np
import pandas as pd
import warnings
from typing import List, Tuple, Dict


def split_data(X, y, test_size=0.1, seed=None):
    # shuffle data
    np.random.seed(seed)
    perm = np.random.permutation(X.index)
    X = X.loc[perm]
    y = y.loc[perm]
    
    # split into training and test sets
    n_samples = X.shape[0]
    if isinstance(test_size, float):
        if test_size <= 0 or test_size >= 1:
            raise ValueError("The test size should fall in the range (0,1)")
        n_train = n_samples - round(test_size*n_samples)
    elif isinstance(test_size, int):
        n_train = n_samples - test_size
    else:
        raise ValueError("Improper type \'%s\' for test_size" % type(test_size))

    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    return X_train, X_test, y_train, y_test

def encode_one_hot(data): # note: pd.get_dummies(df) does the same
    # https://www.kite.com/python/answers/how-to-do-one-hot-encoding-with-numpy-in-python
    one_hot = np.zeros((data.size, data.max()+1))
    rows = np.arange(data.size)
    one_hot[rows, data] = 1
    return one_hot


def confusion_matrix(y_actual, y_pred):
    """ Returns a confusion matrix where the rows are the actual classes, and the columns are the predicted classes"""
    if y_actual.shape != y_pred.shape:
        raise ValueError ("input arrays must have the same shape, {}!={}".format(y_actual.shape, y_pred.shape))
    n = max(max(y_actual), max(y_pred)) + 1
    C = np.zeros((n, n), dtype=int)
    for label_actual in range(n):
        idxs_true = (y_actual == label_actual)
        for label_pred in range(n):
            C[label_actual, label_pred] = sum(y_pred[idxs_true] == label_pred)
    return C

def calc_f1_score(y_actual, y_pred) -> Tuple[float]:
    C = confusion_matrix(y_actual, y_pred)
    if C.shape[0] != 2:
        raise ValueError ("input arrays must only have binary values")
    recall    = C[1][1]/(C[1][0]+C[1][1]) #true positive/actual positive
    precision = C[1][1]/(C[0][1]+C[1][1]) #true positive/predicted positive
    if (recall == 0) or (precision == 0):
        f1 = 0
    else:
        f1 = 2 * recall*precision/(recall + precision) # = 2/((1/recall)+(1/precision))

    print("Recall: {:.4f}".format(recall))
    print("Precision: {:.4f}".format(precision))
    print("F1 Score: {:.4f}".format(f1))


In [3]:
import numpy as np

class LogisticRegression:
    def __init__(self, lr=0.01, num_iter=100000, fit_intercept=True):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept

    def __add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)

    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def __loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    def fit(self, X, y):
        if self.fit_intercept:
            X = self.__add_intercept(X)

        self.theta = np.zeros(X.shape[1])

        for i in range(self.num_iter):
            z = np.dot(X, self.theta)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            self.theta -= self.lr * gradient

    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.__add_intercept(X)

        return self.__sigmoid(np.dot(X, self.theta))

    def predict(self, X, threshold=0.5):
        return (self.predict_prob(X) >= threshold).astype(int)

    def score(self, X, y):
        y_pred = self.predict(X)
        return np.mean(y_pred == y)
      



In [4]:
#CHESS_PREDICTION

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [6]:
col=['White_King_file','White_King_rank','White_Rook_file','White_Rook_rank','Black_King_file','Black_King_rank','Result']
df=pd.read_csv('/content/krkopt.data',sep=',',names=col)
df.head()

Unnamed: 0,White_King_file,White_King_rank,White_Rook_file,White_Rook_rank,Black_King_file,Black_King_rank,Result
0,a,1,b,3,c,2,draw
1,a,1,c,1,c,2,draw
2,a,1,c,1,d,1,draw
3,a,1,c,1,d,2,draw
4,a,1,c,2,c,1,draw


In [7]:
df.shape

(28056, 7)

In [8]:
# TARGET DISTRIBUTION 
print(df['Result'].value_counts().sort_index) 

<bound method Series.sort_index of fourteen    4553
thirteen    4194
twelve      3597
eleven      2854
draw        2796
fifteen     2166
ten         1985
nine        1712
eight       1433
seven        683
six          592
five         471
sixteen      390
two          246
four         198
three         81
one           78
zero          27
Name: Result, dtype: int64>


In [9]:
df.isnull().sum()

White_King_file    0
White_King_rank    0
White_Rook_file    0
White_Rook_rank    0
Black_King_file    0
Black_King_rank    0
Result             0
dtype: int64

In [10]:
# Define a dictionary to map each unique category to a unique integer value
white_king_map = {val: idx for idx, val in enumerate(df['White_King_file'].unique())}
white_rook_map = {val: idx for idx, val in enumerate(df['White_Rook_file'].unique())}
black_king_map = {val: idx for idx, val in enumerate(df['Black_King_file'].unique())}
result_map = {val: idx for idx, val in enumerate(df['Result'].unique())}

# Use the map to replace each categorical value with its corresponding integer value
df['White_King_file'] = df['White_King_file'].map(white_king_map)
df['White_Rook_file'] = df['White_Rook_file'].map(white_rook_map)
df['Black_King_file'] = df['Black_King_file'].map(black_king_map)
df['Result'] = df['Result'].map(result_map)


In [11]:
# Separating dependent-independent variables
X = df.drop('Result',axis=1)
y = df['Result']

In [12]:
X_train, X_test, y_train, y_test = split_data(X, y, test_size=0.2)

In [13]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((22445, 6), (22445,), (5611, 6), (5611,))

In [14]:
# Instantiate and fit a logistic regression model
lr = LogisticRegression(lr=0.1, num_iter=5000, fit_intercept=True)
lr.fit(X_train, y_train)

# Evaluate the model on the test set
acc_test = lr.score(X_test, y_test)
acc_train = lr.score(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_pred = lr.predict(X_test)

In [15]:
print("Logistic Regression model")
print("train accuracy: %.2f%%" % (acc_train*100))
print("test accuracy:  %.2f%%" % (acc_test*100))

Logistic Regression model
train accuracy: 0.09%
test accuracy:  0.11%


In [17]:
  # Define grid search function
def grid_search(X, y, learning_rates, n_iters_values):
        best_acc = 0
        best_lr = None
        best_n_iters = None

        for lr in learning_rates:
            for n_iters in n_iters_values:
              lr = LogisticRegression(lr=0.5, num_iter=5000, fit_intercept=True)
              lr.fit(X, y)
              y_pred = lr.predict(X)
              acc = sum(y_pred == y) / len(y)
              if acc > best_acc:
                best_acc = acc
                best_lr = lr
                best_n_iters = n_iters

            return best_lr, best_n_iters, best_acc

In [18]:

# Define hyperparameter search space
learning_rates = [0.05, 0.5, 5]
n_iters_values = [100, 1000, 10000]

In [19]:
best_lr, best_n_iters, best_acc= grid_search(X, y, learning_rates, n_iters_values)
print("Best score:", best_acc)
print("Best learning_rate:", best_lr)
print("Best n_iters:", best_n_iters)

Best score: 0.000962360992301112
Best learning_rate: <__main__.LogisticRegression object at 0x7fdc41b364f0>
Best n_iters: 100
