# Linear Models: LogisticRegression, SGDClassifier, LinearSVC  

In [1]:
# increase the width of the notebook
from IPython.display import display, HTML, Markdown

display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score

## Separate features and target

In [3]:
# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

y_train = train["Score"]
y_test = test["Score"]

X_train = train.drop("Score", axis=1)
X_test = test.drop("Score", axis=1)

## Transformations

In [4]:
#Preprocessing pipelines
numeric_features = ["WhiteElo", "EloDif"]
categorical_features = ["Opening_name", "Time_format", "Increment_binary"]

numeric_transformer = Pipeline([
    ("scaler", StandardScaler())
])
categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

In [5]:
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed  = preprocessor.transform(X_test)

##  Logistic Regression and GridSearch

In [6]:
lr = LogisticRegression ( solver="lbfgs",multi_class="multinomial", max_iter=1000, random_state=42)

In [7]:
param_lr = {"C": [0.01, 0.1, 1, 10]}
grid_search_lr = GridSearchCV(lr, param_lr, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_lr.fit(X_train_transformed, y_train)

GridSearchCV(cv=3,
             estimator=LogisticRegression(max_iter=1000,
                                          multi_class='multinomial',
                                          random_state=42),
             n_jobs=-1, param_grid={'C': [0.01, 0.1, 1, 10]},
             scoring='accuracy')

In [8]:
print("Best params:", grid_search_lr.best_params_)
best_lr = grid_search_lr.best_estimator_

Best params: {'C': 10}


In [9]:
y_pred = best_lr.predict(X_test_transformed)
print("Test accuracy: {:.2f}%".format(100 * accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))

Test accuracy: 53.77%
              precision    recall  f1-score   support

   Black Win       0.53      0.44      0.48      4524
        Draw       0.00      0.00      0.00       566
   White Win       0.54      0.69      0.61      4910

    accuracy                           0.54     10000
   macro avg       0.36      0.38      0.36     10000
weighted avg       0.51      0.54      0.52     10000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### The logistic regression baseline is doing a bit better than a completely naïve guesser, but it still struggles badly with the minority “draw” class.

In [10]:
import joblib
joblib.dump(best_lr, "best_logistic_regression_model.joblib")

['best_logistic_regression_model.joblib']

## SGDClassifier and GridSearch

In [11]:
sgd = SGDClassifier(random_state=42)

In [12]:
param_sgd = {
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1],              
    'max_iter': [1000, 2000],
}

In [13]:
grid_sgd = GridSearchCV(
    sgd,
    param_sgd,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

In [14]:
grid_sgd.fit(X_train_transformed, y_train)

GridSearchCV(cv=3, estimator=SGDClassifier(random_state=42), n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1],
                         'max_iter': [1000, 2000]},
             scoring='accuracy')

In [15]:
print("Best params:", grid_sgd.best_params_)
best_sgd = grid_sgd.best_estimator_

Best params: {'alpha': 0.1, 'max_iter': 1000}


In [16]:
y_pred = best_sgd.predict(X_test_transformed)
print(f"Test accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Test accuracy: 0.54

Classification Report:
               precision    recall  f1-score   support

   Black Win       0.55      0.32      0.40      4524
        Draw       0.00      0.00      0.00       566
   White Win       0.53      0.80      0.64      4910

    accuracy                           0.54     10000
   macro avg       0.36      0.37      0.35     10000
weighted avg       0.51      0.54      0.50     10000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Our SGDClassifier is doing a bit better than pure chance, but still only modestly above a trivial baseline

In [17]:
joblib.dump(best_sgd, "best_sgd.joblib")

['best_sgd.joblib']

## LinearSVC and GridSearch

In [18]:
svc = LinearSVC(random_state=42, dual=False)

In [19]:
param_svc = {
    'C': [0.1, 1, 10],
    'max_iter': [1000, 2500, 5000, 10000]
}

In [20]:
grid_svc = GridSearchCV(
    svc,
    param_svc,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

In [21]:
grid_svc.fit(X_train_transformed, y_train)

GridSearchCV(cv=3, estimator=LinearSVC(dual=False, random_state=42), n_jobs=-1,
             param_grid={'C': [0.1, 1, 10],
                         'max_iter': [1000, 2500, 5000, 10000]},
             scoring='accuracy')

In [22]:
print("Best params:", grid_svc.best_params_)
best_svc = grid_svc.best_estimator_

Best params: {'C': 0.1, 'max_iter': 1000}


In [23]:
y_pred = best_svc.predict(X_test_transformed)
print(f"Test accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Test accuracy: 0.54

Classification Report:
               precision    recall  f1-score   support

   Black Win       0.53      0.43      0.48      4524
        Draw       0.00      0.00      0.00       566
   White Win       0.54      0.70      0.61      4910

    accuracy                           0.54     10000
   macro avg       0.36      0.38      0.36     10000
weighted avg       0.51      0.54      0.51     10000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
joblib.dump(best_svc, "best_linear_svc_model.joblib")

['best_linear_svc_model.joblib']

### LinearSVC has a similar poor performance as our previous models. All models score around 54%.