In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("data/StudentsPerformance.csv")

df.head(11)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50


In [3]:
from sklearn.model_selection import train_test_split

# выделяем вектор признаков
X = df.drop(columns='gender')
# выделяем вектор целевой переменной
y = df['gender']

# Разделим данные на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
from sklearn.preprocessing import OrdinalEncoder

categorical_features = ['race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']

for column in categorical_features:
    label_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    
    # Обучаем на тренировочной подвыборке и сразу преобразовываем ее
    X_train[column] = label_encoder.fit_transform(X_train[column].values.reshape(-1, 1))
    
    # Преобразуем тестовую подвыборку с использованием того же обученного кодировщика
    X_test[column] = label_encoder.transform(X_test[column].values.reshape(-1, 1))

X_train

Unnamed: 0,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
29,3.0,3.0,1.0,1.0,62,70,75
535,2.0,1.0,0.0,0.0,66,83,83
695,3.0,4.0,0.0,1.0,79,89,86
557,2.0,3.0,0.0,1.0,61,67,66
836,4.0,2.0,1.0,1.0,73,64,57
...,...,...,...,...,...,...,...
106,3.0,3.0,1.0,1.0,87,100,100
270,2.0,1.0,1.0,1.0,69,63,61
860,2.0,0.0,1.0,1.0,53,62,53
435,2.0,4.0,0.0,0.0,50,48,53


In [5]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [6]:
model = LogisticRegression()

model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
y_pred = model.predict(X_test)

In [8]:
w0 = model.intercept_
w = model.coef_

In [9]:
# Получаем коэффициенты (для n признаков) и смещение
coefficients = model.coef_[0]  # Коэффициенты для признаков
intercept = model.intercept_[0]  # Смещение

# Записываем уравнение разделяющей гиперплоскости
equation = " + ".join([f"{round(coefficients[i], 4)}*x{i+1}" for i in range(len(coefficients))])
equation = f"({round(intercept, 4)}) + " + equation

print(f"Уравнение разделяющей гиперплоскости: {equation}")

Уравнение разделяющей гиперплоскости: (5.3265) + -0.2298*x1 + -0.0305*x2 + -0.7341*x3 + -1.939*x4 + 0.4108*x5 + -0.0277*x6 + -0.4171*x7


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

Accuracy: 0.915
Precision: 0.9215686274509803
Recall: 0.912621359223301
F1-score: 0.9170731707317074


In [11]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("TN:", tn)
print("FP:", fp)
print("FN:", fn)
print("TP:", tp)

TN: 89
FP: 8
FN: 9
TP: 94
