Tải dữ liệu

In [None]:
  from google.colab import files

# Store the upload result under its own name. Rebinding `files` would shadow
# the google.colab `files` module imported above, so any later call such as
# files.download(...) would fail.
uploaded = files.upload()

Saving StudentScore.xls to StudentScore.xls


Kiểm tra dữ liệu

In [None]:
import pandas as pd

# Load the uploaded file; it is parsed with read_csv, i.e. as delimited text,
# regardless of its .xls extension.
df = pd.read_csv('StudentScore.xls')
print(df.head())

# Summarise column dtypes and non-null counts of the frame loaded above.
# (The original called .info() on `raw_data`, which is never defined in this
# notebook and raised NameError on a fresh kernel.)
df.info()


   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75  


Unnamed: 0,0
gender,2
race/ethnicity,5
parental level of education,6
lunch,2
test preparation course,2


PLA và Logistic Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the dataset (parsed as comma-separated text).
df = pd.read_csv("StudentScore.xls", delimiter=",")

# Classification target; every remaining column is used as a feature.
target='gender'
X = df.drop(target, axis=1)
y = df[target]

# Train/test split: 80% train, stratified on the target, fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42, stratify=y)

# Preprocessing
# Numeric scores: median imputation followed by standardisation.
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Education levels are listed explicitly so the ordinal codes follow this order.
education_values = ["some high school", "high school", "some college", "associate's degree", "bachelor's degree", "master's degree"]
# Categories for these columns come from the training split. dropna() keeps
# NaN out of the category lists, because missing cells are imputed to
# "unknown" before they reach the encoder.
lunch_values = x_train["lunch"].dropna().unique()
test_values = x_train["test preparation course"].dropna().unique()

# The imputer's "unknown" fill value is not part of the category lists, so the
# encoder must tolerate it; otherwise a single missing cell makes fit() raise.
# Unlisted values (including "unknown") are encoded as -1.
ord_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
    ("encoder", OrdinalEncoder(
        categories=[education_values, lunch_values, test_values],
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    ))
])

# handle_unknown="ignore": a category that appears only at transform time is
# encoded as an all-zero row instead of raising.
nom_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num_features", num_transformer, ["reading score", "writing score", "math score"]),
    ("ordinal_features", ord_transformer, ["parental level of education", "lunch", "test preparation course"]),
    ("nominal_features", nom_transformer, ["race/ethnicity"])
])

# Transform the data: fit on the training split only, then apply the fitted
# transformers to the test split.
x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)

def _print_weighted_scores(header, y_true, y_hat):
    """Print accuracy and weighted precision/recall/F1 beneath `header`."""
    print(header)
    print("Accuracy:", accuracy_score(y_true, y_hat))
    print("Precision:", precision_score(y_true, y_hat, average="weighted"))
    print("Recall:", recall_score(y_true, y_hat, average="weighted"))
    print("F1-score:", f1_score(y_true, y_hat, average="weighted"))


# Perceptron learning algorithm (PLA): fit on the transformed training data
# and score on the held-out split.
pla = Perceptron(max_iter=1000, random_state=42)
y_pred_pla = pla.fit(x_train, y_train).predict(x_test)
_print_weighted_scores("=== PLA  ===", y_test, y_pred_pla)


# Logistic regression, trained and evaluated on the same splits.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
y_pred_log = log_reg.fit(x_train, y_train).predict(x_test)
_print_weighted_scores("\n=== Logistic Regression ===", y_test, y_pred_log)



=== PLA  ===
Accuracy: 0.855
Precision: 0.8623840647370059
Recall: 0.855
F1-score: 0.8537326138177396

=== Logistic Regression ===
Accuracy: 0.895
Precision: 0.8950815734160744
Recall: 0.895
F1-score: 0.8950183975369828


Softmax Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the dataset (parsed as comma-separated text).
df = pd.read_csv("StudentScore.xls", delimiter=",")

# Multiclass target; every remaining column is used as a feature.
target='race/ethnicity'
X = df.drop(target, axis=1)
y = df[target]

# Train/test split: 80% train, stratified on the target, fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42, stratify=y)

# Preprocessing
# Numeric scores: median imputation followed by standardisation.
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Explicit category orders for the columns whose values are listed by hand.
education_values = ["some high school", "high school", "some college", "associate's degree", "bachelor's degree", "master's degree"]
gender_values = ["male", "female"]
# Categories for these columns come from the training split. dropna() keeps
# NaN out of the category lists, because missing cells are imputed to
# "unknown" before they reach the encoder.
lunch_values = x_train["lunch"].dropna().unique()
test_values = x_train["test preparation course"].dropna().unique()

# The imputer's "unknown" fill value is not part of the category lists, so the
# encoder must tolerate it; otherwise a single missing cell makes fit() raise.
# Unlisted values (including "unknown") are encoded as -1.
ord_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
    ("encoder", OrdinalEncoder(
        categories=[education_values, gender_values, lunch_values, test_values],
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    ))
])

preprocessor = ColumnTransformer(transformers=[
    ("num_features", num_transformer, ["math score","reading score", "writing score"]),
    ("ordinal_features", ord_transformer, ["parental level of education", "gender", "lunch", "test preparation course"]),
])
# Transform the data: fit on the training split only, then apply the fitted
# transformers to the test split.
x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)

# Softmax (multinomial logistic) regression.
# The `multi_class` argument is deprecated since scikit-learn 1.5 and is
# scheduled for removal. With the default lbfgs solver a multiclass target is
# already fit with the multinomial loss, so the argument is omitted.
softmax_reg = LogisticRegression(max_iter=1000, random_state=42)
softmax_reg.fit(x_train, y_train)

y_pred = softmax_reg.predict(x_test)


# Weighted averages account for class imbalance; zero_division=0 scores a
# class that is never predicted as 0 instead of emitting a warning.
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred, average="weighted", zero_division=0)}')
print(f'Precision: {precision_score(y_test, y_pred, average="weighted", zero_division=0)}')
print(f'F1: {f1_score(y_test, y_pred, average="weighted", zero_division=0)}')

Accuracy: 0.385
Recall: 0.385
Precision: 0.30274367293964954
F1: 0.32227845328075366


