<table style="width:100%">
<tr>
<td style="vertical-align:middle; text-align:left;">
<font size="2">
Supplementary code for the <a href="http://mng.bz/orYv">Build a Large Language Model From Scratch</a> book by <a href="https://sebastianraschka.com">Sebastian Raschka</a><br>
<br>Code repository: <a href="https://github.com/rasbt/LLMs-from-scratch">https://github.com/rasbt/LLMs-from-scratch</a>
</font>
</td>
<td style="vertical-align:middle; text-align:left;">
<a href="http://mng.bz/orYv"><img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp" width="100px"></a>
</td>
</tr>
</table>

# Scikit-learn Logistic Regression Model
# Scikit-learn 逻辑回归模型

In [1]:
# Run the dataset preparation script via the shell. The cells below read
# train.csv, validation.csv, and test.csv -- presumably written by this script
# (TODO confirm against download-prepare-dataset.py).
!python download-prepare-dataset.py

100% | 80.23 MB | 4.37 MB/s | 18.38 sec elapsed

In [14]:
# pandas supplies the CSV reader and the DataFrame container
import pandas as pd

# Load the training, validation, and test splits (read in that order)
train_df, val_df, test_df = map(
    pd.read_csv, ("train.csv", "validation.csv", "test.csv")
)

In [16]:
# Preview the first five rows of the training set
train_df.head(n=5)

Unnamed: 0,text,label
0,"The only reason I saw ""Shakedown"" was that it ...",0
1,"This is absolute drivel, designed to shock and...",0
2,Lots of scenes and dialogue are flat-out goofy...,1
3,** and 1/2 stars out of **** Lifeforce is one ...,1
4,I learned a thing: you have to take this film ...,1


## Scikit-learn baseline
## Scikit-learn 基准模型

In [17]:
# 从sklearn导入CountVectorizer用于文本特征提取
from sklearn.feature_extraction.text import CountVectorizer
# 从sklearn导入LogisticRegression用于逻辑回归分类
from sklearn.linear_model import LogisticRegression
# 从sklearn导入accuracy_score用于计算准确率
from sklearn.metrics import accuracy_score

In [20]:
# Bag-of-words feature extractor (token counts)
vectorizer = CountVectorizer()

# Fit the vocabulary on the training text only and encode it in the same step
X_train = vectorizer.fit_transform(train_df["text"])

# Encode the validation and test text with the already-fitted vocabulary
X_val, X_test = (
    vectorizer.transform(split_df["text"]) for split_df in (val_df, test_df)
)

# Label columns for the three splits
y_train, y_val, y_test = (
    split_df["label"] for split_df in (train_df, val_df, test_df)
)

In [22]:
# Evaluation helper: reports plain accuracy on the training, validation, and test splits
def eval(model, X_train, y_train, X_val, y_val, X_test, y_test):
    """Print the accuracy of a fitted model on the train, validation, and test sets.

    NOTE: the name shadows the built-in ``eval``; it is kept unchanged so the
    existing calls in this notebook keep working.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_train, X_val, X_test: Inputs handed to ``model.predict`` for each split.
        y_train, y_val, y_test: True labels matching the respective inputs.

    Returns:
        None. One line per split is printed, with the accuracy formatted as a
        percentage with two decimals.
    """
    splits = (
        ("Training", X_train, y_train),
        ("Validation", X_val, y_val),
        ("Test", X_test, y_test),
    )

    # Score every split before printing anything, so a failure on a later
    # split does not leave a partial report behind.
    # The previous version also called balanced_accuracy_score here; that name
    # is never imported in this notebook (NameError on a fresh kernel) and its
    # results were never used, so those calls were removed.
    accuracies = [
        (split_name, accuracy_score(y_true, model.predict(X)))
        for split_name, X, y_true in splits
    ]

    for split_name, accuracy in accuracies:
        print(f"{split_name} Accuracy: {accuracy*100:.2f}%")

In [23]:
# DummyClassifier provides a trivial reference point for the real model
from sklearn.dummy import DummyClassifier

# Majority-class baseline: always predicts the most frequent training label.
# fit() returns the estimator itself, so construction and fitting are chained.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)

# Report the baseline's accuracy on the training, validation, and test splits
eval(
    dummy_clf,
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
)

Training Accuracy: 50.01%
Validation Accuracy: 50.14%
Test Accuracy: 49.91%


In [24]:
# Logistic regression on the bag-of-words features, with the solver's
# iteration cap set to 1000; fit() returns the fitted estimator
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Report accuracy on the training, validation, and test splits
eval(
    model,
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
)

Training Accuracy: 99.80%
Validation Accuracy: 88.62%
Test Accuracy: 88.85%
