### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## preprocessing - NLP
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

## workflow
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

## models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

## metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

In [2]:
# Execute the shared helper notebook so its definitions are available in this kernel.
# NOTE(review): the trailing `import na_only` is handed to %run as plain arguments;
# confirm it really limits what gets loaded rather than being silently ignored.
%run 00_Workflow_Functions.ipynb import na_only

In [3]:
# Recover the train and test data: restore the `stored_data` variable that was
# persisted with %store (presumably by an earlier notebook in this workflow).
%store -r stored_data

In [4]:
# Pull the feature splits and their encoded labels out of the restored `stored_data`.
X_train, X_test = (stored_data[key] for key in ('X_train', 'X_test'))
y_train_encoded, y_test_encoded = (
    stored_data[key] for key in ('y_train_encoded', 'y_test_encoded')
)

### Vectorizing Data

For our first model iteration, we will do a simple vectorization of text data. We will then evaluate our model performance and further engineer our text features.

In [5]:
# Bag-of-words vectorizer with default settings (no custom tokenization or filtering yet).
cvec = CountVectorizer()

In [8]:
# Learn the vocabulary from the training text only, then reuse it for the test text
# so both frames end up with identical columns.
# `.toarray()` is the documented way to densify a SciPy sparse result; the `.A`
# shorthand is not available on newer SciPy sparse array types.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train).toarray(), columns=cvec.get_feature_names_out())
X_test_cvec = pd.DataFrame(cvec.transform(X_test).toarray(), columns=cvec.get_feature_names_out())
X_train_cvec.head(5)

Unnamed: 0,00,000,00001,026_007_plumb_repair,03,03am,04,054fd209,08,087,...,還是因為距離,還會陷入愛情嗎,那些懵懂而青澀的過往,那些沒有結果的感情,那個你本以為忘記的人,都成眷屬,都會實現,都會結果,都有可能改變你的生活,餘生
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Sanity check: both frames should report the same number of columns (the vocabulary size).
print(X_train_cvec.shape, X_test_cvec.shape)

(1857, 12373) (619, 12373)


Both train and test splits have been successfully vectorized. The vocabulary learned from the training text contains 12,373 unique tokens, each of which becomes one feature column.

### Baseline Model

Let's create a baseline model to compare our results.

In [12]:
# Baseline classifier with default settings; it serves as the score to beat.
dc = DummyClassifier()

In [14]:
# Fit the baseline on the vectorized training data and encoded labels.
dc.fit(X_train_cvec, y_train_encoded)

DummyClassifier()

In [19]:
# Baseline predictions on the test split, reused by the metric cells below.
base_preds = dc.predict(X_test_cvec)

In [23]:
# accuracy score: (train, test)
dc.score(X_train_cvec, y_train_encoded), dc.score(X_test_cvec, y_test_encoded)

(0.7587506731287023, 0.7592891760904685)

75.9% of our test predictions correctly identified whether a post belongs to `r/LifeProTips` or `r/lifehacks`.

In [24]:
# sensitivity: recall for the positive class (recall_score's default pos_label)
recall_score(y_test_encoded, base_preds)

0.0

The proportion of actual `r/lifehacks` posts that were correctly predicted as `r/lifehacks`. <br />
In this case we did not correctly predict any post as belonging to `r/lifehacks`.

In [21]:
# specificity: recall computed with label 0 treated as the class of interest
recall_score(y_test_encoded, base_preds, pos_label=0)

1.0

The proportion of actual `r/LifeProTips` posts that were correctly predicted as `r/LifeProTips`. <br />
In this case we correctly identified every `r/LifeProTips` post.

Given that our recall score is `0` and our specificity score is `1`, our baseline model simply predicted all posts in the dataset to belong to `r/LifeProTips`! That means we incorrectly predicted about `24%` of our data, hence our accuracy score of 0.759.

In [25]:
# precision
# The baseline never predicts the positive class, so precision is 0/0 here.
# zero_division=0 reports that case as 0.0 explicitly instead of emitting a warning.
precision_score(y_test_encoded, base_preds, zero_division=0)

  _warn_prf(average, modifier, msg_start, len(result))


0.0

The proportion of `r/lifehacks` predictions that were actually `r/lifehacks` posts. <br />
Since we did not predict any post to belong to `r/lifehacks`, precision is undefined (0 divided by 0), so scikit-learn reports it as `0.0`.

### Logistic Regression

Alias: Kronk

In [34]:
# Logistic regression with a raised iteration cap and a fixed seed for repeatability.
lr = LogisticRegression(max_iter=1_000, random_state=14)

In [35]:
# Fit on the same vectorized training data used for the baseline.
lr.fit(X_train_cvec, y_train_encoded)

LogisticRegression(max_iter=1000, random_state=14)

In [36]:
# accuracy score: train first, then test
print(lr.score(X_train_cvec, y_train_encoded), lr.score(X_test_cvec, y_test_encoded))

0.9983844911147012 0.7366720516962844


In [37]:
# Test-split predictions from the logistic regression, reused by the metric cells below.
kronk_preds = lr.predict(X_test_cvec)

In [38]:
# sensitivity: recall for the positive class (recall_score's default pos_label)
recall_score(y_test_encoded, kronk_preds)

0.35570469798657717

In [39]:
# specificity: recall computed with label 0 treated as the class of interest
recall_score(y_test_encoded, kronk_preds, pos_label=0)

0.8574468085106383

In [40]:
# precision
precision_score(y_test_encoded, kronk_preds)

0.44166666666666665

Verdict: Bad! Test accuracy (0.737) is worse than our baseline (0.759), and the near-perfect training accuracy (0.998) shows the model is heavily overfit.

### K-Nearest-Neighbors

Alias: Kuzko

In [None]:
# KNN classifier with default hyperparameters; n_jobs=-1 requests all available cores.
knn = KNeighborsClassifier(n_jobs=-1)

### Logistic Regression v2.0

Alias: Kuzko

In [41]:
# One-step pipeline that wraps the logistic regression under the step name 'lr'.
lr_pipe = Pipeline(steps=[('lr', LogisticRegression(max_iter=1_000))])

In [None]:
rs = Rand