### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## preprocessing - NLP
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

## workflow
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

## models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

## metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

In [2]:
# Execute the shared helper notebook, presumably to make its workflow functions available here.
# NOTE(review): %run treats everything after the filename as script arguments, so
# `import na_only` does not restrict what gets loaded — confirm whether it is needed.
%run 00_Workflow_Functions.ipynb import na_only

In [3]:
# Restore the `stored_data` variable from IPython's %store storage;
# the train/test splits are read out of it in the next cell.
%store -r stored_data

In [4]:
# Pull the feature splits and the encoded target splits out of the restored
# `stored_data` mapping (keys mirror the variable names).
X_train, X_test = stored_data['X_train'], stored_data['X_test']
y_train_encoded, y_test_encoded = (
    stored_data['y_train_encoded'],
    stored_data['y_test_encoded'],
)

### Vectorizing Data

For our first model iteration, we will apply a simple vectorization to the text data using `CountVectorizer` (raw token counts). We will then evaluate model performance and engineer the text features further.

In [5]:
# Token-count vectorizer created with scikit-learn's default settings (no arguments passed).
cvec = CountVectorizer()

In [6]:
# Fit the vocabulary on the training text and preview the dense count matrix.
# `.toarray()` replaces the `.A` shorthand, which SciPy has deprecated for
# sparse matrices and which fails on newer SciPy releases.
cvec.fit_transform(X_train).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [8]:
# Fit the vectorizer on the training split only, then reuse the learned
# vocabulary to transform the test split so both frames get the same columns.
# `.toarray()` replaces the deprecated `.A` shorthand on SciPy sparse matrices.
X_train_cvec = pd.DataFrame(
    cvec.fit_transform(X_train).toarray(),
    columns=cvec.get_feature_names_out(),
)
X_test_cvec = pd.DataFrame(
    cvec.transform(X_test).toarray(),
    columns=cvec.get_feature_names_out(),
)
X_train_cvec.head(5)

Unnamed: 0,00,000,00001,026_007_plumb_repair,03,03am,04,054fd209,08,087,...,還是因為距離,還會陷入愛情嗎,那些懵懂而青澀的過往,那些沒有結果的感情,那個你本以為忘記的人,都成眷屬,都會實現,都會結果,都有可能改變你的生活,餘生
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Show (rows, columns) for the vectorized train and test frames side by side.
print(*(frame.shape for frame in (X_train_cvec, X_test_cvec)))

(1857, 12373) (619, 12373)


Both the train and test splits have been vectorized successfully and share the same 12,373-column vocabulary.