In [None]:
# Load the preprocessed sample; the file uses ';' as its field separator.
import pandas as pd

df = pd.read_csv(
    "StackSample1_StemLemm.csv",
    sep=";",
)
# Preview the first rows as a quick sanity check of the load.
df.head()

In [None]:
# convert document-word count to Term Frequency - Inverse Document Frequency format

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy


def _fit_tfidf(documents, max_features):
    """Fit a TF-IDF vectorizer on one text column.

    Parameters
    ----------
    documents : pandas.Series
        Text column to vectorize. Missing values are replaced with empty
        strings before fitting: an empty text cell is read back from CSV as
        NaN, and TfidfVectorizer refuses NaN documents.
    max_features : int
        Upper bound on the vocabulary size kept by the vectorizer.

    Returns
    -------
    tuple
        ``(fitted_vectorizer, tfidf_matrix)`` where the matrix is the sparse
        float32 result of ``fit_transform`` with one row per document.
    """
    vectorizer = TfidfVectorizer(
        encoding='utf-8-sig',
        dtype=numpy.float32,
        max_features=max_features,  # limits the number of features
    )
    return vectorizer, vectorizer.fit_transform(documents.fillna(""))


# Titles and bodies get separate vocabularies (1000 and 10000 terms at most).
title_vectorizer, x_title = _fit_tfidf(df["Lemmatized Title"], max_features=1000)
body_vectorizer, x_body = _fit_tfidf(df["Lemmatized Body"], max_features=10000)

In [None]:
# double weightage for title words
# The weighted matrix gets its own name: rebinding x_title to x_title * 2 would
# compound the factor (x4, x8, ...) every time this cell is re-run.
# NOTE(review): a later per-feature StandardScaler divides each column by its
# own standard deviation, which cancels a constant column factor — confirm the
# weight is still effective after scaling.
TITLE_WEIGHT = 2
x_title_weighted = x_title * TITLE_WEIGHT

# merge columns: title features first, then body features (rows stay one per document)
from scipy.sparse import hstack

X = hstack([x_title_weighted, x_body])
y = df["Tags"]

In [None]:
# Hold out 20% of the samples for testing. Shuffling uses a fixed seed so the
# same partition is produced on every run.
from sklearn.model_selection import train_test_split

split_options = dict(test_size=0.2, shuffle=True, random_state=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# feature scaling
from sklearn.preprocessing import StandardScaler

# with_mean=False: only the per-feature variance is normalised, no centring
# (X is built with scipy.sparse.hstack, so the inputs are sparse matrices).
# The scaler is fitted on the training split only and then reused unchanged
# for the test split.
# NOTE(review): dividing every column by its own standard deviation cancels any
# constant per-column factor applied beforehand (such as the x2 title
# weighting) — confirm whether that weight is meant to survive this step.
sc = StandardScaler(with_mean=False).fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
import os

# Output folder inside the user's Google Drive directory.
# - Built with os.path.join instead of a "\G..."/"\C..." literal: those are
#   invalid escape sequences in a normal string (warning today, error in
#   future Python versions).
# - USERPROFILE only exists on Windows; fall back to the home directory so the
#   cell does not raise KeyError elsewhere.
# DATA_DIR stays a plain str, so f-string path building keeps working.
_home_dir = os.environ.get("USERPROFILE") or os.path.expanduser("~")
DATA_DIR = os.path.join(_home_dir, "Google Drive", "CD_Project_Colab")

In [None]:
# Persist the four train/test splits as joblib pickle files under DATA_DIR.
import joblib

# (object, destination path) pairs, written in the order listed.
artifacts = [
    (X_train, f"{DATA_DIR}/x_train_1.pkl"),
    (X_test, f"{DATA_DIR}/x_test_1.pkl"),
    (y_train, f"{DATA_DIR}/y_train_1.pkl"),
    (y_test, f"{DATA_DIR}/y_test_1.pkl"),
]
for payload, destination in artifacts:
    joblib.dump(payload, destination)