In [8]:
import pandas as pd
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from scipy.sparse import hstack


In [9]:
df = pd.read_csv("../dataset/cooked/cleaned_data.csv")
df.head()


Unnamed: 0.1,Unnamed: 0,ageEstimate,companyFollowerCount,companyName,companyStaffCount,connectionsCount,country,endDate,followersCount,isPremium,mbrLocation,mbrLocationCode,mbrTitle,posLocation,posLocationCode,posTitle,startDate,avgMemberPosDuration,avgCompanyPosDuration
0,0,41.0,198859.0,commonwealth bank,32905.0,500.0,au,,506.0,0.0,"Sydney Area, Australia","urn:li:fs_region:(au,4910)",portfolio executive at commonwealth bank,,,portfolio executive,2014-07-01,760.5,989.9361
1,1,41.0,198859.0,commonwealth bank,32905.0,500.0,au,2014-06-01,506.0,0.0,"Sydney Area, Australia","urn:li:fs_region:(au,4910)",portfolio executive at commonwealth bank,"sydney, australia","urn:li:fs_region:(au,4910)",solution delivery executive,2013-11-01,760.5,989.9361
2,2,41.0,10047.0,commsec,619.0,500.0,au,2012-12-01,506.0,0.0,"Sydney Area, Australia","urn:li:fs_region:(au,4910)",portfolio executive at commonwealth bank,,,project manager,2008-08-01,760.5,747.2308
3,3,41.0,198859.0,commonwealth bank,32905.0,500.0,au,2008-07-01,506.0,0.0,"Sydney Area, Australia","urn:li:fs_region:(au,4910)",portfolio executive at commonwealth bank,,,project manager,2007-02-01,760.5,989.9361
4,4,30.0,300723.0,paypal,22522.0,500.0,au,,951.0,0.0,"Sydney Area, Australia","urn:li:fs_region:(au,4910)","senior marketing manager, paypal",,,senior marketing manager,2017-01-01,395.2857,683.3496


In [10]:
text_cols = ["mbrTitle", "posTitle", "companyName", "posLocation"]

df[text_cols] = df[text_cols].fillna("").astype(str)


In [11]:
df["combined_text"] = (
    df["mbrTitle"] + " " +
    df["posTitle"] + " " +
    df["companyName"] + " " +
    df["posLocation"]
)


In [12]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words="english"
)

tfidf_matrix = tfidf_vectorizer.fit_transform(df["combined_text"])


In [13]:
numeric_cols = [
    "ageEstimate",
    "connectionsCount",
    "followersCount",
    "companyFollowerCount",
    "companyStaffCount",
    "avgMemberPosDuration",
    "avgCompanyPosDuration"
]

categorical_cols = [
    "Unnamed: 0",
    "ageEstimate",
    "companyFollowerCount",
    "companyName",
    "companyStaffCount",
    "connectionsCount",
    "country",
    "endDate",
    "followersCount",
    "isPremium",
    "mbrLocation",
    "mbrLocationCode",
    "mbrTitle",
    "posLocation",
    "posLocationCode",
    "posTitle",
    "startDate",
    "avgMemberPosDuration",
    "avgCompanyPosDuration"
]

scaler = MinMaxScaler()
numeric_features = scaler.fit_transform(df[numeric_cols])



In [14]:
encoder = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=True
)

categorical_features = encoder.fit_transform(df[categorical_cols])


In [15]:
final_features = hstack([
    tfidf_matrix,
    numeric_features,
    categorical_features
])


In [16]:
pickle.dump(
    tfidf_vectorizer,
    open("../dataset/cooked/tfidf_vectorizer.pkl", "wb")
)

pickle.dump(
    final_features,
    open("../dataset/cooked/tfidf_matrix.pkl", "wb")
)



In [17]:
with open("../dataset/cooked/tfidf_matrix.pkl", "wb") as f:
    pickle.dump(final_features, f)


In [18]:
with open("../dataset/cooked/tfidf_matrix.pkl", "rb") as f:
    final_features = pickle.load(f)


In [19]:
type(final_features)


scipy.sparse._coo.coo_matrix