###Imports and declarations

In [0]:
!pip install torch

Collecting torch
  Downloading torch-2.2.2-cp39-cp39-manylinux1_x86_64.whl (755.5 MB)
[?25l[K     |                                | 10 kB 27.0 MB/s eta 0:00:29[K     |                                | 20 kB 35.7 MB/s eta 0:00:22[K     |                                | 30 kB 46.3 MB/s eta 0:00:17[K     |                                | 40 kB 28.9 MB/s eta 0:00:27[K     |                                | 51 kB 21.8 MB/s eta 0:00:35[K     |                                | 61 kB 24.9 MB/s eta 0:00:31[K     |                                | 71 kB 25.2 MB/s eta 0:00:30[K     |                                | 81 kB 27.5 MB/s eta 0:00:28[K     |                                | 92 kB 30.0 MB/s eta 0:00:26[K     |                                | 102 kB 28.5 MB/s eta 0:00:27[K     |                                | 112 kB 28.5 MB/s eta 0:00:27[K     |                                | 122 kB 28.5 MB/s eta 0:00:27[K     |                                | 133 kB 

In [0]:
from pyspark.sql.types import *
import pyspark
import pandas as pd
import numpy as np
import pyspark.sql.functions as F
from datetime import datetime, timedelta
import time
import torch

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import (RegexTokenizer, StopWordsRemover, CountVectorizer, IDF, OneHotEncoder,
                                StringIndexer, VectorAssembler, MinMaxScaler, IndexToString)
# %pip install xgboost



In [0]:
pp_train_df = spark.read.parquet("/FileStore/g45/pp_train_df.parquet").drop('num_rows', 'mean_exp_months')
pp_validate_df = spark.read.parquet("/FileStore/g45/pp_validate_df.parquet").drop('num_rows', 'mean_exp_months')


###Preprocessing for NN

In [0]:
x0_x1_columns = [col for col in pp_train_df.columns if col.startswith('x0_') or col.startswith('x1_')]
x2_to_x6_columns = [col for col in pp_train_df.columns if col.startswith('x2_') or col.startswith('x3_') or col.startswith('x4_') or col.startswith('x5_') or col.startswith('x6_')]

feature_sets = [
    ['exp_months', 'total_exp', 'avg_exp', 'last_exp'] + x2_to_x6_columns,
    ['exp_months', 'rating', 'happiness', 'roles', 'company_count'] + x0_x1_columns,
    [col for col in pp_train_df.columns if col not in ['name', 'clean_company_name', 'exp_title', 'company_embeddings', 'title_embeddings']],  # Exclude specific columns
    [col for col in pp_train_df.columns if col not in ['name', 'clean_company_name', 'exp_title']] # Exclude 'name' only
]

feature_set_name = ["Only history", 'Only company reviews', 'All non textual features', 'All features']
feature_tensors = {i: {} for i in feature_set_name}
for i, feature_set in enumerate(feature_sets):
    train_feature_df = pp_train_df.select(*feature_set).toPandas()
    val_feature_df = pp_validate_df.select(*feature_set).toPandas()
    y_train = train_feature_df["exp_months"]
    y_val = val_feature_df["exp_months"]
    X_train = train_feature_df.drop("exp_months", axis=1)
    X_val = val_feature_df.drop("exp_months", axis=1)
    if i == 3:
        exp_title_length = X_train['title_embeddings'].apply(len).max()
        for j in range(exp_title_length):
            X_train[f'title_embeddings_{j}'] =  X_train['title_embeddings'].apply(lambda x: x[j] if i < len(x) else np.nan)
            X_val[f'title_embeddings_{j}'] =  X_val['title_embeddings'].apply(lambda x: x[j] if i < len(x) else np.nan)
        X_train.drop('title_embeddings', axis=1, inplace=True)
        X_val.drop('title_embeddings', axis=1, inplace=True)

        company_len = X_train['company_embeddings'].apply(len).max()
        for j in range(company_len):
            X_train[f'company_embeddings_{j}'] =  X_train['company_embeddings'].apply(lambda x: x[j] if j < len(x) else np.nan)
            X_val[f'company_embeddings_{j}'] =  X_val['company_embeddings'].apply(lambda x: x[j] if j < len(x) else 
                                                                                  np.nan)
        X_train.drop('company_embeddings', axis=1, inplace=True)
        X_val.drop('company_embeddings', axis=1, inplace=True)

        X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
        X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
        y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)

        feature_tensors[feature_set_name[i]]['X'] = [X_train_tensor, X_val_tensor]
        feature_tensors[feature_set_name[i]]['y'] = [y_train_tensor, y_val_tensor]
    else:
        X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
        X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
        y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
        feature_tensors[feature_set_name[i]]['X'] = [X_train_tensor, X_val_tensor]
        feature_tensors[feature_set_name[i]]['y'] = [y_train_tensor, y_val_tensor]
    print(f"Finished creating tensors for {feature_set_name[i]}")


Finished creating tensors for Only history
Finished creating tensors for Only company reviews
Finished creating tensors for All non textual features
  X_train[f'title_embeddings_{j}'] =  X_train['title_embeddings'].apply(lambda x: x[j] if i < len(x) else np.nan)
  X_val[f'title_embeddings_{j}'] =  X_val['title_embeddings'].apply(lambda x: x[j] if i < len(x) else np.nan)
  X_train[f'company_embeddings_{j}'] =  X_train['company_embeddings'].apply(lambda x: x[j] if j < len(x) else np.nan)
  X_val[f'company_embeddings_{j}'] =  X_val['company_embeddings'].apply(lambda x: x[j] if j < len(x) else
Finished creating tensors for All features


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class RegressionNN(nn.Module):
    def __init__(self, input_size, hidden_sizes, dropout_rate=0.2):
        super(RegressionNN, self).__init__()
        h1, h2, h3, h4 = hidden_sizes
        self.fc1 = nn.Linear(input_size, h1)
        self.bn1 = nn.BatchNorm1d(h1)
        self.dropout1 = nn.Dropout(dropout_rate)
        
        self.fc2 = nn.Linear(h1, h2)
        self.bn2 = nn.BatchNorm1d(h2)
        self.dropout2 = nn.Dropout(dropout_rate)
        
        self.fc3 = nn.Linear(h2, h3)
        self.bn3 = nn.BatchNorm1d(h3)
        self.dropout3 = nn.Dropout(dropout_rate)
        
        self.fc4 = nn.Linear(h3, h4)
        self.bn4 = nn.BatchNorm1d(h4)
        self.dropout4 = nn.Dropout(dropout_rate)
        
        self.fc5 = nn.Linear(h4, 1)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.dropout1(x)
        
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.dropout2(x)
        
        x = self.fc3(x)
        x = self.bn3(x)
        x = F.relu(x)
        x = self.dropout3(x)
        
        x = self.fc4(x)
        x = self.bn4(x)
        x = F.relu(x)
        x = self.dropout4(x)
        
        x = self.fc5(x)
        return x


###Hypertuning for different feature sets

In [0]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
hypertuning_hidden_size = [
    {'Only history': [128, 64, 32, 16], 'Only company reviews': [128, 64, 32, 16], 'All non textual features': [256, 128, 64, 16], 'All features': [512, 128, 64, 32]}, 
    {'Only history': [512, 128, 64, 32], 'Only company reviews': [512, 128, 64, 32], 'All non textual features': [512, 128, 64, 32], 'All features': [512, 128, 64, 32]},
    {'Only history': [1024, 512, 256, 128], 'Only company reviews': [1024, 512, 256, 128], 'All non textual features': [1024, 512, 256, 128], 'All features': [1024, 512, 256, 128]}]
num_epochs = 25 
feature_set_best_val_mse = {feature_set: float('inf') for feature_set in feature_set_name}  # Track best validation MSE
for feature_to_hidden in hypertuning_hidden_size:
    feature_set_losses, feature_set_val_mse  = {}, {}
    for feature_set in feature_set_name:
        X_train, X_val = feature_tensors[feature_set]['X']
        y_train, y_val = feature_tensors[feature_set]['y']
        
        y_val = y_val.view(-1, 1)

        input_size = X_train.shape[1]
        model = RegressionNN(input_size, feature_to_hidden[feature_set])
        criterion = nn.MSELoss()
        optimizer = torch.optim.SGD(params=model.parameters(), lr=0.002)

        # DataLoader
        train_dataset = TensorDataset(X_train, y_train)
        train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

        losses, val_mse = [], []
        # Training Loop
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            for inputs, targets in train_loader:
                optimizer.zero_grad()
                outputs = model(inputs)
                targets = targets.view(-1, 1)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            average_loss = total_loss / len(train_loader)
            losses.append(average_loss)
            # Evaluation
            model.eval()
            with torch.no_grad():
                predictions = model(X_val).view(-1, 1).squeeze()
                y_val_squeezed = y_val.squeeze()  # Ensure y_val is properly squeezed for MSE calculation
                mse = mean_squared_error(y_val_squeezed.numpy(), predictions.numpy())
                val_mse.append(mse)

            current_mse = min(val_mse)
            if current_mse < feature_set_best_val_mse[feature_set]:
                feature_set_best_val_mse[feature_set] = current_mse
                # Define model filename with feature set and hidden layer dimensions
                model_filename = f"model_{feature_set.replace(' ', '_')}_{'-'.join(map(str, feature_to_hidden[feature_set]))}.pth"
                # Save the model's state dictionary
                torch.save(model.state_dict(), model_filename)
                print(f"Saved new best model with MSE {current_mse:.4f} as {model_filename}")

        feature_set_losses[feature_set] = losses
        feature_set_val_mse[feature_set] = val_mse   
    print(feature_to_hidden)
    # Plotting the loss for each feature set
    plt.figure(figsize=(12, 8))
    for feature_set, losses in feature_set_losses.items():
        plt.plot(losses, label=f'Train Loss for {feature_set}')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss vs. Epoch for Each Feature Set')
    plt.legend()
    plt.show()

    # Plotting the MSE on the validation set for each feature set
    plt.figure(figsize=(12, 8))
    for feature_set, mse in feature_set_val_mse.items():
        plt.plot(mse, label=f'Validation MSE for {feature_set}')
    plt.xlabel('Epoch')
    plt.ylabel('MSE')
    plt.title('Validation MSE vs. Epoch for Each Feature Set')
    plt.legend()
    plt.show()

    for feature_set in feature_set_name:
        print(f"{feature_set} Training loss = {min(feature_set_losses[feature_set]) ** 0.5}       Validation loss = {min(feature_set_val_mse[feature_set]) ** 0.5}")
    print("=====================================================================================")

###Hypertuning for PCA

In [0]:
from sklearn.decomposition import PCA

feature_tensors_pca = {i: {} for i in feature_set_name}
for feature_set in feature_set_name:
    X_train, X_val = feature_tensors[feature_set]['X'] 
    pca = PCA(n_components=X_train.shape[1] // 2)
    X_train_pca = pca.fit_transform(X_train)
    X_val_pca = pca.transform(X_val)
    feature_tensors_pca[feature_set]['X'] = X_train_pca, X_val_pca
    feature_tensors_pca[feature_set]['y'] = feature_tensors[feature_set]['y']
    print(f"Performed PCA for {feature_set}")


Performed PCA for Only history
Performed PCA for Only company reviews
Performed PCA for All non textual features
Performed PCA for All features


In [0]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
# Assuming RegressionNN and other necessary imports are defined elsewhere

hypertuning_hidden_size = [
    {'Only history': [128, 64, 32, 16], 'Only company reviews': [128, 64, 32, 16], 'All non textual features': [256, 128, 64, 16], 'All features': [512, 128, 64, 32]}, 
    {'Only history': [512, 128, 64, 32], 'Only company reviews': [512, 128, 64, 32], 'All non textual features': [512, 128, 64, 32], 'All features': [512, 128, 64, 32]},
    {'Only history': [1024, 512, 256, 128], 'Only company reviews': [1024, 512, 256, 128], 'All non textual features': [1024, 512, 256, 128], 'All features': [1024, 512, 256, 128]}]
num_epochs = 25 

for feature_to_hidden in hypertuning_hidden_size:
    feature_set_losses, feature_set_val_mse  = {}, {}
    for feature_set in feature_set_name:
        X_train, X_val = feature_tensors_pca[feature_set]['X']
        y_train, y_val = feature_tensors_pca[feature_set]['y']
        
        y_val = y_val.view(-1, 1)

        input_size = X_train.shape[1]  # Adjust input size based on PCA
        model = RegressionNN(input_size, feature_to_hidden[feature_set])
        criterion = torch.nn.MSELoss()  # Ensure nn is prefixed if not imported directly
        optimizer = torch.optim.SGD(params=model.parameters(), lr=0.002)

        # DataLoader using PCA transformed data
        train_dataset = TensorDataset(torch.FloatTensor(X_train), y_train)
        train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

        losses, val_mse = [], []
        # Training Loop
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            for inputs, targets in train_loader:
                optimizer.zero_grad()
                outputs = model(inputs)
                targets = targets.view(-1, 1)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            average_loss = total_loss / len(train_loader)
            losses.append(average_loss)
            # Evaluation
            model.eval()
            with torch.no_grad():
                predictions = model(torch.FloatTensor(X_val)).view(-1, 1).squeeze()
                y_val_squeezed = y_val.squeeze()  # Ensure y_val is properly squeezed for MSE calculation
                mse = mean_squared_error(y_val_squeezed.numpy(), predictions.numpy())
                val_mse.append(mse)

        feature_set_losses[feature_set] = losses
        feature_set_val_mse[feature_set] = val_mse   
    print(feature_to_hidden)
    # Plotting the loss for each feature set
    plt.figure(figsize=(12, 8))
    for feature_set, losses in feature_set_losses.items():
        plt.plot(losses, label=f'Train Loss for {feature_set}')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss vs. Epoch for Each Feature Set')
    plt.legend()
    plt.show()

    # Plotting the MSE on the validation set for each feature set
    plt.figure(figsize=(12, 8))
    for feature_set, mse in feature_set_val_mse.items():
        plt.plot(mse, label=f'Validation MSE for {feature_set}')
    plt.xlabel('Epoch')
    plt.ylabel('MSE')
    plt.title('Validation MSE vs. Epoch for Each Feature Set')
    plt.legend()
    plt.show()

    for feature_set in feature_set_name:
        print(f"{feature_set} Training loss = {min(feature_set_losses[feature_set]) ** 0.5}       Validation loss = {min(feature_set_val_mse[feature_set]) ** 0.5}")
    print("=====================================================================================")