In [0]:
import pyspark
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, size, array_remove, expr, array_contains, array, lit, arrays_zip,  when, first, udf, concat_ws, to_json, flatten
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, ArrayType
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import BertEmbeddings, Tokenizer
from sparknlp.pretrained import PretrainedPipeline
from pyspark.ml import Pipeline
import time
from pyspark.ml.linalg import Vectors, VectorUDT
import numpy as np
import pandas as pd
import json
from pyspark.sql.window import Window
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import BooleanType
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

In [0]:
spark = SparkSession.builder.getOrCreate()

In [0]:
!pip install torch

You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-b4821e76-36f9-4357-b7d9-77295cabacbf/bin/python -m pip install --upgrade pip' command.[0m


Import and clean data

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
profiles = spark.read.parquet('/linkedin/people')

In [0]:
Israeli_Profiles = pd.read_csv("/dbfs/FileStore/shared_uploads/raph@campus.technion.ac.il/Israeli_Profiles-2.csv")
Israeli_Profiles = Israeli_Profiles.dropna(subset=['current_company_company_id'])

# Convert pandas DataFrame to Spark DataFrame
israeli_profiles = spark.createDataFrame(Israeli_Profiles)

deleted_columns = ['url', 'locale', 'name', 'avatar', 'followers', 'connections', 'posts',
                   'recommendations', 'canonical_url', 'publications', 'locations', 'country_code',
                   'input_url', 'warning', 'warning_code', 'error', 'current_company']

Israeli_Profiles_clean = israeli_profiles.drop(*deleted_columns)


def extract_info_udf(json_str, info_type):
    if json_str is None:
        return None

    extracted_info = []
    try:
        data = json.loads(json_str)
        if isinstance(data, list):
            for item in data:
                if info_type == "projects":
                    title = item.get('title')
                    duration = item.get('duration')
                    description = item.get('description')
                    extracted_info.append(f"{title} ({duration}): {description}")
                elif info_type == "patents":
                    title = item.get('title')
                    extracted_info.append(f"{title}")
    except json.JSONDecodeError:
        pass
    
    return extracted_info

# Register the UDF
extract_info = F.udf(lambda json_str, info_type: extract_info_udf(json_str, info_type), returnType=ArrayType(StringType()))

# Apply the UDF to the DataFrame for projects
extracted_info_df = Israeli_Profiles_clean.withColumn(
    "project_info", extract_info("projects", F.lit("projects"))
)

# Apply the UDF to the DataFrame for patents
extracted_info_df = extracted_info_df.withColumn(
    "patent_info", extract_info("patents", F.lit("patents"))
)

# Add IDs to DataFrames
Israeli_Profiles_clean_with_id = Israeli_Profiles_clean.withColumn("id", monotonically_increasing_id())
final_info_df_with_id = extracted_info_df.withColumn("id", monotonically_increasing_id())

# Join DataFrames using the unique identifiers
joined_df = Israeli_Profiles_clean_with_id.join(final_info_df_with_id.select("id", "project_info", "patent_info"), "id", "left_outer")

# Drop unnecessary columns
final_israeli_joined_df = joined_df.drop("id", "projects", "patents").filter(col("current_company_company_id").isNotNull())



In [0]:
final_info_df.display()

project_info,patent_info
,
,
,
,
,
,
,
,
,
,


In [0]:
cleanProfiles = profiles.select("about", 
                        col("certifications.title").alias("certifications_title"), 
                        "city", 
                        col("current_company.industry").alias("current_company_industry"), 
                        col("current_company.title").alias("current_company_title"), 
                        col("current_company.company_id").alias("current_company_id"), 
                        col("current_company.name").alias("current_company_name"), 
                        "education.degree", 
                        "education.field", 
                        col("experience.company").alias("experience_company"), 
                        col("experience.description").alias("experience_description"), 
                        col("experience.duration").alias("experience_duration"), 
                        col("experience.positions.title").alias("positions_title"), 
                        col("experience.title").alias("experience_title"), 
                        "id", 
                        col("languages.subtitle").alias("languages_level"), 
                        col("languages.title").alias("languages"), 
                        "position", 
                        "recommendations_count")

Now, we need to remove all the rows in cleanProfiles that dont have lables

In [0]:
def contains_only_null(arr):
    if arr is None:
        return False
    for val in arr:
        if val is not None:
            return False
    return True

# Register UDF
contains_only_null_udf = udf(contains_only_null, BooleanType())

# Filtering condition
filtered_df = cleanProfiles.filter(
    (col("current_company_id").isNotNull()) |
    (~contains_only_null_udf("experience_company"))
)
remove_nulls = filtered_df.withColumn("experience_company", expr("filter(experience_company, x -> x is not null)"))
filteredProfiles = remove_nulls.filter(size(col("experience_company")) != 0)

Now we need to add a label column. a label is:

the current company
  if the profile doesnt have a current company, themn it is its previues company ( data is cleaned in a way so at least one is guranteed ).

In [0]:
@udf(StringType())
def combine_columns(current_company_id, experience_company):
    if current_company_id is not None:
        return current_company_id
    elif experience_company is not None and len(experience_company) > 0:
        return experience_company[0]
    else:
        return None

# Add new column using UDF
readyProfiles = filteredProfiles.withColumn("label", combine_columns(filteredProfiles.current_company_id, filteredProfiles.experience_company))

Beginning EVA

In [0]:
df = readyProfiles

In [0]:
df = df.withColumn("certifications_title", concat_ws(";", col("certifications_title"))) \
                   .withColumn("degree", concat_ws(";", col("degree"))) \
                   .withColumn("field", concat_ws(";", col("field"))) \
                   .withColumn("experience_company", concat_ws(";", col("experience_company"))) \
                   .withColumn("experience_description", concat_ws(";", col("experience_description"))) \
                   .withColumn("experience_duration", concat_ws(";", col("experience_duration"))) \
                   .withColumn("positions_title", concat_ws(";", flatten(col("positions_title")))) \
                   .withColumn("experience_title", concat_ws(";", col("experience_title"))) \
                   .withColumn("languages_level", concat_ws(";", col("languages_level"))) \
                   .withColumn("languages", concat_ws(";", col("languages")))

# Converting array columns to string columns separated by ';'
final_israeli_joined_df = final_israeli_joined_df.withColumn("region", concat_ws(";", col("region"))) \
                   .withColumn("position", concat_ws(";", col("position"))) \
                   .withColumn("educations_details", concat_ws(";", col("educations_details"))) \
                   .withColumn("recommendations_count", concat_ws(";", col("recommendations_count"))) \
                   .withColumn("current_company_name", concat_ws(";", col("current_company_name"))) \
                   .withColumn("current_company_company_id", concat_ws(";", col("current_company_company_id"))) \
                   .withColumn("project_info", concat_ws(";", col("project_info"))) \
                   .withColumn("patent_info", concat_ws(";", col("patent_info")))
final_israeli_joined_df = final_israeli_joined_df.withColumn('label', concat_udf('current_company_company_id'))

In [0]:
def combine_features(*values):
    return ' ;'.join(map(str, values))

def mean_vector(embeddings):
    words_embeddings = [word.embeddings for word in embeddings]
    embeddings_array = np.array(words_embeddings)
    if embeddings_array.size == 0:
        return Vectors.dense([])
    mean_embedding = np.mean(embeddings_array, axis=0)
    return Vectors.dense(mean_embedding.tolist())

mean_vector_udf = udf(mean_vector, VectorUDT())
concat_udf = udf(combine_features, StringType())

columns_to_combine = [col for col in df.columns if col != "label"]
columns_to_combine_israeli = [col for col in final_israeli_joined_df.columns if col != "label"]

In [0]:
df = df.withColumn('concatenated_text', concat_udf(*columns_to_combine))
df = df.withColumn('concatenated_label', concat_udf('label'))

final_israeli_joined_df = final_israeli_joined_df.withColumn('concatenated_text', concat_udf(*columns_to_combine_israeli))
final_israeli_joined_df = final_israeli_joined_df.withColumn('concatenated_label', concat_udf('label'))

In [0]:
document_assembler = DocumentAssembler()\
    .setInputCol("concatenated_text")\
    .setOutputCol("document")

tokenizer = Tokenizer()\
    .setInputCols(["document"])\
    .setOutputCol("token")

bert_embeddings = BertEmbeddings.pretrained(name="small_bert_L2_128", lang="en")\
    .setInputCols(["document", "token"])\
    .setOutputCol("embeddings")\
    .setCaseSensitive(False)

small_bert_L2_128 download started this may take some time.
Approximate size to download 16.1 MB
[ | ][OK!]


In [0]:
pipeline = Pipeline().setStages([document_assembler, tokenizer, bert_embeddings])
pipeline_israeli = Pipeline().setStages([document_assembler, tokenizer, bert_embeddings])

fitted_pipeline_israeli = pipeline_israeli.fit(final_israeli_joined_df)
fitted_pipeline = pipeline.fit(df)

processed_features = fitted_pipeline.transform(df)
processed_features_israeli = fitted_pipeline_israeli.transform(final_israeli_joined_df)

final = processed_features.withColumn("mean_embeddings", mean_vector_udf("embeddings"))
final_israeli = processed_features_israeli.withColumn("mean_embeddings", mean_vector_udf("embeddings"))

In [0]:
readydf = final.select(col("concatenated_label").alias("label"), col("mean_embeddings").alias("features"))
readydf_israeli = final_israeli.select(col("concatenated_label").alias("label"), col("mean_embeddings").alias("features"))

In [0]:
readydf = readydf.withColumn('features', vector_to_array('features'))
readydf_israeli = readydf_israeli.withColumn('features', vector_to_array('features'))

In [0]:
windowSpec = Window.partitionBy("label")
df_with_count = readydf.withColumn("label_count", F.count("label").over(windowSpec))
df_with_count_israeli = readydf_israeli.withColumn("label_count", F.count("label").over(windowSpec))

# Now, filter the DataFrame to keep only rows with labels that occur more than once
filtered_df = df_with_count.filter(col("label_count") > 4).drop("label_count")
filtered_df_israeli = df_with_count_israeli.filter(col("label_count") > 4).drop("label_count")

In [0]:
pdf_israeli = filtered_df_israeli.toPandas()

In [0]:
pdf_israeli

Unnamed: 0,label,features
0,4m-analytics,"[-1.6014958847136724, -0.5573168041716728, -0...."
1,4m-analytics,"[-0.7555547985045806, 0.3190507866928111, -0.0..."
2,4m-analytics,"[-1.1667294127068348, -0.9156654881579536, -0...."
3,4m-analytics,"[-1.557504834799931, -0.0739575208582241, -0.4..."
4,4m-analytics,"[-1.1995340954210307, -0.38181648342996033, -0..."
...,...,...
1092,zone7ai,"[-1.1773088642142036, -0.6362856812775135, -0...."
1093,zone7ai,"[-1.5285623893141747, -0.039684190725286804, -..."
1094,zone7ai,"[-1.360643278219198, -1.0163621925993969, -0.7..."
1095,zone7ai,"[-1.7080026865005493, -0.3067014639576276, -0...."


In [0]:
pdf = filtered_df.toPandas()

Only the scraping data:

In [0]:
pandas_df_israeli = pdf_israeli.copy()

In [0]:
pandas_df = pdf.copy()

KNN

In [0]:
UniqueLabels = pandas_df_israeli['label'].unique()
label_to_idx = {label: idx for idx, label in enumerate(UniqueLabels)}
pandas_df_israeli['label'] = pandas_df_israeli['label'].map(label_to_idx)

In [0]:
# Splitting the data into features (X) and labels (y)
X = np.array(pandas_df_israeli['features'].tolist())
y = np.array(pandas_df_israeli['label'])

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [0]:

# Initializing the K-Nearest Neighbors classifier
knn_classifier = KNeighborsClassifier(n_neighbors=19, weights="distance")

# Fitting the classifier to the training data
knn_classifier.fit(X_train, y_train)

# Predicting labels for the test set
y_pred = knn_classifier.predict(X_test)

# Calculating evaluation metrics
accuracy, precision, recall, f1 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted'), recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')

# Printing evaluation metrics
print("Weights:", w, "| K:", k, "| Accuracy:", accuracy, "| Precision:", precision, "| Recall:", recall, "| F1-score:", f1)


Weights: distance | K: 5 | Accuracy: 0.55 | Precision: 0.6543939393939394 | Recall: 0.55 | F1-score: 0.5580952380952381
Weights: distance | K: 7 | Accuracy: 0.55 | Precision: 0.6678030303030302 | Recall: 0.55 | F1-score: 0.5627272727272727
Weights: distance | K: 9 | Accuracy: 0.5272727272727272 | Precision: 0.655530303030303 | Recall: 0.5272727272727272 | F1-score: 0.5417099567099567
Weights: distance | K: 11 | Accuracy: 0.5181818181818182 | Precision: 0.6442640692640693 | Recall: 0.5181818181818182 | F1-score: 0.5297979797979797
Weights: distance | K: 13 | Accuracy: 0.5363636363636364 | Precision: 0.645909090909091 | Recall: 0.5363636363636364 | F1-score: 0.5409848484848485
Weights: distance | K: 15 | Accuracy: 0.55 | Precision: 0.6678030303030303 | Recall: 0.55 | F1-score: 0.5568398268398268
Weights: distance | K: 17 | Accuracy: 0.5636363636363636 | Precision: 0.6521212121212121 | Recall: 0.5636363636363636 | F1-score: 0.5637012987012987
Weights: distance | K: 19 | Accuracy: 0.559090

Without Scraping

KNN

In [0]:
pandas_df = pdf.copy()

In [0]:
UniqueLabels = pandas_df['label'].unique()
label_to_idx = {label: idx for idx, label in enumerate(UniqueLabels)}
pandas_df['label'] = pandas_df['label'].map(label_to_idx)

In [0]:
# Splitting the data into features (X) and labels (y)
X = np.array(pandas_df['features'].tolist())
y = np.array(pandas_df['label'])

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [0]:
# Standardizing features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [0]:
# Initializing the K-Nearest Neighbors classifier
knn_classifier = KNeighborsClassifier(n_neighbors=19, weights="distance")

# Fitting the classifier to the training data
knn_classifier.fit(X_train, y_train)

# Predicting labels for the test set
y_pred = knn_classifier.predict(X_test)

# Calculating evaluation metrics
accuracy, precision, recall, f1 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted'), recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')

# Printing evaluation metrics
print("Weights:", w, "| K:", k, "| Accuracy:", accuracy, "| Precision:", precision, "| Recall:", recall, "| F1-score:", f1)


Weights: distance | K: 1 | Accuracy: 0.17710077000334784 | Precision: 0.19505024174569763 | Recall: 0.17710077000334784 | F1-score: 0.1762038970665142
Weights: distance | K: 3 | Accuracy: 0.18316873116839638 | Precision: 0.19892416889085393 | Recall: 0.18316873116839638 | F1-score: 0.18087511767690456
Weights: distance | K: 5 | Accuracy: 0.1896342484097757 | Precision: 0.20063819938946664 | Recall: 0.1896342484097757 | F1-score: 0.18457918394139647
Weights: distance | K: 7 | Accuracy: 0.19354703716103114 | Precision: 0.20030099460226883 | Recall: 0.19354703716103114 | F1-score: 0.18570727568625967
Weights: distance | K: 9 | Accuracy: 0.1965391697355206 | Precision: 0.1991862988615002 | Recall: 0.1965391697355206 | F1-score: 0.18596968089839128
Weights: distance | K: 11 | Accuracy: 0.19884081017743555 | Precision: 0.19761923979821464 | Recall: 0.19884081017743555 | F1-score: 0.1854544781958786
Weights: distance | K: 13 | Accuracy: 0.2006611985269501 | Precision: 0.19585214453843852 | Re

Neural Network

In [0]:
pandas_df = pdf.copy()

In [0]:
class ComplexNN(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, dropout_prob=0.75):
        super(ComplexNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.bn1 = nn.BatchNorm1d(hidden_size1)
        self.act1 = nn.Tanh()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.bn2 = nn.BatchNorm1d(hidden_size2)
        self.act2 = nn.Tanh()
        self.fc3 = nn.Linear(hidden_size2, output_size)
        self.dropout = nn.Dropout(dropout_prob)
        nn.Tanh

    def forward(self, x):
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.act1(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.bn2(out)
        out = self.act2(out)
        out = self.dropout(out)
        out = self.fc3(out)
        return out

# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        features = np.array(self.df.iloc[idx]['features'], dtype=np.float32)
        label = self.df.iloc[idx]['label']
        return features, label

UniqueLabels = pandas_df['label'].unique()
label_to_idx = {label: idx for idx, label in enumerate(UniqueLabels)}
pandas_df['label'] = pandas_df['label'].map(label_to_idx)

outputLayer = len(UniqueLabels)
# Hyperparameters
input_size = 128
hidden_size1 = 512
hidden_size2 = 512
# hidden_size2 = int(round((128)*(2/3)+len(UniqueLabels)))
output_size = len(UniqueLabels)
learning_rate = 0.001
num_epochs = 75
batch_size = 64

# Split dataset into train and test sets
grouped_df = pandas_df.groupby('label')
train_data = []
test_data = []

for label, group in grouped_df:
    train_group, test_group = train_test_split(group, test_size=0.2, random_state=42)
    train_data.append(train_group)
    test_data.append(test_group)

train_df = pd.concat(train_data)
test_df = pd.concat(test_data)

train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Create dataset and dataloader for train and test sets
train_dataset = CustomDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = CustomDataset(test_df)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize the model, loss function, and optimizer
model_nn_noScraping = ComplexNN(input_size, hidden_size1, hidden_size2, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_nn_noScraping.parameters(), lr=learning_rate)

In [0]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Training loop
for epoch in range(num_epochs):
    t = time.time()
    # Training
    model_nn_noScraping.train()  # Set the model to train mode
    total_train_loss = 0.0
    for i, (inputs, labels) in enumerate(train_dataloader):
        inputs = inputs.view(-1, input_size)
        outputs = model_nn_noScraping(inputs)

        # Convert labels from tuple to Tensor
        labels = torch.tensor(labels)

        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)

    # Testing
    model_nn_noScraping.eval()  # Set the model to evaluation mode
    total_correct_top3 = 0
    total_samples = 0
    total_test_loss = 0.0
    true_labels = []
    predicted_labels = []
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            inputs = inputs.view(-1, input_size)
            outputs = model_nn_noScraping(inputs)
            loss = criterion(outputs, labels)

            # Convert labels from tuple to Tensor
            labels = torch.tensor(labels)

            total_test_loss += loss.item()

            _, predicted = torch.topk(outputs, k=3, dim=1)  # Get top-3 predictions
            total_correct_top3 += sum([label in pred_list for label, pred_list in zip(labels, predicted)])
            total_samples += labels.size(0)

            true_labels.extend(labels.numpy())
            predicted_labels.extend(predicted.numpy())

    avg_test_loss = total_test_loss / len(test_dataloader)
    test_top3_accuracy = total_correct_top3 / total_samples

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Test Loss: {avg_test_loss:.4f}, Top-3 Test Accuracy: {test_top3_accuracy:.4f}, took: {time.time() - t} seconds')

print('Finished Training')
print(f'Total Epochs: {num_epochs}')
print(f'Final Training Loss: {avg_train_loss:.4f}')
print(f'Final Test Loss: {avg_test_loss:.4f}')
print(f'Final Test Accuracy: {test_top3_accuracy:.4f}')

Epoch [1/75], Train Loss: 8.2259, Test Loss: 7.1025, Top-3 Test Accuracy: 0.1895, took: 171.44365859031677 seconds
Epoch [2/75], Train Loss: 7.1658, Test Loss: 6.4797, Top-3 Test Accuracy: 0.2450, took: 171.76789903640747 seconds
Epoch [3/75], Train Loss: 6.7542, Test Loss: 6.1575, Top-3 Test Accuracy: 0.2791, took: 170.70981407165527 seconds
Epoch [4/75], Train Loss: 6.5515, Test Loss: 5.9895, Top-3 Test Accuracy: 0.2971, took: 224.8121018409729 seconds
Epoch [5/75], Train Loss: 6.4214, Test Loss: 5.8613, Top-3 Test Accuracy: 0.3121, took: 171.80496883392334 seconds
Epoch [6/75], Train Loss: 6.3273, Test Loss: 5.7694, Top-3 Test Accuracy: 0.3216, took: 231.53384733200073 seconds
Epoch [7/75], Train Loss: 6.2577, Test Loss: 5.7005, Top-3 Test Accuracy: 0.3292, took: 171.94415187835693 seconds
Epoch [8/75], Train Loss: 6.2069, Test Loss: 5.6578, Top-3 Test Accuracy: 0.3346, took: 171.91447734832764 seconds
Epoch [9/75], Train Loss: 6.1602, Test Loss: 5.6082, Top-3 Test Accuracy: 0.3403,

In [0]:
# Save the model as a .pkl file
import torch
name = f"finalNNNoSrcraping.pkl"
# Specify the file path to save the model
model_path = f"/dbfs/FileStore/{name}"

# Save the model
torch.save(model_nn_noScraping, model_path)

With the scraping

KNN

In [0]:
pandas_df_israeli = pdf_israeli.copy()
pandas_df = pdf.copy()

In [0]:
UniqueLabels_all = pandas_df['label'].unique()
label_to_idx = {label: idx for idx, label in enumerate(UniqueLabels_all)}
pandas_df['label'] = pandas_df['label'].map(label_to_idx)

In [0]:
UniqueLabels_israeli = pandas_df_israeli['label'].unique()
label_to_idx = {label: idx+12668 for idx, label in enumerate(UniqueLabels_israeli)}
pandas_df_israeli['label'] = pandas_df_israeli['label'].map(label_to_idx)

In [0]:
# Concatenating the two dataframes
concatenated_df = pd.concat([pandas_df, pandas_df_israeli])

In [0]:
UniqueLabels= concatenated_df['label'].unique()

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


# Splitting the data into features (X) and labels (y)
X = np.array(concatenated_df['features'].tolist())
y = np.array(concatenated_df['label'])

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [0]:
for w in ["distance"]:
    for k in range(5, 30,2):
        # Initializing the K-Nearest Neighbors classifier
        knn_classifier = KNeighborsClassifier(n_neighbors=k, weights=w)

        # Fitting the classifier to the training data
        knn_classifier.fit(X_train, y_train)

        # Predicting labels for the test set
        y_pred = knn_classifier.predict(X_test)

        # Calculating evaluation metrics
        accuracy, precision, recall, f1 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted'), recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')

        # Printing evaluation metrics
        print("Weights:", w, "| K:", k, "| Accuracy:", accuracy, "| Precision:", precision, "| Recall:", recall, "| F1-score:", f1)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Weights: distance | K: 5 | Accuracy: 0.19366395201099748 | Precision: 0.2055844297674794 | Recall: 0.19366395201099748 | F1-score: 0.1889949813777644
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Weights: distance | K: 7 | Accuracy: 0.19830872091812293 | Precision: 0.2064468157542146 | Recall: 0.19830872091812293 | F1-score: 0.1909125194048667
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Weights: distance | K: 9 | Accuracy: 0.20120389077503073 | Precision: 0.20526188004892684 | Recall: 0.20120389077503073 | F1-score: 0.19120562963043342
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Weights: distance | K: 11 | Accuracy: 0.20343254670804609 | Precision: 0.20322031859701187 | Recall: 0.20343

NN

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Define your neural network architecture
class ComplexNN(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, dropout_prob=0.75):
        super(ComplexNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.bn1 = nn.BatchNorm1d(hidden_size1)
        self.act1 = nn.Tanh()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.bn2 = nn.BatchNorm1d(hidden_size2)
        self.act2 = nn.Tanh()
        self.fc3 = nn.Linear(hidden_size2, output_size)
        self.dropout = nn.Dropout(dropout_prob)
        nn.Tanh

    def forward(self, x):
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.act1(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.bn2(out)
        out = self.act2(out)
        out = self.dropout(out)
        out = self.fc3(out)
        return out

# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        features = np.array(self.df.iloc[idx]['features'], dtype=np.float32)
        label = self.df.iloc[idx]['label']
        return features, label
    

outputLayer = len(UniqueLabels)
# Hyperparameters
input_size = 128
hidden_size1 = 512
hidden_size2 = 512
output_size = len(UniqueLabels)
learning_rate = 0.001
num_epochs = 75
batch_size = 64

# Split dataset into train and test sets
grouped_df = concatenated_df.groupby('label')
train_data = []
test_data = []

for label, group in grouped_df:
    train_group, test_group = train_test_split(group, test_size=0.2, random_state=42)
    train_data.append(train_group)
    test_data.append(test_group)

train_df = pd.concat(train_data)
test_df = pd.concat(test_data)

train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Create dataset and dataloader for train and test sets
train_dataset = CustomDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = CustomDataset(test_df)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize the model, loss function, and optimizer
model = ComplexNN(input_size, hidden_size1, hidden_size2, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [0]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Training loop
for epoch in range(num_epochs):
    t = time.time()
    # Training
    model.train()  # Set the model to train mode
    total_train_loss = 0.0
    for i, (inputs, labels) in enumerate(train_dataloader):
        inputs = inputs.view(-1, input_size)
        outputs = model(inputs)

        # Convert labels from tuple to Tensor
        labels = torch.tensor(labels)

        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)

    # Testing
    model.eval()  # Set the model to evaluation mode
    total_correct_top3 = 0
    total_samples = 0
    total_test_loss = 0.0
    true_labels = []
    predicted_labels = []
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            inputs = inputs.view(-1, input_size)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Convert labels from tuple to Tensor
            labels = torch.tensor(labels)

            total_test_loss += loss.item()

            _, predicted = torch.topk(outputs, k=3, dim=1)  # Get top-3 predictions
            total_correct_top3 += sum([label in pred_list for label, pred_list in zip(labels, predicted)])
            total_samples += labels.size(0)

            true_labels.extend(labels.numpy())
            predicted_labels.extend(predicted.numpy())

    avg_test_loss = total_test_loss / len(test_dataloader)
    test_top3_accuracy = total_correct_top3 / total_samples

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Test Loss: {avg_test_loss:.4f}, Top-3 Test Accuracy: {test_top3_accuracy:.4f}, took: {time.time() - t} seconds')

print('Finished Training')
print(f'Total Epochs: {num_epochs}')
print(f'Final Training Loss: {avg_train_loss:.4f}')
print(f'Final Test Loss: {avg_test_loss:.4f}')
print(f'Final Test Accuracy: {test_top3_accuracy:.4f}')

  labels = torch.tensor(labels)
  labels = torch.tensor(labels)
Epoch [1/75], Train Loss: 8.2385, Test Loss: 7.1234, Top-3 Test Accuracy: 0.1862, took: 202.51670932769775 seconds
Epoch [2/75], Train Loss: 7.1894, Test Loss: 6.4977, Top-3 Test Accuracy: 0.2443, took: 208.6511845588684 seconds
Epoch [3/75], Train Loss: 6.7791, Test Loss: 6.1806, Top-3 Test Accuracy: 0.2777, took: 253.9337182044983 seconds
Epoch [4/75], Train Loss: 6.5691, Test Loss: 6.0123, Top-3 Test Accuracy: 0.2942, took: 242.70818901062012 seconds
Epoch [5/75], Train Loss: 6.4376, Test Loss: 5.8900, Top-3 Test Accuracy: 0.3095, took: 237.8256380558014 seconds
Epoch [6/75], Train Loss: 6.3429, Test Loss: 5.7934, Top-3 Test Accuracy: 0.3193, took: 212.90920853614807 seconds
Epoch [7/75], Train Loss: 6.2683, Test Loss: 5.7193, Top-3 Test Accuracy: 0.3279, took: 237.38962507247925 seconds
Epoch [8/75], Train Loss: 6.2200, Test Loss: 5.6660, Top-3 Test Accuracy: 0.3343, took: 177.56411790847778 seconds
Epoch [9/75], Train

In [0]:
# Save the model as a .pkl file
import torch
name = f"finalNNWithSrcraping.pkl"
# Specify the file path to save the model
model_path = f"/dbfs/FileStore/{name}"

# Save the model
torch.save(model, model_path)

Checking the AVG accuracy of the model only on the scraped data.

In [0]:
# Find the scraped data in test file.
scrapedTestDF = test_df[test_df['label'] >= 12668].sort_values(by='label', ascending=True).copy()
scrapedTest_dataset = CustomDataset(scrapedTestDF)
scrapedTest_dataloader = DataLoader(scrapedTest_dataset, batch_size=batch_size, shuffle=False)

In [0]:
scrapedTestDF["label"].nunique()

Out[105]: 168

In [0]:
# Testing
model.eval()  # Set the model to evaluation mode
total_correct_top3 = 0
total_samples = 0
total_test_loss = 0.0
true_labels = []
predicted_labels = []
with torch.no_grad():
    for inputs, labels in scrapedTest_dataloader:
        inputs = inputs.view(-1, input_size)
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Convert labels from tuple to Tensor
        labels = torch.tensor(labels)

        total_test_loss += loss.item()

        _, predicted = torch.topk(outputs, k=3, dim=1)  # Get top-3 predictions
        total_correct_top3 += sum([label in pred_list for label, pred_list in zip(labels, predicted)])
        total_samples += labels.size(0)

        true_labels.extend(labels.numpy())
        predicted_labels.extend(predicted.numpy())

avg_test_loss = total_test_loss / len(scrapedTest_dataloader)
test_top3_accuracy = total_correct_top3 / total_samples

print(f'Test Loss: {avg_test_loss:.4f}, Top-3 Test Accuracy: {test_top3_accuracy:.4f}')

Test Loss: 4.1974, Top-3 Test Accuracy: 0.4949
  labels = torch.tensor(labels)
