In [1]:
import requests
import zipfile
import io

# DATASET STAGE

# URL of the dataset - skills assessment data
url = "https://academy.hackthebox.com/storage/modules/292/skills_assessment_data.zip"
# Download the dataset; the timeout keeps a stalled connection from hanging the kernel
response = requests.get(url, timeout=60)
if response.status_code == 200:
    print("Download successful")
else:
    print("Failed to download the dataset")
    # Stop here: the extraction step would otherwise try to unzip an error page
    # and fail with a much less helpful zipfile error.
    raise RuntimeError(f"Dataset download failed with HTTP status {response.status_code}")

Download successful


In [2]:
# Unpack the downloaded archive directly from memory (the .zip itself is never
# written to disk) into the skills_assessment_data directory
archive_buffer = io.BytesIO(response.content)
with zipfile.ZipFile(archive_buffer) as z:
    z.extractall(path="skills_assessment_data")
    print("Extraction successful")

Extraction successful


In [3]:
import os

# Confirm what the archive contained by listing the extraction directory
extracted_files = os.listdir("skills_assessment_data")
print(f"Extracted files: {extracted_files}")

Extracted files: ['test.json', 'train.json']


In [4]:
import pandas as pd

# Load only the training split for now; the test split is read later, at evaluation time
train_df = pd.read_json("skills_assessment_data/train.json")

# Preview the first few rows
print("Train dataset:", train_df.head(), sep="\n")

Train dataset:
                                                text  label
0  Bromwell High is a cartoon comedy. It ran at t...      1
1  Homelessness (or Houselessness as George Carli...      1
2  Brilliant over-acting by Lesley Ann Warren. Be...      1
3  This is easily the most underrated film inn th...      1
4  This is not the typical Mel Brooks film. It wa...      1


In [5]:
# Report how many fully duplicated rows exist, then remove them.
# drop_duplicates keeps the original index labels, so the index has gaps afterwards.
duplicate_count = train_df.duplicated().sum()
print("Duplicate entries for train:", duplicate_count)
train_df = train_df.drop_duplicates()

Duplicate entries for train: 96


In [6]:
# Inspect the de-duplicated training data: sample rows, label statistics, and schema
print("Train dataset:")
print(train_df.head())
print(train_df.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line to the output.
train_df.info()

Train dataset:
                                                text  label
0  Bromwell High is a cartoon comedy. It ran at t...      1
1  Homelessness (or Houselessness as George Carli...      1
2  Brilliant over-acting by Lesley Ann Warren. Be...      1
3  This is easily the most underrated film inn th...      1
4  This is not the typical Mel Brooks film. It wa...      1
              label
count  24904.000000
mean       0.500803
std        0.500009
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
<class 'pandas.core.frame.DataFrame'>
Index: 24904 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    24904 non-null  object
 1   label   24904 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 583.7+ KB
None


In [7]:
import nltk

# PREPROCESSING STAGE

# Fetch the NLTK resources needed by the preprocessing cells below
# (tokenizer data and the stop-word list)
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

print("=== BEFORE ANY PREPROCESSING ===")
print(train_df.head(5))

=== BEFORE ANY PREPROCESSING ===
                                                text  label
0  Bromwell High is a cartoon comedy. It ran at t...      1
1  Homelessness (or Houselessness as George Carli...      1
2  Brilliant over-acting by Lesley Ann Warren. Be...      1
3  This is easily the most underrated film inn th...      1
4  This is not the typical Mel Brooks film. It wa...      1


[nltk_data] Downloading package punkt to /home/carlomagno/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/carlomagno/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/carlomagno/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Lowercase every review so that tokens differing only in case are treated alike.
# NOTE: this overwrites train_df["text"] in place, so re-running the cell operates
# on already-processed text rather than on the raw reviews.
train_df["text"] = train_df["text"].str.lower()
print("\n=== AFTER LOWERCASING ===")
print(train_df["text"].head(5))


=== AFTER LOWERCASING ===
0    bromwell high is a cartoon comedy. it ran at t...
1    homelessness (or houselessness as george carli...
2    brilliant over-acting by lesley ann warren. be...
3    this is easily the most underrated film inn th...
4    this is not the typical mel brooks film. it wa...
Name: text, dtype: object


In [9]:
import re

# Keep only lowercase letters, whitespace, "$" and "!". Digits and all other
# punctuation are deleted outright (not replaced by a space), so "over-acting"
# becomes "overacting".
non_essential_chars = re.compile(r"[^a-z\s$!]")
train_df["text"] = train_df["text"].apply(lambda review: non_essential_chars.sub("", review))
print("\n=== AFTER REMOVING PUNCTUATION & NUMBERS (except $ and !) ===")
print(train_df["text"].head(5))


=== AFTER REMOVING PUNCTUATION & NUMBERS (except $ and !) ===
0    bromwell high is a cartoon comedy it ran at th...
1    homelessness or houselessness as george carlin...
2    brilliant overacting by lesley ann warren best...
3    this is easily the most underrated film inn th...
4    this is not the typical mel brooks film it was...
Name: text, dtype: object


In [10]:
from nltk.tokenize import word_tokenize

# Split each review into a list of word tokens using NLTK's word_tokenize.
# After this cell, train_df["text"] holds lists of strings instead of plain strings.
train_df["text"] = train_df["text"].apply(word_tokenize)
print("\n=== AFTER TOKENIZATION ===")
print(train_df["text"].head(5))


=== AFTER TOKENIZATION ===
0    [bromwell, high, is, a, cartoon, comedy, it, r...
1    [homelessness, or, houselessness, as, george, ...
2    [brilliant, overacting, by, lesley, ann, warre...
3    [this, is, easily, the, most, underrated, film...
4    [this, is, not, the, typical, mel, brooks, fil...
Name: text, dtype: object


In [11]:
from nltk.corpus import stopwords

# Build the English stop-word set once (set membership checks are fast),
# then filter those words out of every review's token list
stop_words = set(stopwords.words("english"))
train_df["text"] = train_df["text"].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)
print("\n=== AFTER REMOVING STOP WORDS ===")
print(train_df["text"].head(5))


=== AFTER REMOVING STOP WORDS ===
0    [bromwell, high, cartoon, comedy, ran, time, p...
1    [homelessness, houselessness, george, carlin, ...
2    [brilliant, overacting, lesley, ann, warren, b...
3    [easily, underrated, film, inn, brooks, cannon...
4    [typical, mel, brooks, film, much, less, slaps...
Name: text, dtype: object


In [12]:
from nltk.stem import PorterStemmer

# Reduce every token to its Porter stem (e.g. "comedy" -> "comedi"), so that
# inflected forms of a word share a single feature
stemmer = PorterStemmer()
train_df["text"] = train_df["text"].apply(lambda tokens: list(map(stemmer.stem, tokens)))
print("\n=== AFTER STEMMING ===")
print(train_df["text"].head(5))


=== AFTER STEMMING ===
0    [bromwel, high, cartoon, comedi, ran, time, pr...
1    [homeless, houseless, georg, carlin, state, is...
2    [brilliant, overact, lesley, ann, warren, best...
3    [easili, underr, film, inn, brook, cannon, sur...
4    [typic, mel, brook, film, much, less, slapstic...
Name: text, dtype: object


In [13]:
# Collapse each token list back into one space-separated string, the input
# format expected by the vectorizer
train_df["text"] = train_df["text"].apply(" ".join)
print("\n=== AFTER JOINING TOKENS BACK INTO STRINGS ===")
print(train_df["text"].head(5))


=== AFTER JOINING TOKENS BACK INTO STRINGS ===
0    bromwel high cartoon comedi ran time program s...
1    homeless houseless georg carlin state issu yea...
2    brilliant overact lesley ann warren best drama...
3    easili underr film inn brook cannon sure flaw ...
4    typic mel brook film much less slapstick movi ...
Name: text, dtype: object


In [14]:
# FEATURE EXTRACTION

from sklearn.feature_extraction.text import CountVectorizer

# Count unigrams, bigrams and trigrams (ngram_range=(1, 3)).
# min_df=1 keeps every term that occurs at least once; max_df=0.9 drops terms
# that appear in more than 90% of the reviews.
vectorizer = CountVectorizer(min_df=1, max_df=0.9, ngram_range=(1, 3))

# Fit the vectorizer on the preprocessed review text and build the count matrix.
# NOTE(review): X is not referenced by the later cells shown here; the pipeline
# built below vectorizes the text again during the grid search.
X = vectorizer.fit_transform(train_df["text"])

# Labels (target variable)
y = train_df["label"] # labels are already 0/1 integers; no conversion is performed here

In [15]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Chain the vectorizer and a Multinomial Naive Bayes classifier into a single
# estimator, so raw (preprocessed) text can be fed straight to fit/predict
pipeline_steps = [
    ("vectorizer", vectorizer),
    ("classifier", MultinomialNB()),
]
pipeline = Pipeline(steps=pipeline_steps)

In [16]:
# Candidate smoothing strengths (alpha) for the Naive Bayes classifier step
param_grid = {"classifier__alpha": [0.01, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75, 1.0]}

# 5-fold cross-validated grid search over the whole pipeline, scored by F1
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
)

# Run the search on the preprocessed training text and its labels
grid_search.fit(train_df["text"], y)

# Keep the best pipeline found by the search and report its parameters
best_model = grid_search.best_estimator_
print("Best model parameters:", grid_search.best_params_)

Best model parameters: {'classifier__alpha': 1.0}


In [17]:
# Testing Evaluation

# Load the test split, which has not been touched by any of the training steps above.
# orient="records" tells pandas to expect a JSON list of row objects.
test_df = pd.read_json("skills_assessment_data/test.json", orient="records")

In [18]:
import numpy as np
import re

# Preprocess function that mirrors the training-time preprocessing
def preprocess_test_df(text):
    """Apply the training-time preprocessing steps to a single review string.

    The steps are: lowercase, delete every character other than lowercase
    letters, whitespace, "$" and "!", tokenize, drop English stop words, and
    stem the remaining tokens.

    Relies on the notebook-level ``word_tokenize``, ``stop_words`` and
    ``stemmer`` objects created in the preprocessing cells.

    Args:
        text: Raw review text.

    Returns:
        The stemmed tokens joined into one space-separated string.
    """
    cleaned = re.sub(r"[^a-z\s$!]", "", text.lower())
    stemmed_tokens = [
        stemmer.stem(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    ]
    return " ".join(stemmed_tokens)

In [19]:
# Run every test review through the same preprocessing used for training.
# This yields a plain list of strings; no vectorization is done at this point.
processed_test_df = list(map(preprocess_test_df, test_df["text"]))

In [20]:
# Turn the preprocessed reviews into count features with the vectorizer that
# was fitted as part of the best pipeline (transform only, no re-fitting)
fitted_vectorizer = best_model.named_steps["vectorizer"]
X_new = fitted_vectorizer.transform(processed_test_df)

In [21]:
# Score the vectorized test reviews with the fitted classifier:
# hard class labels plus per-class probabilities
fitted_classifier = best_model.named_steps["classifier"]
predictions = fitted_classifier.predict(X_new)
prediction_probabilities = fitted_classifier.predict_proba(X_new)

In [22]:
from sklearn.metrics import classification_report, accuracy_score

# Overall accuracy and the per-class precision / recall / F1 on the test split
print("Accuracy:", accuracy_score(test_df["label"], predictions))
print(classification_report(test_df["label"], predictions))

# Show a few predictions with probabilities. The count is capped at the number
# of predictions so a test set with fewer than five rows does not raise IndexError.
for i in range(min(5, len(predictions))):
    msg = test_df["text"].iloc[i]
    prediction = "Positive Review" if predictions[i] == 1 else "Negative Review"
    # Probability columns are indexed by class label: column 0 = negative, column 1 = positive
    positive_probability = prediction_probabilities[i][1]
    negative_probability = prediction_probabilities[i][0]

    print(f"\nReview: {msg[:100]}...")  # truncate long reviews
    print(f"Prediction: {prediction}")
    print(f"Positive Probability: {positive_probability:.2f}")
    print(f"Negative Probability: {negative_probability:.2f}")

Accuracy: 0.8554
              precision    recall  f1-score   support

           0       0.83      0.89      0.86     12500
           1       0.88      0.82      0.85     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000


Review: I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that...
Prediction: Positive Review
Positive Probability: 1.00
Negative Probability: 0.00

Review: Actor turned director Bill Paxton follows up his promising debut, the Gothic-horror "Frailty", with ...
Prediction: Positive Review
Positive Probability: 1.00
Negative Probability: 0.00

Review: As a recreational golfer with some knowledge of the sport's history, I was pleased with Disney's sen...
Prediction: Positive Review
Positive Probability: 1.00
Negative Probability: 0.00

Review: I saw this film in a sneak preview, and it is delightful. The 

In [23]:
import joblib

# Persist the best pipeline (vectorizer + classifier) so it can be reloaded
# later without retraining
model_filename = "skills_assessment.joblib"
joblib.dump(best_model, model_filename)

print("Model saved to", model_filename)

Model saved to skills_assessment.joblib


In [24]:
# Reload the saved pipeline and predict with it. The pipeline vectorizes its
# input internally, so it must receive an iterable of preprocessed review
# strings. Passing test_df itself would iterate over the DataFrame's column
# names, yielding one prediction per column instead of one per review.
loaded_model = joblib.load(model_filename)
predictions = loaded_model.predict(processed_test_df)