In [1]:
import zipfile
import os
import pandas as pd
import numpy as np

# Extract the dataset
with zipfile.ZipFile('skills_assessment_data.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

# Explore the extracted files
for root, dirs, files in os.walk('.'):
    for file in files:
        if file.endswith('.csv') or file.endswith('.txt'):
            print(os.path.join(root, file))

In [2]:
import json
import pandas as pd

# 1.1) Read train.json and test.json into DataFrames
with open('train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)
with open('test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

train_df = pd.DataFrame(train_data)
test_df  = pd.DataFrame(test_data)

# 1.2) Quick sanity checks
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape:     {test_df.shape}\n")

print("Columns in training data:", train_df.columns.tolist())
print("\nLabel distribution (train):")
print(train_df['label'].value_counts(), "\n")

print("Sample training row:")
print(train_df.head(2))


Training data shape: (25000, 2)
Test data shape:     (25000, 2)

Columns in training data: ['text', 'label']

Label distribution (train):
label
1    12500
0    12500
Name: count, dtype: int64 

Sample training row:
                                                text  label
0  Bromwell High is a cartoon comedy. It ran at t...      1
1  Homelessness (or Houselessness as George Carli...      1


In [3]:
import re

def preprocess_text(text: str) -> str:
    # 2.1) Lowercase
    text = text.lower()
    # 2.2) Strip HTML tags like "<br />"
    text = re.sub(r'<[^>]+>', ' ', text)
    # 2.3) Remove all non-letter characters, leaving only a–z and spaces
    text = re.sub(r'[^a-z\s]', ' ', text)
    # 2.4) Collapse multiple spaces into one
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# 2.5) Test on a few samples from train_df and test_df
samples = [
    train_df['text'].iloc[0],
    train_df['text'].iloc[1],
    test_df['text'].iloc[0],
    test_df['text'].iloc[1]
]

print("=== Original vs. Preprocessed ===\n")
for i, raw in enumerate(samples):
    cleaned = preprocess_text(raw)
    print(f"Sample {i+1} original (first 100 chars):\n{raw[:100]}…")
    print(f"Sample {i+1} cleaned:              \n{cleaned[:100]}…\n")


=== Original vs. Preprocessed ===

Sample 1 original (first 100 chars):
Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life,…
Sample 1 cleaned:              
bromwell high is a cartoon comedy it ran at the same time as some other programs about school life s…

Sample 2 original (first 100 chars):
Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan…
Sample 2 cleaned:              
homelessness or houselessness as george carlin stated has been an issue for years but never a plan t…

Sample 3 original (first 100 chars):
I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that…
Sample 3 cleaned:              
i went and saw this movie last night after being coaxed to by a few friends of mine i ll admit that …

Sample 4 original (first 100 chars):
Actor turned director Bill Paxton follows up his promising debut, the Gothic-horror "Frailty", with …
Samp

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# 3.1) Create the TF-IDF vectorizer (using our preprocess_text from Stage 2)
tfidf_vectorizer = TfidfVectorizer(
    preprocessor=preprocess_text,  # calls our function automatically
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8
)

# 3.2) Fit on all training reviews and transform both train and portal test
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['text'])
X_test_tfidf  = tfidf_vectorizer.transform(test_df['text'])

print(f"TF-IDF shapes ➞ Train: {X_train_tfidf.shape},  Test: {X_test_tfidf.shape}")

# 3.3) Check how many test rows became “all-zero” vectors
nonzero_counts = X_test_tfidf.getnnz(axis=1)        # number of nonzero components per row
zero_fraction  = np.mean(nonzero_counts == 0)       # fraction with no nonzero terms
print(f"Fraction of test rows with zero TF-IDF features: {zero_fraction:.3f}")


TF-IDF shapes ➞ Train: (25000, 10000),  Test: (25000, 10000)
Fraction of test rows with zero TF-IDF features: 0.000


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 4.1) Create a local train/validation split (80/20)
X_text = train_df['text']
y_label = train_df['label']

X_text_train, X_text_val, y_train, y_val = train_test_split(
    X_text, y_label,
    test_size=0.20,
    random_state=42,
    stratify=y_label
)

# 4.2) Vectorize those splits
X_train_local_tfidf = tfidf_vectorizer.fit_transform(X_text_train)
X_val_local_tfidf   = tfidf_vectorizer.transform(X_text_val)

# 4.3) Train logistic regression on the 80% training portion
clf = LogisticRegression(random_state=42, max_iter=1000, C=1.0, solver='liblinear')
clf.fit(X_train_local_tfidf, y_train)

# 4.4) Predict + evaluate on local 20% validation
y_val_pred = clf.predict(X_val_local_tfidf)
val_acc = accuracy_score(y_val, y_val_pred)
print(f"Local validation accuracy: {val_acc:.4f}\n")

print("Classification Report (local 20%):")
print(classification_report(y_val, y_val_pred, target_names=['negative','positive']))

print("Confusion Matrix (local 20%):")
cm = confusion_matrix(y_val, y_val_pred)
print(f"TN:{cm[0,0]}  FP:{cm[0,1]}")
print(f"FN:{cm[1,0]}  TP:{cm[1,1]}")


Local validation accuracy: 0.8754

Classification Report (local 20%):
              precision    recall  f1-score   support

    negative       0.89      0.86      0.87      2500
    positive       0.87      0.89      0.88      2500

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000

Confusion Matrix (local 20%):
TN:2154  FP:346
FN:277  TP:2223


In [6]:
from sklearn.metrics import accuracy_score

# 5.1) Refit TF-IDF on entire train set (25000 rows) and retrain classifier
X_full_tfidf = tfidf_vectorizer.fit_transform(train_df['text'])
clf_full = LogisticRegression(random_state=42, max_iter=1000, C=1.0, solver='liblinear')
clf_full.fit(X_full_tfidf, train_df['label'])

# 5.2) Transform portal test set and predict
X_portal_tfidf = tfidf_vectorizer.transform(test_df['text'])
y_portal_pred  = clf_full.predict(X_portal_tfidf)
portal_acc     = accuracy_score(test_df['label'], y_portal_pred)

print(f"Accuracy on portal’s test.json (locally): {portal_acc:.4f}")


Accuracy on portal’s test.json (locally): 0.8828


In [11]:
# -------------- CELL B: build + save a purely‐standard sklearn pipeline --------------

import joblib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    (
        "tfidf",
        TfidfVectorizer(
            lowercase=True,
            # only keep tokens consisting of letters a–z
            token_pattern=r"(?u)\b[a-z]+\b",
            max_features=10000,
            stop_words="english",
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.8
        )
    ),
    (
        "clf",
        LogisticRegression(
            random_state=42,
            max_iter=1000,
            solver="liblinear"
        )
    )
])

# Fit on the RAW "text" column (no separate cleaning step!)
pipeline.fit(train_df["text"], train_df["label"])

# Overwrite the old joblib file:
joblib.dump(pipeline, "skills_assessment.joblib")
print("✅ pipeline saved to skills_assessment.joblib")


✅ pipeline saved to skills_assessment.joblib


In [12]:
# -------------- CELL C: sanity_check_loaded_model --------------
import joblib

# 1) Load back the file you just created
loaded_pipeline = joblib.load("skills_assessment.joblib")

# 2) Try a couple of dummy predictions on RAW text:
examples = [
    "I loved this movie, it was fantastic!",
    "Worst film ever. I hated every minute.",
]

# **NO ADDITIONAL CLEANING** — pass the raw strings directly:
preds = loaded_pipeline.predict(examples)
print("Example texts → predicted labels:", list(zip(examples, preds)))

# (Optionally, check .predict_proba or a quick local test on test_df):
from sklearn.metrics import accuracy_score
y_test_pred = loaded_pipeline.predict(test_df["text"])
print("Local test accuracy:", accuracy_score(test_df["label"], y_test_pred))


Example texts → predicted labels: [('I loved this movie, it was fantastic!', np.int64(1)), ('Worst film ever. I hated every minute.', np.int64(0))]
Local test accuracy: 0.88112


In [13]:
# -------------- CELL D: upload to HTB endpoint --------------
import requests
import json

url = "http://10.129.146.230:5000/api/upload"
model_file_path = "skills_assessment.joblib"

print("🚀 Uploading model to evaluation portal...")
with open(model_file_path, "rb") as model_file:
    response = requests.post(url, files={"model": model_file})

print("Status code:", response.status_code)
print(json.dumps(response.json(), indent=4))


🚀 Uploading model to evaluation portal...
Status code: 200
{
    "accuracy": 0.0,
    "metrics": null,
    "misclassified": []
}
