In [68]:
import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the dataset locations used by later cells are visible in the output.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

/kaggle/input/parsed-data1/Combined_Basit_Labels.csv
/kaggle/input/cleaned-parsed-data/cleaned_dataset.csv


In [69]:
import pandas as pd
import fasttext
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [70]:
# Load dataset
# Change DATASET_CSV to your actual dataset file.
DATASET_CSV = "/kaggle/input/cleaned-parsed-data/cleaned_dataset.csv"
df = pd.read_csv(DATASET_CSV)

In [71]:
# Clean the raw frame in one chained, re-runnable step (no in-place mutation):
#   1. drop rows containing any null value;
#   2. Category: lowercase, trim surrounding whitespace, then remove every
#      trailing dot/whitespace run. The earlier `.str.strip().str.rstrip(".")`
#      left a dangling space for labels such as "news ." and therefore produced
#      a spurious extra category;
#   3. URL: lowercase, trim whitespace, then remove only a *leading* scheme
#      and/or "www." prefix. The pattern is anchored with `^` so that an
#      embedded "http://" (e.g. inside a redirect query string) or a host such
#      as "newwww.example.com" is no longer altered.
URL_PREFIX_PATTERN = r"^(?:https?://)?(?:www\.)?"
TRAILING_DOTS_PATTERN = r"[.\s]+$"

df = df.dropna().assign(
    Category=lambda frame: (
        frame["Category"]
        .str.lower()
        .str.strip()
        .str.replace(TRAILING_DOTS_PATTERN, "", regex=True)
    ),
    URL=lambda frame: (
        frame["URL"]
        .str.lower()
        .str.strip()
        .str.replace(URL_PREFIX_PATTERN, "", regex=True)
    ),
)

In [72]:
# How many rows does each category have?
category_counts = df["Category"].value_counts()

# Labels of the five most frequent categories
top_5_categories = category_counts.nlargest(5).index

# Keep only the rows whose category is one of those five
in_top_5 = df["Category"].isin(top_5_categories)
df = df[in_top_5]

In [73]:
# Hold out 20% of the rows for validation (80% train). The split is stratified
# on Category and uses a fixed random_state so it is repeatable.
train_df, valid_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["Category"],
)

# Convert to FastText format (__label__category URL)
def convert_to_fasttext_format(df, filename):
    """Write ``df`` as a fastText supervised-learning text file.

    Every row becomes exactly one line of the form
    ``__label__<category> <url>``.

    Args:
        df: DataFrame providing a ``Category`` and a ``URL`` column.
        filename: Path of the UTF-8 text file to create (overwritten if it
            already exists).

    Whitespace handling:
        * Whitespace runs inside a category are replaced by ``_`` so the label
          stays a single token; otherwise everything after the first space
          would be written as part of the text instead of the label.
        * Whitespace runs inside a URL (including embedded newlines) are
          collapsed to a single space so one row can never spill onto a second
          line of the output file.
        Rows without whitespace in the category or URL are written exactly as
        before.
    """
    with open(filename, "w", encoding="utf-8") as f:
        # Iterating the two columns directly avoids building a Series per row,
        # which is what iterrows() does.
        for category, url in zip(df["Category"], df["URL"]):
            label = "_".join(str(category).split())
            text = " ".join(str(url).split())
            f.write(f"__label__{label} {text}\n")

In [74]:
# Write the train and validation splits to fastText-formatted text files.
for split_df, out_path in ((train_df, "train.txt"), (valid_df, "valid.txt")):
    convert_to_fasttext_format(split_df, out_path)

In [75]:
# Train FastText model with improved settings.
# The hyperparameters are collected in one dict so they are easy to read and tune.
training_params = {
    "lr": 0.5,          # Higher learning rate
    "epoch": 300,       # More training iterations
    "wordNgrams": 4,    # More word combinations
    "dim": 300,         # Higher embedding size
    "minCount": 5,      # Ignore very rare words
    "minn": 3,          # Better subword features (shortest character n-gram)
    "maxn": 6,          # Better subword features (longest character n-gram)
    "loss": "softmax",
    "verbose": 2,
}
model = fasttext.train_supervised(input="train.txt", **training_params)


In [76]:
# Score the trained model on the held-out validation file.
result = model.test("valid.txt")
# The second element of the result is what this notebook reports as accuracy
# (NOTE(review): fastText documents it as precision@1 -- confirm the wording).
validation_score = result[1]
print(f"Validation Accuracy: {validation_score:.4f}")

Validation Accuracy: 0.7960


In [77]:
# Generate predictions for validation set
# true_labels / pred_labels are parallel lists: entry i of each belongs to the
# same line of valid.txt.
true_labels = []
pred_labels = []

with open("valid.txt", "r", encoding="utf-8") as f:
    for line in f:
        # A blank line has no "<label> <text>" pair; skip it instead of
        # crashing on the tuple unpacking below.
        if not line.strip():
            continue
        # Each line is "__label__<category> <url>"; split once on the first space.
        label, url = line.split(" ", 1)
        true_labels.append(label.replace("__label__", ""))
        # predict() returns (labels, probabilities); keep only the top label.
        # The trailing newline is stripped before the text is handed to the model.
        pred = model.predict(url.strip())[0][0].replace("__label__", "")
        pred_labels.append(pred)

# Print classification report
# sep="" matters: with the default separator print() inserts a space in front
# of the report, which shifts its header row one column out of alignment.
print("\nClassification Report:\n", classification_report(true_labels, pred_labels), sep="")


Classification Report:
                precision    recall  f1-score   support

     commerce       0.65      0.74      0.69      2503
    education       0.82      0.79      0.80      2015
entertainment       0.54      0.46      0.49      1442
   government       0.84      0.80      0.82      1456
        tools       1.00      1.00      1.00      3062

     accuracy                           0.80     10478
    macro avg       0.77      0.76      0.76     10478
 weighted avg       0.80      0.80      0.79     10478



In [None]:
# Save model to the Kaggle working directory.
# NOTE(review): fastText's documentation uses the ".ftz" extension for quantized
# models; no quantize step is visible in this notebook -- confirm the extension
# is intended (the load cell depends on this exact path).
MODEL_OUTPUT_PATH = "/kaggle/working/fasttext_url_classifier_optimized.ftz"
model.save_model(MODEL_OUTPUT_PATH)



In [82]:
# Test with example: reload the saved model from disk and classify one sample
# domain to check that the saved file is usable.
loaded_model = fasttext.load_model("/kaggle/working/fasttext_url_classifier_optimized.ftz")
example_prediction = loaded_model.predict("mofa.gov.pk")  # Example prediction
print(example_prediction)

(('__label__government',), array([1.00001001]))
