# Automated Website Categorization Using Machine Learning Algorithms

This notebook processes the website data and builds an ML model to predict the category of the website.

Verizon, Group 41
Athena Bai, Tia Zheng, Kathy Yang, Tapuwa Kabaira, Chris Smith

Nov. 1, 2024

## 1. Prepare data

In [37]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
#import seaborn as sns

from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.linear_model import LinearRegression

from sklearn.metrics import accuracy_score
#from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Load the labelled URLs; the first row of the CSV holds the column names
cat_urls = pd.read_csv(filepath_or_buffer="categorizedurls.csv", header=0)

In [4]:
# Load the two feature tables (the first row of each file is the header)
sentiment_df = pd.read_csv("output_with_sentiment.csv", header=0)
url_ending_df = pd.read_csv("output_with_url_endings.csv", header=0)

In [8]:
# Assemble the urls, features and labels into a single table.
# Columns are picked by position and placed side by side, so rows are
# matched on the DataFrame index.
df = pd.concat(
    [
        cat_urls.iloc[:, 0],           # first column of the labels file
        sentiment_df.iloc[:, [2, 3]],  # Sentiment Score and Sentiment Magnitude
        url_ending_df.iloc[:, 0],      # first column of the url-endings file
        cat_urls.iloc[:, 1],           # second column of the labels file
    ],
    axis=1,
)

# Write the combined table to disk, leaving out the index
df.to_csv('data.csv', index=False)

## 2. Preprocessing

In [25]:
# Show the column names of the combined table
df.columns

Index(['url', 'Sentiment Score', 'Sentiment Magnitude', 'url_ending',
       'category'],
      dtype='object')

In [27]:
# Feature names: every column except 'url' and 'category'
features = df.columns.tolist()
for non_feature in ('url', 'category'):
    features.remove(non_feature)

### Missing data

In [18]:
# Count the missing values in each column
df.isna().sum()

url                      0
Sentiment Score        993
Sentiment Magnitude    993
url_ending               0
category                 0
dtype: int64

In [9]:
# Possible approaches: mean/median imputation, mode imputation, KNN imputation, regression imputation

In [10]:
# Example of mean imputation

In [13]:
# Keep only the numeric columns of the table
numeric_df = df.select_dtypes(include="number")
# Replace every missing entry with the mean of its own column
numeric_df_imputed = numeric_df.fillna(value=numeric_df.mean(axis=0))

In [15]:
# Uncomment the following line to inspect the processed data
# numeric_df_imputed.head(20)

Comment:
Mean imputation might not yield good predictions here because too much of the data is missing: each sentiment column has 993 null values.

In [None]:
### One-hot Encoding

In [22]:
# Collect the names of all object-typed columns
to_encode = df.select_dtypes(include='object').columns.tolist()

In [23]:
# Count the distinct values in each of those columns (approach from lab3)
df.loc[:, to_encode].nunique()

url           1000
url_ending      18
category        50
dtype: int64

In [41]:
# One-hot encoding every url ending would slow down the computation,
# so keep only the ten most frequent endings.
top_10_ending = df['url_ending'].value_counts().index[:10].tolist()
top_10_ending

['com', 'org', 'gov', 'net', 'fm', 'tv', 'us', 'edu', 'co', 'ly']

In [42]:
# Add one 0/1 indicator column per frequent ending
for ending in top_10_ending:
    df['url_ending' + ending] = (df['url_ending'] == ending).astype(int)

# The original text column is no longer needed in the df
del df['url_ending']

## 3. Preparation for modeling

In [43]:
# Target: the 'category' column.
# Predictors: every remaining column except the raw 'url'.
y = df['category']
X = df.drop(columns=['url', 'category'])

In [44]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

## 4. Modeling

### Train two Decision Tree Classifiers

In [45]:
# The crit parameter defaults to 'entropy'.
# scikit-learn's default value for min_samples_leaf (1) is left unchanged.
def train_test_DT(X_train, X_test, y_train, y_test, depth, crit='entropy', random_state=42):
    """Fit a decision tree on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels used to compute the accuracy.
    depth
        Value passed to the classifier as ``max_depth``.
    crit
        Value passed to the classifier as ``criterion``.
    random_state
        Seed passed to the classifier so repeated runs produce the same
        tree. Pass ``None`` to get scikit-learn's unseeded behaviour.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)``.
    """
    # Seed the tree: scikit-learn permutes the features at every split,
    # so an unseeded tree can differ from run to run on the same data.
    model = DecisionTreeClassifier(
        max_depth=depth,
        criterion=crit,
        random_state=random_state,
    )

    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Make predictions on the test data
    class_label_predictions = model.predict(X_test)

    # Compute the accuracy
    return accuracy_score(y_test, class_label_predictions)

In [47]:
# The two tree depths to compare
depth1, depth2 = 8, 64
max_depth_range = [depth1, depth2]

# Test-set accuracy for each depth, in the same order as max_depth_range
acc = []
for md in max_depth_range:
    score = train_test_DT(X_train, X_test, y_train, y_test, depth=md)
    print(f'Max Depth={md!s}, accuracy score: {score!s}')
    acc.append(float(score))

Max Depth=8, accuracy score: 0.24
Max Depth=64, accuracy score: 0.24
