# Automated website categorization using machine learning algorithms

This notebook processes the website data and builds an ML model to predict the category of the website.

Verizon, Group 41
<br>Athena Bai, Tia Zheng, Kathy Yang, Tapuwa Kabaira, Chris Smith

Last updated: Nov. 2, 2024

## 1. Prepare data

In [1]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
#import seaborn as sns

from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.linear_model import LinearRegression

from sklearn.metrics import accuracy_score
#from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read labels: the first row of the CSV holds the column names
cat_urls = pd.read_csv(filepath_or_buffer="categorizedurls.csv", header=0)

In [3]:
# Read features: one CSV per feature family, first row holds the column names
sentiment_df = pd.read_csv("output_with_sentiment.csv", header=0)
url_ending_df = pd.read_csv("output_with_url_endings.csv", header=0)

In [4]:
# Assemble the url, the features and the label side by side in one frame.
# Columns are picked by position; pd.concat(axis=1) lines rows up by index label.
url_col = cat_urls.iloc[:, 0]
sentiment_cols = sentiment_df.iloc[:, [2, 3]]  # Sentiment Score and Sentiment Magnitude
url_ending_col = url_ending_df.iloc[:, 0]
category_col = cat_urls.iloc[:, 1]

df = pd.concat(
    [url_col, sentiment_cols, url_ending_col, category_col],
    axis=1,
)

# Persist the combined table (the index is not written out)
df.to_csv('combined_data.csv', index=False)

## 2. Preprocessing

In [5]:
# df.shape is a (rows, columns) tuple
rows_and_columns = df.shape
print("Number of rows and columns:", rows_and_columns)

Number of rows and columns: (1000, 5)


In [6]:
# Identify the features: every column except the url and the label.
# list.remove raises ValueError if an expected column is absent.
features = df.columns.tolist()
for non_feature in ('url', 'category'):
    features.remove(non_feature)
features

['Sentiment Score', 'Sentiment Magnitude', 'url_ending']

In [7]:
# Number of feature columns left after excluding 'url' and 'category'
len(features)

3

### Missing data

In [8]:
# Count the missing values in each column
df.isna().sum()

url                      0
Sentiment Score        993
Sentiment Magnitude    993
url_ending               0
category                 0
dtype: int64

In [9]:
# Possible: mean/median imputation, mode imputation, KNN imputation, regression imputation

In [10]:
# Example of mean imputation

In [11]:
# Keep only the numeric columns; 'number' selects every numeric dtype
numeric_df = df.select_dtypes(include='number')

In [12]:
# Mean imputation: fill each numeric column's missing entries with that
# column's mean (NaN values are skipped when the mean is computed).
# NOTE(review): the means are taken over every row of df, i.e. before the
# train/test split performed later in the notebook, so rows that end up in
# the test set contribute to the fill values.
column_means = numeric_df.mean()
numeric_df_imputed = numeric_df.fillna(column_means)

In [13]:
# Names of the numeric columns, as a plain Python list
numeric_col_list = numeric_df.columns.tolist()

In [14]:
# Substitute the original numeric columns with the imputed columns.
# This overwrites df in place, so the NaN counts reported by the earlier
# df.isnull().sum() cell no longer describe df once this line has run.
df[numeric_col_list] = numeric_df_imputed

In [15]:
# Preview the first ten rows after imputation
df.head(n=10)

Unnamed: 0,url,Sentiment Score,Sentiment Magnitude,url_ending,category
0,google.com,0.6,1.3,com,Search Engines
1,googleapis.com,-0.5,3.5,com,Content Delivery Networks
2,apple.com,0.1,11.4,com,Computer and Internet Info
3,icloud.com,0.171429,3.057143,com,Online Storage and Backup
4,facebook.com,-0.2,0.2,com,Social Networking
5,youtube.com,0.5,0.5,com,Streaming Media
6,googletagservices.com,0.171429,3.057143,com,Web Advertisements
7,amazon.com,0.4,4.2,com,Shopping
8,sc-static.net,0.171429,3.057143,net,Content Delivery Networks
9,t.co,0.3,0.3,co,Internet Communications and Telephony


In [16]:
# Uncomment the following line to inspect the processed data
# numeric_df_imputed.head(20)

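Comment:
Mean imputation is unlikely to yield good predictions here: 993 of the 1,000 rows are missing both sentiment features, so after imputation those two columns are almost constant.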

In [17]:
### One-hot Encoding

In [18]:
# Collect the object-typed columns, then discard the url and the label,
# which are not features. list.remove raises ValueError if either is absent.
object_columns = df.select_dtypes(include='object').columns
to_encode = object_columns.tolist()
for non_feature in ('url', 'category'):
    to_encode.remove(non_feature)

In [19]:
# Count the distinct values in each column that will be encoded (from lab3)
df.loc[:, to_encode].nunique()

url_ending    18
dtype: int64

In [20]:
# Only the ten most frequent url endings get an indicator column, which keeps
# the number of one-hot columns (and the computation) small.
# value_counts() sorts by descending frequency, so the first ten index
# entries are the ten most common endings.
ending_counts = df['url_ending'].value_counts()
top_10_ending = ending_counts.index[:10].tolist()
top_10_ending

['com', 'org', 'gov', 'net', 'fm', 'tv', 'us', 'edu', 'co', 'ly']

In [21]:
# One-hot encode 'url_ending': one 0/1 indicator column per frequent ending.
# Rows whose ending is not in top_10_ending get 0 in every indicator column.
#
# The guard makes this cell safe to re-run. Previously the source column was
# dropped in place, so running the cell a second time failed with a KeyError
# on df['url_ending'].
indicator_columns = ['url_ending_' + value for value in top_10_ending]

if 'url_ending' in df.columns:
    for value in top_10_ending:
        df['url_ending_' + value] = np.where(df['url_ending'] == value, 1, 0)

    # Remove the original column from the df
    df = df.drop(columns='url_ending')
elif not set(indicator_columns).issubset(df.columns):
    # Neither the source column nor the finished encoding is present:
    # fail loudly rather than continue with missing features.
    raise KeyError("'url_ending' is missing and its indicator columns have not been built")

In [22]:
# Preview the first ten rows after one-hot encoding
df.head(n=10)

Unnamed: 0,url,Sentiment Score,Sentiment Magnitude,category,url_ending_com,url_ending_org,url_ending_gov,url_ending_net,url_ending_fm,url_ending_tv,url_ending_us,url_ending_edu,url_ending_co,url_ending_ly
0,google.com,0.6,1.3,Search Engines,1,0,0,0,0,0,0,0,0,0
1,googleapis.com,-0.5,3.5,Content Delivery Networks,1,0,0,0,0,0,0,0,0,0
2,apple.com,0.1,11.4,Computer and Internet Info,1,0,0,0,0,0,0,0,0,0
3,icloud.com,0.171429,3.057143,Online Storage and Backup,1,0,0,0,0,0,0,0,0,0
4,facebook.com,-0.2,0.2,Social Networking,1,0,0,0,0,0,0,0,0,0
5,youtube.com,0.5,0.5,Streaming Media,1,0,0,0,0,0,0,0,0,0
6,googletagservices.com,0.171429,3.057143,Web Advertisements,1,0,0,0,0,0,0,0,0,0
7,amazon.com,0.4,4.2,Shopping,1,0,0,0,0,0,0,0,0,0
8,sc-static.net,0.171429,3.057143,Content Delivery Networks,0,0,0,1,0,0,0,0,0,0
9,t.co,0.3,0.3,Internet Communications and Telephony,0,0,0,0,0,0,0,0,1,0


## 3. Modeling

In [23]:
# Target: the website category. Inputs: every column except the url and the target.
y = df['category']
X = df.drop(columns=['url', 'category'])

In [24]:
# Train-test split: hold out 40% of the rows for testing.
# The fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [25]:
# Alternatives (considering that there are >80 output classes):
# Neural networks

### Train two Decision Tree Classifiers

In [26]:
# We have assigned a default value of 'entropy' to the crit parameter.
# scikit-learn's default value for min_samples_leaf is 1.
def train_test_DT(X_train, X_test, y_train, y_test, depth, crit='entropy', random_state=42):
    """Fit a decision tree on the training split and score it on the test split.

    Parameters
    ----------
    X_train, X_test : feature tables for the training and test rows.
    y_train, y_test : class labels for the training and test rows.
    depth : value passed to the tree as ``max_depth``.
    crit : value passed to the tree as ``criterion`` (default ``'entropy'``).
    random_state : seed passed to the tree (default 42). Without a fixed seed
        scikit-learn breaks ties between equally good splits at random, so the
        reported accuracy could change from one run to the next.

    Returns
    -------
    The accuracy score of the fitted tree's predictions on ``X_test``.
    """
    # Build the classifier with a fixed seed so repeated runs give the same tree
    model = DecisionTreeClassifier(
        max_depth=depth,
        criterion=crit,
        random_state=random_state,
    )

    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Make predictions on the test data
    class_label_predictions = model.predict(X_test)

    # Compute the accuracy
    acc_score = accuracy_score(y_test, class_label_predictions)

    return acc_score

In [27]:
# Two tree depths to compare: a shallow tree and a very deep one
depth1, depth2 = 8, 64
max_depth_range = [depth1, depth2]

# Accuracy on the test split for each depth, in the order of max_depth_range
acc = []
for md in max_depth_range:
    score = train_test_DT(X_train, X_test, y_train, y_test, depth=md)
    acc.append(float(score))
    print('Max Depth=', md, ', accuracy score: ', score, sep='')

Max Depth=8, accuracy score: 0.2425
Max Depth=64, accuracy score: 0.2425


## 4. Evaluation

In [28]:
# Possible metrics: accuracy, precision, recall, F1, ROC AUC

In [29]:
# Preview the first ten rows of the final modeling table
df.head(n=10)

Unnamed: 0,url,Sentiment Score,Sentiment Magnitude,category,url_ending_com,url_ending_org,url_ending_gov,url_ending_net,url_ending_fm,url_ending_tv,url_ending_us,url_ending_edu,url_ending_co,url_ending_ly
0,google.com,0.6,1.3,Search Engines,1,0,0,0,0,0,0,0,0,0
1,googleapis.com,-0.5,3.5,Content Delivery Networks,1,0,0,0,0,0,0,0,0,0
2,apple.com,0.1,11.4,Computer and Internet Info,1,0,0,0,0,0,0,0,0,0
3,icloud.com,0.171429,3.057143,Online Storage and Backup,1,0,0,0,0,0,0,0,0,0
4,facebook.com,-0.2,0.2,Social Networking,1,0,0,0,0,0,0,0,0,0
5,youtube.com,0.5,0.5,Streaming Media,1,0,0,0,0,0,0,0,0,0
6,googletagservices.com,0.171429,3.057143,Web Advertisements,1,0,0,0,0,0,0,0,0,0
7,amazon.com,0.4,4.2,Shopping,1,0,0,0,0,0,0,0,0,0
8,sc-static.net,0.171429,3.057143,Content Delivery Networks,0,0,0,1,0,0,0,0,0,0
9,t.co,0.3,0.3,Internet Communications and Telephony,0,0,0,0,0,0,0,0,1,0
