## Importing Libraries for Data Cleaning Process

In [1]:
import re
from functools import lru_cache

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

## Data Reading

In [2]:
# Load the website-classification dataset from a CSV file in the working directory
data=pd.read_csv("website_classification.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,website_url,cleaned_website_text,Category
0,0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel


## Data Preprocessing

In [4]:
## Information about data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408 entries, 0 to 1407
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            1408 non-null   int64 
 1   website_url           1408 non-null   object
 2   cleaned_website_text  1408 non-null   object
 3   Category              1408 non-null   object
dtypes: int64(1), object(3)
memory usage: 44.1+ KB


In [5]:
## Dropping unwanted columns: the leftover index column and the URL are not used as features
data=data.drop(["Unnamed: 0","website_url"],axis=1)

In [6]:
data.head()

Unnamed: 0,cleaned_website_text,Category
0,official site good hotel accommodation big sav...,Travel
1,expedia hotel book sites like use vacation wor...,Travel
2,tripadvisor hotel book sites like previously d...,Travel
3,cheap flights search compare flights momondo f...,Travel
4,bot create free account create free account si...,Travel


In [7]:
## Remove fully duplicated rows (same text and same category); this takes the data from 1408 to 1375 rows
data=data.drop_duplicates()

In [8]:
## Set Feature and Target Values 
feature=data['cleaned_website_text']
target=data['Category']

In [9]:
print("feature:", feature.shape)
print("target:", target.shape)

feature: (1375,)
target: (1375,)


In [10]:
## Cleaning the feature column
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop-word set, loaded once and then cached."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _word_lemmatizer():
    """Return a shared WordNetLemmatizer instance, created once and then cached."""
    return WordNetLemmatizer()


def cleaning_processing(sentence):
    """Normalise one document into a space-separated string of lemmatized words.

    Steps: tokenize with NLTK, lowercase each token, strip every character
    that is not an ASCII letter, drop empty tokens and English stop words,
    then lemmatize what remains.

    Parameters
    ----------
    sentence : str
        Raw text of a single document.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces (empty string if no token survives).
    """
    # Fetched from cached helpers so the stop-word list and lemmatizer are not
    # rebuilt for every row when this function is used with Series.apply.
    stop_words = _english_stop_words()
    lemmatizer = _word_lemmatizer()
    cleaned_words = []

    for token in word_tokenize(sentence):
        # Lowercase, then delete anything that is not an ASCII letter
        # (digits, punctuation, symbols). Markup is not parsed here.
        token = re.sub(r"[^a-zA-Z]", "", token.lower())

        # Skip tokens that became empty after stripping, and stop words.
        if token and token not in stop_words:
            cleaned_words.append(lemmatizer.lemmatize(token))

    return " ".join(cleaned_words)

In [11]:
# Run the cleaning function on every document; the result replaces the feature Series
feature = feature.apply(cleaning_processing)

In [12]:
print("feature:", feature.shape)
print("target:", target.shape)

feature: (1375,)
target: (1375,)


## Split the data into train & test sets

In [13]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is repeatable
x_train,x_test,y_train,y_test=train_test_split(feature,target,test_size=0.2, random_state=42)

In [14]:
print("X_train shape:", x_train.shape)
print("X_test shape:", x_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (1100,)
X_test shape: (275,)
y_train shape: (1100,)
y_test shape: (275,)


## Transform the features into vectors using TF-IDF

In [15]:
# binary=True makes the term-frequency part 0/1 (term present or not); IDF weighting is still applied
vectorizer = TfidfVectorizer(binary=True)
# Learn the vocabulary and IDF weights from the training split only
x_train_tfidf = vectorizer.fit_transform(x_train)
# Reuse the fitted vocabulary/IDF on the test split so no test information leaks into the features
x_test_tfidf = vectorizer.transform(x_test)

In [16]:
print("X_train shape:", x_train_tfidf.shape)
print("X_test shape:", x_test_tfidf.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (1100, 46150)
X_test shape: (275, 46150)
y_train shape: (1100,)
y_test shape: (275,)


## Importing the model library to perform classification

In [17]:
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

###  KNeighborsClassifier()

In [18]:
# k-nearest-neighbours classifier voting over the 3 closest training documents
classifier1=KNeighborsClassifier(n_neighbors=3)
classifier1.fit(x_train_tfidf,y_train)
predict1=classifier1.predict(x_test_tfidf)
# Fraction of test documents whose predicted category matches the true one
accuracy1 = accuracy_score(y_test, predict1)

###  DecisionTreeClassifier()

In [19]:
# Decision tree with default hyperparameters. random_state is fixed because the
# tree's tie-breaking between candidate splits is randomised; without a seed the
# fitted tree and its accuracy change from run to run. The seed matches the one
# used for the train/test split.
classifier2=DecisionTreeClassifier(random_state=42)
classifier2.fit(x_train_tfidf,y_train)
predict2=classifier2.predict(x_test_tfidf)
# Fraction of test documents whose predicted category matches the true one
accuracy2 = accuracy_score(y_test, predict2)

###  MultinomialNB()

In [20]:
# Multinomial naive Bayes with default hyperparameters, trained on the TF-IDF features
classifier3=MultinomialNB()
classifier3.fit(x_train_tfidf,y_train)
predict3=classifier3.predict(x_test_tfidf)
# Fraction of test documents whose predicted category matches the true one
accuracy3 = accuracy_score(y_test, predict3)

###  SVC()

In [21]:
# Support vector classifier with default hyperparameters, trained on the TF-IDF features
classifier4=SVC()
classifier4.fit(x_train_tfidf,y_train)
predict4=classifier4.predict(x_test_tfidf)
# Fraction of test documents whose predicted category matches the true one
accuracy4 = accuracy_score(y_test, predict4)

## Accuracy score

In [22]:
print("KNeighborsClassifier accuracy score: ",accuracy1)
print("DecisionTreeClassifier accuracy score: ",accuracy2)
print("MultinomialNB accuracy score: ",accuracy3)
print("SVC accuracy score: ",accuracy4)

KNeighborsClassifier accuracy score:  0.8072727272727273
DecisionTreeClassifier accuracy score:  0.5709090909090909
MultinomialNB accuracy score:  0.7818181818181819
SVC accuracy score:  0.8545454545454545


##  Confusion_matrix

In [23]:
print("KNeighborsClassifier")
print("KNeighborsClassifier accuracy score: ",accuracy1)
print(confusion_matrix(y_test, predict1))

KNeighborsClassifier
KNeighborsClassifier accuracy score:  0.8072727272727273
[[ 3  0  1  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0 13  1  3  1  0  0  0  0  0  0  0  0  0  0  1]
 [ 0  3  7  1  2  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  2  0 22  0  1  0  1  0  0  0  0  0  0  0  0]
 [ 0  3  4  0 14  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 26  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  1  0  0  1  0]
 [ 0  0  1  0  2  0  0 12  0  0  0  0  0  0  1  1]
 [ 0  0  0  0  2  0  0  0 13  0  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  0 21  0  0  1  0  1  0]
 [ 0  0  0  0  0  0  0  0  0  1 16  0  1  1  0  0]
 [ 0  0  0  0  0  0  0  0  0  1  1 14  0  0  0  0]
 [ 0  0  1  0  0  1  1  0  0  0  0  0  8  0  1  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  0  0 20  1  0]
 [ 0  0  1  0  0  0  0  0  0  0  0  0  0  0 15  0]
 [ 0  0  0  1  0  0  0  0  0  0  1  0  1  0  0 18]]


In [24]:
print("DecisionTreeClassifier")
print("DecisionTreeClassifier accuracy score: ",accuracy2)
print(confusion_matrix(y_test, predict2))

DecisionTreeClassifier
DecisionTreeClassifier accuracy score:  0.5709090909090909
[[ 0  0  0  0  0  0  0  1  0  0  1  1  0  0  1  1]
 [ 0  8  1  2  1  2  0  0  1  3  0  0  1  0  0  0]
 [ 0  1  6  0  0  1  0  4  0  0  0  0  0  0  1  0]
 [ 0  1  1 17  1  1  0  1  1  0  1  1  0  0  0  1]
 [ 0  0  2  1 12  0  0  0  0  1  2  0  0  0  2  1]
 [ 0  1  0  4  0 15  1  2  0  0  0  2  1  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  1  0  0  1  0]
 [ 1  0  0  0  0  1  0 13  0  0  1  0  0  1  0  0]
 [ 0  1  0  0  2  2  0  0  7  2  1  0  0  0  0  0]
 [ 0  2  1  2  2  0  0  0  1 16  0  0  0  0  0  0]
 [ 0  1  0  1  2  1  1  1  0  0  9  0  0  1  1  1]
 [ 0  0  0  1  0  1  0  0  0  0  0 12  0  0  2  0]
 [ 0  0  2  0  0  0  1  3  0  0  0  0  6  0  0  0]
 [ 0  0  0  0  0  3  0  3  0  1  1  0  0 14  0  0]
 [ 0  0  0  0  1  1  0  1  0  0  0  0  2  1 10  0]
 [ 0  1  0  0  1  4  0  1  0  0  2  0  0  0  0 12]]


In [25]:
print("MultinomialNB")
print("MultinomialNB accuracy score: ",accuracy3)
print(confusion_matrix(y_test, predict3))

MultinomialNB
MultinomialNB accuracy score:  0.7818181818181819
[[ 0  0  0  0  0  0  0  0  0  0  4  1  0  0  0  0]
 [ 0 15  0  1  2  0  0  0  0  0  0  0  0  0  0  1]
 [ 0  3  6  0  4  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  2  0 20  1  2  0  0  0  0  0  0  0  0  0  1]
 [ 0  1  0  0 20  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0 22  0  0  0  0  0  0  0  0  0  3]
 [ 0  1  0  0  1  0  0  0  0  0  0  0  0  0  1  0]
 [ 0  0  0  0  5  0  0  8  0  0  2  0  0  0  0  2]
 [ 0  0  0  0  2  0  0  0 13  0  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  0 21  1  0  0  0  1  0]
 [ 0  0  0  0  1  0  0  0  0  0 17  0  0  1  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  1 12  0  0  1  1]
 [ 0  2  0  0  0  1  0  0  0  0  0  0  7  0  2  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  0  0 20  1  0]
 [ 0  0  1  0  0  0  0  0  0  0  1  0  0  0 14  0]
 [ 0  0  0  0  0  0  0  0  0  0  1  0  0  0  0 20]]


In [26]:
print("SVC")
print("SVC accuracy score: ",accuracy4)
print(confusion_matrix(y_test, predict4))

SVC
SVC accuracy score:  0.8545454545454545
[[ 0  0  0  0  0  0  0  5  0  0  0  0  0  0  0  0]
 [ 0 17  0  1  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  1 10  0  1  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  2  0 21  0  2  0  1  0  0  0  0  0  0  0  0]
 [ 0  1  0  0 20  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  1  0 23  0  1  0  0  0  0  0  0  0  0]
 [ 0  1  0  0  1  0  0  0  0  0  0  0  0  0  1  0]
 [ 0  0  0  0  1  0  0 16  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0 14  0  0  0  0  0  0  0]
 [ 0  0  1  0  1  0  0  0  0 22  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  1  0  0 16  0  0  1  0  0]
 [ 0  0  1  0  0  0  0  0  0  0  1 14  0  0  0  0]
 [ 0  2  1  0  0  0  0  1  0  0  0  0  8  0  0  0]
 [ 0  1  1  0  0  0  0  0  0  0  0  0  0 20  0  0]
 [ 0  0  1  0  0  0  0  0  0  0  1  0  0  0 14  0]
 [ 0  0  0  0  0  0  0  0  0  0  1  0  0  0  0 20]]


##  SVC has the highest accuracy (≈0.85) of the four models — MultinomialNB reaches only ≈0.78 — so SVC is the best choice for this classification problem.