* Imports

In [88]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
import re

* Reading dataset

In [89]:
df = pd.read_csv("data.csv")
mes = list(df["message"])

In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   message  500 non-null    object
 1   fingers  500 non-null    int64 
 2   tail     500 non-null    object
 3   species  500 non-null    object
dtypes: int64(1), object(3)
memory usage: 15.8+ KB


#### Vocabulary Creation

In [91]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(df.iloc[:,0 : len(df.columns)-1], df["species"], test_size=0.1, random_state=42)

mes1 = X_train1['message']


Note - Here we did the train test split before as to make sure that our model's vocabulary do not contains any unseen words later on. Also we won't be using this splited dataset after this

In [92]:
# Tokenizing if needed
def tokenize(text):
    text = re.sub(r'\W+', ' ', text)
    return text.lower().split()

# Word Frequency
def get_word_frequencies(messages):
    all_words = []
    for message in messages:
        all_words.extend(tokenize(message))
    
    word_counts = Counter(all_words)
    return word_counts
word_frequencies = get_word_frequencies(mes1)


top_n = 175
most_common_words = word_frequencies.most_common(top_n)

print("Top vocabulary based on frequency:")
for word, freq in most_common_words:
    print(f"{word}: {freq}")



Top vocabulary based on frequency:
terranix: 26
nebuz: 24
vortex: 23
floraz: 23
nebulax: 23
pluvia: 23
cosmix: 22
sirenix: 22
solarix: 21
ragex: 21
virtua: 21
pollex: 21
ufox: 21
fearix: 21
lazeron: 20
cryptoz: 20
quasar: 20
faerix: 20
shamex: 20
ventus: 20
novax: 20
celestar: 20
rootix: 20
glixx: 19
quantix: 19
quantaz: 19
neuraz: 19
gryphox: 19
herba: 19
dredax: 18
orbitaz: 18
novum: 18
gaiax: 18
herox: 18
titanos: 18
astron: 18
biomar: 18
awezom: 18
joyzor: 18
elvex: 17
pulsar: 17
anxius: 17
zenox: 17
luvium: 17
leafon: 17
pridius: 17
calmox: 17
excitar: 17
shockus: 17
faunar: 17
kometa: 16
cyclopix: 16
stardux: 16
warpz: 16
hopium: 16
aeon: 16
xeno: 16
mechan: 16
codex: 16
nanobyt: 16
synapz: 16
robonix: 16
biotex: 16
petros: 16
dronix: 16
angstix: 16
blissam: 16
circux: 16
euphorix: 15
arbor: 15
galaxum: 15
pulsox: 15
nimbus: 15
ekstax: 15
empathix: 15
fenix: 15
sorrowz: 15
aviana: 14
solux: 14
unikor: 14
meteorn: 14
ekos: 14
digitron: 14
magix: 14
terram: 14
cybron: 14
datax: 14


* Limiting the vocabulary to 175 words not only considers the most frequently used words, but also generalizes for with more extra words

In [93]:
vocab = []
for word, freq in most_common_words:
    vocab.append(word)

vocab = sorted(vocab)

In [94]:
# Vocabulary
vocab

['aeon',
 'aeop',
 'algorix',
 'angstix',
 'anxius',
 'aquos',
 'arbor',
 'ariana',
 'asbron',
 'astron',
 'astrrn',
 'aviana',
 'awezom',
 'beastij',
 'beastix',
 'biomar',
 'biotex',
 'blapoz',
 'blatoz',
 'blissam',
 'calmox',
 'celestar',
 'celwstar',
 'centarex',
 'cewestar',
 'circum',
 'circux',
 'codex',
 'cosmix',
 'cryptoz',
 'cybrex',
 'cybron',
 'cyclopix',
 'datax',
 'deitax',
 'digitron',
 'disgue',
 'disgux',
 'djonix',
 'drakos',
 'dredax',
 'dronix',
 'egphorix',
 'ekos',
 'ekstax',
 'elvex',
 'elwex',
 'empathix',
 'epikoz',
 'euphorix',
 'excitar',
 'fabulon',
 'faemix',
 'faerix',
 'faoulon',
 'faunar',
 'fearix',
 'fenix',
 'flodan',
 'floran',
 'floraz',
 'floren',
 'foliar',
 'folwar',
 'furio',
 'gaiax',
 'galaxum',
 'gleex',
 'glixx',
 'gmblax',
 'goblax',
 'goblex',
 'gryphov',
 'gryphox',
 'haunar',
 'herba',
 'herox',
 'herux',
 'holox',
 'hopium',
 'hosmix',
 'insectus',
 'joyzor',
 'kkmeta',
 'kometa',
 'kometk',
 'kometo',
 'krakos',
 'lazeron',
 'leafon'

In [95]:
word_to_index = {word: idx for idx, word in enumerate(vocab)}

print("Word to Index Mapping:")
for word, index in word_to_index.items():
    print(f"{word}: {index}")

Word to Index Mapping:
aeon: 0
aeop: 1
algorix: 2
angstix: 3
anxius: 4
aquos: 5
arbor: 6
ariana: 7
asbron: 8
astron: 9
astrrn: 10
aviana: 11
awezom: 12
beastij: 13
beastix: 14
biomar: 15
biotex: 16
blapoz: 17
blatoz: 18
blissam: 19
calmox: 20
celestar: 21
celwstar: 22
centarex: 23
cewestar: 24
circum: 25
circux: 26
codex: 27
cosmix: 28
cryptoz: 29
cybrex: 30
cybron: 31
cyclopix: 32
datax: 33
deitax: 34
digitron: 35
disgue: 36
disgux: 37
djonix: 38
drakos: 39
dredax: 40
dronix: 41
egphorix: 42
ekos: 43
ekstax: 44
elvex: 45
elwex: 46
empathix: 47
epikoz: 48
euphorix: 49
excitar: 50
fabulon: 51
faemix: 52
faerix: 53
faoulon: 54
faunar: 55
fearix: 56
fenix: 57
flodan: 58
floran: 59
floraz: 60
floren: 61
foliar: 62
folwar: 63
furio: 64
gaiax: 65
galaxum: 66
gleex: 67
glixx: 68
gmblax: 69
goblax: 70
goblex: 71
gryphov: 72
gryphox: 73
haunar: 74
herba: 75
herox: 76
herux: 77
holox: 78
hopium: 79
hosmix: 80
insectus: 81
joyzor: 82
kkmeta: 83
kometa: 84
kometk: 85
kometo: 86
krakos: 87
lazeron:

#### Message Vectorisation in One Hot Encoding

In [96]:
def tokenize(text):
    text = re.sub(r'\W+', ' ', text)
    return text.lower().split()

def one_hot_encode_message(message, vocabulary, word_to_index):
    tokenized_message = tokenize(message)
    one_hot_vector = np.zeros(len(vocabulary), dtype=int)
    
    for word in tokenized_message:
        if word in word_to_index:
            index = word_to_index[word]
            one_hot_vector[index] = 1
    
    return one_hot_vector

one_hot_encoded_messages = [one_hot_encode_message(msg, vocab, word_to_index) for msg in mes]

for i, one_hot_vector in enumerate(one_hot_encoded_messages):
    print(f"Message {i+1}: {mes[i]}")
    print(f"One-Hot Encoding: {one_hot_vector}\n")


Message 1: pluvia arbor aquos
One-Hot Encoding: [0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

Message 2: cosmix xeno nebuz odbitaz
One-Hot Encoding: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]

Message 3: solarix glixx novum galaxum quasar
One-Hot Encoding: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [97]:
# Encoding the dataset
df_encoded = pd.DataFrame(one_hot_encoded_messages)

df_encoded.columns = [word for word in vocab ]

In [98]:
df_encoded.iloc[0, :]

aeon       0
aeop       0
algorix    0
angstix    0
anxius     0
          ..
yosmix     0
zenox      0
zenrx      0
zephyr     0
zorp       0
Name: 0, Length: 175, dtype: int32

In [99]:
# Concatinating messages with other features
df_final = pd.concat([df_encoded, df.iloc[:,1:]], axis=1)

In [100]:
# Binary Encoding
d = pd.get_dummies(df_final["tail"])
d.drop(columns=["no"],inplace=True)
df_final["tail"] = d["yes"]

In [101]:
df_final

Unnamed: 0,aeon,aeop,algorix,angstix,anxius,aquos,arbor,ariana,asbron,astron,...,xeno,yebuz,yosmix,zenox,zenrx,zephyr,zorp,fingers,tail,species
0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,4,False,Aquari
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,5,True,Zorblax
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5,True,Zorblax
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,2,True,Florian
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4,False,Faerix
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,False,Emotivor
496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4,False,Quixnar
497,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,6,True,Zorblax
498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,True,Florian


In [102]:
# Label encoding of Species
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_final['species'] = le.fit_transform(df_final['species'])

In [103]:
df_final.head()

Unnamed: 0,aeon,aeop,algorix,angstix,anxius,aquos,arbor,ariana,asbron,astron,...,xeno,yebuz,yosmix,zenox,zenrx,zephyr,zorp,fingers,tail,species
0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,4,False,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,5,True,9
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5,True,9
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,2,True,4
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4,False,3


In [104]:
le.classes_

array(['Aquari', 'Cybex', 'Emotivor', 'Faerix', 'Florian', 'Mythron',
       'Nexoon', 'Quixnar', 'Sentire', 'Zorblax'], dtype=object)

In [105]:
len(df_final.columns)

178

* Final X and y

In [106]:
X = df_final.iloc[:, 0:len(df_final.columns)-1]
y = df_final.species

In [107]:
X.head()

Unnamed: 0,aeon,aeop,algorix,angstix,anxius,aquos,arbor,ariana,asbron,astron,...,warpz,xeno,yebuz,yosmix,zenox,zenrx,zephyr,zorp,fingers,tail
0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,4,False
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,5,True
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5,True
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,2,True
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,False


In [108]:
y.head()

0    0
1    9
2    9
3    4
4    3
Name: species, dtype: int32

### Applying Machine learning algorithms

In [109]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

* Train test split

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [111]:
X_train.iloc[0, :]

aeon          0
aeop          0
algorix       0
angstix       0
anxius        0
           ... 
zenrx         0
zephyr        0
zorp          0
fingers       2
tail       True
Name: 72, Length: 177, dtype: object

In [112]:
y_train

72     2
182    7
131    9
410    4
193    4
      ..
106    7
270    2
348    9
435    4
102    8
Name: species, Length: 450, dtype: int32

In [113]:
# Parameter Grid
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, 30],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

# Model's parameters
print("Best Parameters:", grid_search.best_params_)
best_params = grid_search.best_params_

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best Parameters: {'max_depth': 30, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}


In [114]:
# Model
rf_classifier = RandomForestClassifier(**best_params, random_state=42)

#Train
rf_classifier.fit(X_train, y_train)

# Predictions
y_pred = rf_classifier.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9

* eXtreme Gradient Boosting

In [115]:
# Parameter Grid
param_grid = {
    'n_estimators': [100, 200,],
    'max_depth': [3, 5, 8, 10],
    'learning_rate': [0.01, 0.1,],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1]
}

grid_search_xgb = GridSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'), param_grid, cv=5, n_jobs=-1, verbose=2)

# Grid search
grid_search_xgb.fit(X_train, y_train)

# Best parameters
best_params_xgb = grid_search_xgb.best_params_
print("Best Parameters from Grid Search for XGBoost:", best_params_xgb)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


Parameters: { "use_label_encoder" } are not used.



Best Parameters from Grid Search for XGBoost: {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 200, 'subsample': 0.6}


In [116]:
# Model
xgb_best_model = XGBClassifier(**best_params_xgb, use_label_encoder=False, eval_metric='mlogloss')

# Train
xgb_best_model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.



In [117]:
# Prediction
y_pred_xgb = xgb_best_model.predict(X_test)

# Evaluation
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy with Best Params for XGBoost: {accuracy_xgb:.4f}")

# Detailed classification report (precision, recall, f1-score for each class)
print("\nClassification Report for XGBoost:\n", classification_report(y_test, y_pred_xgb))

Accuracy with Best Params for XGBoost: 0.9400

Classification Report for XGBoost:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         6
           2       0.80      1.00      0.89         4
           3       1.00      0.90      0.95        10
           4       1.00      1.00      1.00         3
           5       0.83      1.00      0.91         5
           6       1.00      1.00      1.00         3
           7       0.75      1.00      0.86         3
           8       1.00      0.83      0.91         6
           9       1.00      0.88      0.93         8

    accuracy                           0.94        50
   macro avg       0.94      0.96      0.94        50
weighted avg       0.95      0.94      0.94        50

