In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [39]:
df = pd.read_csv('/kaggle/input/innov8-2-0-dataset/data.csv')

In [40]:
df.head(10)

Unnamed: 0,message,fingers,tail,species
0,pluvia arbor aquos,4,no,Aquari
1,cosmix xeno nebuz odbitaz,5,yes,Zorblax
2,solarix glixx novum galaxum quasar,5,yes,Zorblax
3,arbor insectus pesros ekos dootix nimbus,2,yes,Florian
4,mermax drakos lorix epikoz deftax,4,no,Faerix
5,synapz algorix cryptoz digitvon algorix quanti...,3,no,Nexoon
6,datax quantaz cryptoz cybron quantix nanozom c...,6,yes,Nexoon
7,legax fenix herox sagaq,5,yes,Mythron
8,dredax ragex empathix anxius,2,no,Emotivor
9,joyzor ragex sadix angxtix dredax ekstax shame...,4,yes,Sentire


In [41]:
print(set(df['species']))
print(set(df['fingers']))

{'Florian', 'Nexoon', 'Zorblax', 'Emotivor', 'Sentire', 'Aquari', 'Quixnar', 'Cybex', 'Faerix', 'Mythron'}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}


In [42]:
tail = {
    'yes':1,
    'no':0
}

species = {
    'Emotivor' : 0,
    'Faerix' : 1,
    'Zorblax' : 2,
    'Florian' : 3,
    'Aquari':4,
    'Mythron':5,
    'Sentire':6,
    'Cybex' : 7,
    'Quixnar':8,
    'Nexoon':9,


}


label = set(df['species'])

df['tail'] = df['tail'].map(tail)
df['species'] = df['species'].map(species)
id2label = {id:label for id,label in enumerate(label)}
df.head()

Unnamed: 0,message,fingers,tail,species
0,pluvia arbor aquos,4,0,4
1,cosmix xeno nebuz odbitaz,5,1,2
2,solarix glixx novum galaxum quasar,5,1,2
3,arbor insectus pesros ekos dootix nimbus,2,1,3
4,mermax drakos lorix epikoz deftax,4,0,1


In [43]:
df['message'].head(20)

0                                    pluvia arbor aquos
1                             cosmix xeno nebuz odbitaz
2                    solarix glixx novum galaxum quasar
3              arbor insectus pesros ekos dootix nimbus
4                     mermax drakos lorix epikoz deftax
5     synapz algorix cryptoz digitvon algorix quanti...
6     datax quantaz cryptoz cybron quantix nanozom c...
7                               legax fenix herox sagaq
8                          dredax ragex empathix anxius
9     joyzor ragex sadix angxtix dredax ekstax shame...
10    anxius ragex shockus furio zenox empathix dred...
11                   gleex empathix calmox luvium furio
12    nanozom crypooz aeon lazeron techix codex tkch...
13                          zephyr nimbus faunar foliar
14                circux lazeron neuraz quaztaz cryptoz
15             deitax lorix goblax epikoz fenix titanos
16                         ventus seepod fsunar sporzom
17                          gaiax arbor faunar i

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

label_encoder = LabelEncoder()
df['tail_encoded'] = label_encoder.fit_transform(df['tail'])
df['species_encoded'] = label_encoder.fit_transform(df['species'])


vectorizer = TfidfVectorizer()
X_message = vectorizer.fit_transform(df['message'])


X = np.hstack((X_message.toarray(), df[['fingers', 'tail_encoded']].values))
y = df['species_encoded']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)


y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)




Accuracy: 0.82


In [45]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier 
from sklearn.decomposition import PCA


pca = PCA(n_components=200)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

svm = SVC(probability=True)
xgb = XGBClassifier()
gb = GradientBoostingClassifier()
rf = RandomForestClassifier(class_weight='balanced')
voting_clf = VotingClassifier(estimators=[('xgb',xgb),('rf',rf),('svm', svm), ('gb', gb)], voting='soft')


voting_clf.fit(X_train_pca, y_train)


y_pred = voting_clf.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.89


In [46]:
test = pd.read_csv('/kaggle/input/innov8-2-0-dataset/test.csv')
test.head(20)

Unnamed: 0,message,fingers,tail
0,iephyr terram nimbus terram faunar foliar,2,no
1,joyzor uleex luvium caloox shockus blissae,4,yes
2,aquos arbor ventuc,4,yes
3,nympha nympha epikoz nympha mythox mythox mythox,3,no
4,diitax sibenix fabulon,4,yes
5,legax herox elvex cyflopix cyclopix,5,yes
6,mythox lorix lorix fabulon deibax,4,no
7,gobnax elvex gryphox elvex cyclopix relikum el...,3,no
8,tristix anxius shockus,1,no
9,faunar floran faunar foliar biomar florzn petr...,2,yes


In [47]:
tail_ = {
    0:'no',
    1:'yes'
}

In [48]:

test['tail'] = test['tail'].map(tail)


test['tail_encoded'] = label_encoder.fit_transform(test['tail'])


X_test_message = vectorizer.transform(test['message'])


X_test = np.hstack((X_test_message.toarray(), test[['fingers', 'tail_encoded']].values))

X_test_ = pca.fit_transform(X_test)


y_test_pred = voting_clf.predict(X_test_)

print(y_test_pred)

test['species'] = [id2label[label] for label in y_test_pred]
test['tail'] = [tail_[t] for t in test['tail']]
test = test.drop(['tail_encoded'],axis=1)
test.to_csv('/kaggle/working/species.csv', index=False)
test.head()

[0 4 8 7 9 9 9 7 3 0 9 3 8 6 9 6 8 0 0 2 2 3 6 6 5 6 6 4 3 5 0 3 9 3 9 3 6
 8 6 6 3 6 9 6 2 4 8 6 6 8 4 2 4 8 3 4 4 3 9 6 0 0 7 3 4 9 4 9 6 6 6 3 4 2
 3 4 0 9 0 3 3 0 3 8 6 4 9 2 0 9 4 7 6 6 4 8 9 6 3 7 0 4 4 6 1 7 8 4 6 3 4
 9 6 9 6 9 9 0 9 6 9 4 0 0 2 8 0 0 6 4 0 8 9 6 4 3 0 4 6 9 0 3 2 4 2 8 3 9
 2 6 8 1 6 3 6 4 6 3 6 4 4 2 0 0 6 4 6 0 9 0 7 4 2 3 0 3 8 4 8 3 3 8 8 9 9
 2 3 0 1 6 4 2 9 8 0 0 6 7 0 6 0 7 3 8 9 6 7 2 6 9 9 9 2 0 0 3 8 9 2 6 6 3
 6 3 6 4 4 9 6 0 9 6 6 7 7 7 0 0 7 8 8 1 9 4 2 6 6 3 0 6 2 0 8 2 3 6 0 9 0
 8 9 9 2 9 8 1 1 0 6 2 0 9 3 9 9 6 6 0 0 0 4 9 4 0 3 0 0 0 8 0 9 3 3 0 9 6
 7 9 3]


Unnamed: 0,message,fingers,tail,species
0,iephyr terram nimbus terram faunar foliar,2,no,Florian
1,joyzor uleex luvium caloox shockus blissae,4,yes,Sentire
2,aquos arbor ventuc,4,yes,Faerix
3,nympha nympha epikoz nympha mythox mythox mythox,3,no,Cybex
4,diitax sibenix fabulon,4,yes,Mythron
