In [8]:
def plot_embedding_3d(X, y, title=None):
    import plotly.express as px 
    #--------------------------------------------------------------------------#
    # This section is not mandatory as its purpose is to sort the data by label
    # so, we can maintain consistent colors for digits across multiple graphs

    # Concatenate X and y arrays
    arr_concat=np.concatenate((X, y.reshape(y.shape[0],1)), axis=1)
    # Create a Pandas dataframe using the above array
    df=pd.DataFrame(arr_concat, columns=['x', 'y', 'z', 'label'])
    # Convert label data type from float to integer
    df['label'] = df['label'].astype(int)
    # Finally, sort the dataframe by label
    df.sort_values(by='label', axis=0, ascending=True, inplace=True)
    #--------------------------------------------------------------------------#

    # Create a 3D graph
    fig = px.scatter_3d(df, x='x', y='y', z='z', color=df['label'].astype(str), height=900, width=950)

    # Update chart looks
    fig.update_layout(title_text=title,
    showlegend=True,
    legend=dict(orientation="h", yanchor="top", y=0, xanchor="center", x=0.5),
    scene_camera=dict(up=dict(x=0, y=0, z=1),
    center=dict(x=0, y=0, z=-0.1),
    eye=dict(x=1.5, y=-1.4, z=0.5)),
    margin=dict(l=0, r=0, b=0, t=0),
    scene = dict(xaxis=dict(backgroundcolor='white',
    color='black',
    gridcolor='#f0f0f0',
    title_font=dict(size=10),
    tickfont=dict(size=10),
    ),
    yaxis=dict(backgroundcolor='white',
    color='black',
    gridcolor='#f0f0f0',
    title_font=dict(size=10),
    tickfont=dict(size=10),
    ),
    zaxis=dict(backgroundcolor='lightgrey',
    color='black',
    gridcolor='#f0f0f0',
    title_font=dict(size=10),
    tickfont=dict(size=10),
    )))
    # Update marker size
    fig.update_traces(marker=dict(size=3, line=dict(color='black', width=0.1)))
    fig.show()

def plot_embedding_2d(X, y, title=None):      
    x_min, x_max = np.min(X, axis=0), np.max(X, axis=0)    
    X = (X-x_min) / (x_max - x_min) 
    fig = plt.figure()    
    ax = fig.add_subplot(1,1,1)
    for i in range(X.shape[0]):
        ax.text(
        X[i,0],X[i,1],            
        str(y[i]),            
        color = plt.cm.Set1(y[i]/10),            
        fontdict = {'weight':'bold', 'size':9}
        )
    if title is not None:
        plt.title(title)
        plt.show()

In [9]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

aug = pd.read_csv('/kaggle/input/data-after-umap/augmented_umap_data.csv')
ori = pd.read_csv('/kaggle/input/data-after-umap/original_umap_data.csv')

new_train = pd.read_csv('/kaggle/input/new-augmented/augmented_umap_train_data.csv')
new_test = pd.read_csv('/kaggle/input/new-augmented/augmented_umap_test_data.csv')

new_ori_train = pd.read_csv('/kaggle/input/new-original/original_umap_train_data.csv')
new_ori_test = pd.read_csv('/kaggle/input/new-original/original_umap_test_data.csv')

In [10]:
X_aug = aug.iloc[:, :10]
y_aug = aug.iloc[:, 10]
X_ori = ori.iloc[:, :10]
y_ori = ori.iloc[:, 10]

X_new_train = new_train.iloc[:, :10]
y_new_train = new_train.iloc[:, 10]
X_new_test = new_test.iloc[:, :10]
y_new_test = new_test.iloc[:, 10]

X_new_ori_train = new_ori_train.iloc[:, :10]
y_new_ori_train = new_ori_train.iloc[:, 10]
X_new_ori_test = new_ori_test.iloc[:, :10]
y_new_ori_test = new_ori_test.iloc[:, 10]

X_aug.to_numpy().reshape((X_aug.shape[0], 10))
y_aug = np.ravel(y_aug.to_numpy().reshape((y_aug.shape[0], 1)))
X_ori.to_numpy().reshape((X_ori.shape[0], 10))
y_ori = np.ravel(y_ori.to_numpy().reshape((y_ori.shape[0], 1)))

X_new_train.to_numpy().reshape((X_new_train.shape[0], 10))
y_new_train = np.ravel(y_new_train.to_numpy().reshape((y_new_train.shape[0], 1)))
X_new_test.to_numpy().reshape((X_new_test.shape[0], 10))
y_new_test = np.ravel(y_new_test.to_numpy().reshape((y_new_test.shape[0], 1)))

X_new_ori_train.to_numpy().reshape((X_new_ori_train.shape[0], 10))
y_new_ori_train = np.ravel(y_new_ori_train.to_numpy().reshape((y_new_ori_train.shape[0], 1)))
X_new_ori_test.to_numpy().reshape((X_new_ori_test.shape[0], 10))
y_new_ori_test = np.ravel(y_new_ori_test.to_numpy().reshape((y_new_ori_test.shape[0], 1)))

In [11]:
# from sklearn.decomposition import PCA
# pca = PCA(n_components=3)
# X_ori = pca.fit_transform(X_train)
# X_aug = pca.fit_transform(X_test)
# plot_embedding_3d(X_ori, y_train, title=None)

In [12]:
from sklearn.model_selection import train_test_split
X_aug_train, X_aug_test, y_aug_train, y_aug_test = train_test_split(X_aug, y_aug, test_size=0.1, random_state=42)
X_ori_train, X_ori_test, y_ori_train, y_ori_test = train_test_split(X_ori, y_ori, test_size=0.1, random_state=42)

In [13]:
# X_train = X_aug_train
# X_test = X_aug_test
# y_train = y_aug_train
# y_test = y_aug_test

# X_train = X_ori_train
# X_test = X_ori_test
# y_train = y_ori_train
# y_test = y_ori_test

# X_train = X_new_train
# X_test = X_new_test
# y_train = y_new_train
# y_test = y_new_test

X_train = X_new_ori_train
X_test = X_new_ori_test
y_train = y_new_ori_train
y_test = y_new_ori_test

In [None]:
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

tree= DTC()

param_grid = {'criterion': ['gini', 'entropy'], 
              'min_samples_split': range(2, 11), 
              'min_samples_leaf': range(1, 11)
              }

grid = GridSearchCV(tree, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2)
grid.fit(X_train, y_train)

print("Best Parameters: ", grid.best_params_)
print("Best Score: ", grid.best_score_)

tree_best = grid.best_estimator_
y_pred = tree_best.predict(X_test)
print(classification_report(y_test, y_pred))

print(y_pred)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
[CV] END criterion=gini, min_samples_leaf=1, min_samples_split=2; total time=   0.9s
[CV] END criterion=gini, min_samples_leaf=1, min_samples_split=2; total time=   0.8s
[CV] END criterion=gini, min_samples_leaf=1, min_samples_split=2; total time=   0.8s
[CV] END criterion=gini, min_samples_leaf=1, min_samples_split=2; total time=   0.8s
[CV] END criterion=gini, min_samples_leaf=1, min_samples_split=2; total time=   0.8s
[CV] END criterion=gini, min_samples_leaf=1, min_samples_split=3; total time=   0.8s
[CV] END criterion=gini, min_samples_leaf=1, min_samples_split=3; total time=   0.8s
[CV] END criterion=gini, min_samples_leaf=1, min_samples_split=3; total time=   0.8s
[CV] END criterion=gini, min_samples_leaf=1, min_samples_split=3; total time=   0.8s
[CV] END criterion=gini, min_samples_leaf=1, min_samples_split=3; total time=   0.8s
[CV] END criterion=gini, min_samples_leaf=1, min_samples_split=4; total time=   0.8s
[C

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFC
forest = RFC(n_estimators=1000, random_state=0)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
print(classification_report(y_test, y_pred))
print(y_pred)

In [None]:
import lightgbm as lgb
model = lgb.LGBMClassifier()
model.fit(X_train, y_train)
from sklearn.metrics import classification_report
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(y_pred)

In [None]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=5,  
    max_depth=10, 
    learning_rate=0.1, 
    n_estimators=200  
)

xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
print(y_pred)

In [None]:
from sklearn .svm import SVC
from sklearn.preprocessing import StandardScaler as SC
from sklearn.metrics import accuracy_score
import pandas as pd
data1=pd.DataFrame(X_train)
data2=pd.DataFrame(X_test)
model=SVC(kernel='rbf',C=1000)
res=model.fit(data1,y_train)
res.predict(data2)
accuracy_score(res.predict(data2),y_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
print(y_pred)