In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import resample
# Imports
import tensorflow as tf
from tensorflow.keras.layers import Activation, Dense, LSTM, Bidirectional
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from imblearn.over_sampling import SMOTE
from collections import Counter
from google.colab import files
import pickle
import joblib
from keras.models import load_model

In [None]:
# Mount Google Drive (Colab only) so the files referenced under
# /content/drive/MyDrive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Input locations on the mounted Drive: a space-separated label file and a
# NumPy .npy array of embeddings.
labels_file = '/content/drive/MyDrive/y.txt'
embeddings_file = '/content/drive/MyDrive/x.npy'

In [None]:
def load_dataset_files(labels_file, embeddings_file):
    """Read the embedding array and its label table from disk.

    Parameters
    ----------
    labels_file : path to a space-separated text file without a header row;
        its columns are named "Complexity", "Node Number" and "Language".
    embeddings_file : path to a ``.npy`` file readable by ``np.load``.

    Returns
    -------
    tuple
        ``(embeddings, labels)`` where ``embeddings`` is whatever ``np.load``
        returns and ``labels`` is a ``pandas.DataFrame`` with the three
        columns listed above.
    """
    label_columns = ["Complexity", "Node Number", "Language"]
    embeddings = np.load(embeddings_file)
    labels = pd.read_csv(
        labels_file,
        sep=" ",
        header=None,
        names=label_columns,
    )
    return embeddings, labels

In [None]:
# x: embedding array loaded from the .npy file;
# y: label DataFrame with columns "Complexity", "Node Number", "Language".
x,y = load_dataset_files(labels_file,embeddings_file)

In [None]:
# Load the previously saved Keras model. The path is relative, so the file
# must be present in the current working directory.
model = load_model('try_model1.sav')

In [None]:
# Load the pickled scaler used to standardise new embeddings before prediction.
# The context manager guarantees the file handle is closed after loading
# (the previous bare open() call leaked it).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load scaler files that come from a trusted source.
with open('/content/scaler.pkl', 'rb') as scaler_file:
    sc = pickle.load(scaler_file, encoding='latin1')

In [None]:
# Print the loaded model's layers, output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 128, 128)         33792     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 10)                1290      
                                                                 
Total params: 133,898
Trainable params: 133,898
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Embeddings to run prediction on, loaded from the working directory
# (relative path).
test = np.load('x_new.npy')

In [None]:
# Wrap the embeddings in a DataFrame with one row per program and columns
# emb1..emb128, then display it.
test_df = pd.DataFrame(test,columns=[("emb"+str(num)) for num in range(1,129)]) # one column per embedding dimension
test_df

Unnamed: 0,emb1,emb2,emb3,emb4,emb5,emb6,emb7,emb8,emb9,emb10,...,emb119,emb120,emb121,emb122,emb123,emb124,emb125,emb126,emb127,emb128
0,0.021076,0.01483,0.006355,-0.012587,-0.012092,-0.009924,-0.004592,0.000111,-0.017801,-0.014113,...,0.010925,-0.010274,0.001091,0.006206,-0.002706,0.014532,-0.001408,0.007295,0.003955,0.010714
1,0.012322,0.013632,-0.001973,-0.005671,0.000956,-0.006729,-0.001627,-0.001882,-0.01638,-0.004526,...,0.002996,-0.001518,0.002344,0.00987,-0.000752,0.009357,-0.0068,-0.001993,0.004947,0.003175
2,0.015365,0.00343,-0.007474,0.004213,0.004251,-0.00982,0.002008,-0.008425,-0.013864,-0.002914,...,0.007031,9.7e-05,-0.008999,-0.002379,-0.008652,0.000502,0.00635,-0.003777,-0.007682,0.011202
3,0.014068,-0.001454,-0.003478,-0.008489,-0.007731,-0.003517,0.001984,-0.010024,-0.015823,-0.003569,...,0.007584,-0.006851,-0.002469,0.003041,-0.010411,-0.002885,0.000819,0.003836,-0.00195,0.002814
4,0.010489,-0.003113,-0.007001,-0.00191,0.00032,-0.001886,0.002533,-0.00727,-0.000168,-0.008044,...,0.008701,-0.003816,-0.006118,-0.001984,-0.005639,0.009326,0.005896,-0.000152,-0.00404,-0.003463
5,0.008928,-0.000462,-0.003368,-0.000461,-0.007164,0.002076,0.00099,-0.007319,0.004883,0.003731,...,0.004969,0.001808,0.001342,-0.006651,0.002375,-0.004722,-0.001487,-0.007256,0.004683,-0.004125


In [None]:
def method1(x, y, labels_file, n_samples=375, test_size=0.25,
            resample_seed=42, split_seed=21):
    """Build the oversampled, integer-encoded and scaled train/test split.

    Steps: attach the "Complexity" label to every embedding row, upsample each
    minority class (with replacement) to ``n_samples`` rows, encode the labels
    as integers, split into train/test and standardise the features with a
    scaler fitted on the training part only.

    Parameters
    ----------
    x : 2-D array-like with 128 columns (one embedding per row).
    y : unused; kept so existing callers keep working. The labels are read
        again from ``labels_file``.
    labels_file : path to the space-separated label file (no header row).
    n_samples : rows each minority class is upsampled to. The default 375 is
        the value the original code used "to match majority class";
        NOTE(review): confirm it equals the actual 'O(N)' row count.
    test_size : fraction of rows held out for the test set.
    resample_seed : random_state passed to ``resample``.
    split_seed : random_state passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(df, df_y, df_upsampled, new_df, X, Y, X_train, X_test, Y_train,
        Y_test)``. ``df_upsampled`` and ``new_df`` are the same object and
        already carry the integer-encoded "Complexity" column. ``X``/``Y`` are
        unscaled features and integer labels; ``X_train``/``X_test`` are
        standardised.

    Raises
    ------
    ValueError
        If a minority class has no rows in the data (it cannot be upsampled).

    Notes
    -----
    * The integer assigned to each class is its order of first appearance in
      the concatenated frame, so the concatenation order below must not change
      or the codes stop matching the trained model's outputs. The mapping is
      printed as a side effect.
    * NOTE(review): oversampling happens *before* the train/test split, so
      copies of the same minority row can land in both parts; the resulting
      test accuracy is likely optimistic.
    * The scaler fitted here is not returned.
    """
    majority_label = 'O(N)'
    # Order matters (see Notes): it fixes the class -> integer code mapping.
    minority_labels = ['O(N2)', 'O(NlogN)', 'O(logN)', 'O(N3)', 'O(Nd)',
                       'O(2n)', 'O(1)', 'O(N!)', 'O(sqrt(N))']
    feature_cols = ["emb" + str(num) for num in range(1, 129)]

    # One row per program: 128 embedding columns plus its complexity label.
    df = pd.DataFrame(x, columns=feature_cols)
    df_y = pd.read_csv(labels_file, sep=" ", header=None,
                       names=["Complexity", "Node Number", "Language"])
    df['Complexity'] = df_y['Complexity']

    df_majority = df[df['Complexity'] == majority_label]
    minority_frames = {label: df[df['Complexity'] == label]
                       for label in minority_labels}

    # resample() fails with an opaque error on an empty frame; fail early with
    # a message that names the missing classes instead.
    missing = [label for label, frame in minority_frames.items() if frame.empty]
    if missing:
        raise ValueError(
            f"Cannot upsample classes with no rows in the data: {missing}")

    # Upsample every minority class with replacement to n_samples rows.
    upsampled = [
        resample(frame,
                 replace=True,
                 n_samples=n_samples,
                 random_state=resample_seed)
        for frame in minority_frames.values()
    ]

    # Single concat; last minority class first and the majority class last,
    # which is the row order the original pairwise concatenation produced.
    df_upsampled = pd.concat(upsampled[::-1] + [df_majority])

    # Replace the string labels by integer codes (order of first appearance).
    codes, definitions = pd.factorize(df_upsampled['Complexity'])
    df_upsampled['Complexity'] = codes
    new_df = df_upsampled
    print(definitions)

    # Features and labels as NumPy arrays.
    X = new_df[feature_cols].values
    # The codes are already 0..k-1, so this encoder maps them to themselves;
    # it is kept so Y has exactly the type the original pipeline produced.
    Y = LabelEncoder().fit_transform(new_df['Complexity'])

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=split_seed)

    # Fit the scaler on the training rows only, then apply it to both parts.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return df, df_y, df_upsampled, new_df, X, Y, X_train, X_test, Y_train, Y_test

In [None]:
# Rebuild the oversampled dataset and its train/test split; method1 also
# prints the class label -> integer code mapping.
df, df_y, df_upsampled, new_df, X, Y, X_train, X_test, Y_train, Y_test = method1(x,y,labels_file)
# Uncomment one of the lines below to inspect an intermediate result:
#df
#df_y
#df_upsampled['Complexity'].value_counts()
#new_df

Index(['O(sqrt(N))', 'O(N!)', 'O(1)', 'O(2n)', 'O(Nd)', 'O(N3)', 'O(logN)',
       'O(NlogN)', 'O(N2)', 'O(N)'],
      dtype='object')


In [None]:
# Evaluate the loaded model on the held-out split and report the second
# metric (index 1 of the returned scores) as a percentage.
score = model.evaluate(X_test, Y_test, verbose=2)
metric_name, metric_value = model.metrics_names[1], score[1]
print(f"{metric_name}: {metric_value * 100:.2f}%")

30/30 - 2s - loss: 0.6620 - accuracy: 0.7793 - 2s/epoch - 81ms/step
accuracy: 77.93%


In [None]:
# Standardise the new embeddings with the scaler loaded from disk.
# Transforming the raw `test` array (rather than overwriting `test_df` with
# its own scaled values) keeps this cell idempotent: re-running it no longer
# scales the data twice. Passing a plain ndarray also avoids sklearn's
# "X has feature names" warning.
test_df = sc.transform(test)

  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [None]:
# Run the model on the scaled embeddings; the result has one row per input
# program and one column per class.
predictions = model.predict(test_df)



In [None]:
# Display the raw per-class scores returned by the model.
predictions

array([[0.3361897 , 0.00310076, 0.42631158, 0.00116578, 0.01621369,
        0.00199564, 0.12755889, 0.02996575, 0.03226751, 0.02523069],
       [0.36134464, 0.00291728, 0.40989882, 0.00109764, 0.01630774,
        0.0017826 , 0.12006225, 0.02962764, 0.0315083 , 0.02545315],
       [0.35755494, 0.00207582, 0.4133437 , 0.0008589 , 0.01533732,
        0.00130917, 0.13270207, 0.02664459, 0.02735222, 0.02282122],
       [0.35722992, 0.00223847, 0.4198606 , 0.00094705, 0.01591783,
        0.0014681 , 0.12054495, 0.02864613, 0.0289319 , 0.02421501],
       [0.35195544, 0.00247339, 0.42580003, 0.00103604, 0.01553075,
        0.00158883, 0.12026413, 0.02824358, 0.02895606, 0.02415164],
       [0.3665009 , 0.00218229, 0.41046497, 0.00092251, 0.01561904,
        0.00139655, 0.12363899, 0.02735359, 0.02839713, 0.023524  ]],
      dtype=float32)

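We used the softmax activation function in the model's output layer. Softmax does not output a single predicted class; it produces a discrete probability distribution over the 10 target classes. In plain terms, each row of `predictions` holds, for every class, the probability that the input belongs to that class, and the values in a row sum to 1. To get the predicted class for an input, take the index of the largest probability in its row (for example `np.argmax(predictions, axis=1)`) and map that index back to the corresponding complexity label.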