In [None]:
# installing necessary libraries
!pip install biopython

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


In [None]:
# importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from Bio.SeqUtils import ProtParam
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score, KFold

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, make_scorer, precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics as mt

import warnings
import sys
warnings.filterwarnings('ignore')

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# loading the dataset into the notebook-level DataFrame `df`
# NOTE(review): absolute path — presumably a Colab upload location; adjust when running elsewhere.
df = pd.read_csv('/content/phospho_dataset.csv')

In [None]:
df # display the loaded dataset (the rendered preview shows the columns Extracted_Sequence and Target)

Unnamed: 0,Extracted_Sequence,Target
0,kfledmsyltlkanc,0
1,snpsyrtstqevkle,0
2,plvdpsvygygvqkr,0
3,kinllihvgcalerm,1
4,rirpqdsycphcgyy,1
...,...,...
57230,talyftfssltsvgf,1
57231,lhfirfpscamhrfi,0
57232,rvlnrkssiiivnrn,0
57233,pdqappsrrrrsdwa,1


In [None]:
# add a 'length' column holding the number of characters of each sequence
df['length'] = df['Extracted_Sequence'].map(len)

In [None]:
# sampling dataset with replacement: six independent draws, each 20% of the rows.
# Every draw gets its own fixed random_state so the samples (and the metrics computed
# from them) are reproducible after a kernel restart.
data1, data2, data3, data4, data5, data6 = (
    df.sample(frac=0.2, replace=True, random_state=seed) for seed in range(6)
)

In [None]:
# function to compute amino acid composition
def compute_aa_composition_features(data):
    """Build the amino-acid composition matrix for the sequences in ``data``.

    For every entry of ``data['Extracted_Sequence']`` a ``ProteinAnalysis``
    object is created and the values of ``get_amino_acids_percent()`` are
    collected in the order the returned mapping yields them.

    Returns:
        ``np.ndarray`` built from one list of composition values per sequence.
    """
    composition_rows = [
        list(ProtParam.ProteinAnalysis(str(sequence)).get_amino_acids_percent().values())
        for sequence in data['Extracted_Sequence']
    ]
    return np.array(composition_rows)



In [None]:
# function to extract features using protparam (approach1)
def extract_feature(df):
  """Assemble the approach-1 feature matrix and the labels.

  Column blocks, left to right: TF-IDF weights of the character bigrams of
  ``df['Extracted_Sequence']``, the ``df['length']`` column, and the
  amino-acid composition from ``compute_aa_composition_features``.

  The vectorizer is fitted on the frame passed in, so the bigram columns
  depend on that frame's contents.

  Returns:
      (features, labels): a dense ``np.ndarray`` and ``df['Target']``.
  """
  composition = compute_aa_composition_features(df)
  # character-level bigrams of each sequence, TF-IDF weighted
  bigram_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 2)).fit_transform(df['Extracted_Sequence'])
  length_column = df['length'].values.reshape(-1, 1)
  feature_matrix = np.hstack([bigram_tfidf.toarray(), length_column, composition])
  return feature_matrix, df['Target']

# **ML**

In [None]:
# function to print evaluation metrics i.e. precision, recall, f1 score, accuracy with k-fold cross validation
def print_metrics(clf, x, y, cv):
  """Cross-validate ``clf`` and print the mean of each metric over the folds.

  Args:
      clf: estimator handed to ``cross_validate``.
      x: feature matrix.
      y: labels.
      cv: cross-validation splitter (or fold count); also echoed in the header line.

  Prints the average precision, recall, F1 and accuracy. All four come from
  one ``cross_validate`` run, so they are computed on the same fitted folds.
  Returns ``None``.
  """
  print("\nCross validation results by"+ str(cv)+"\n")
  scoring = {
      # accuracy is scored as accuracy; it was previously computed with 'f1_macro'
      'accuracy': make_scorer(accuracy_score),
      'precision': make_scorer(precision_score),
      'recall': make_scorer(recall_score),
      'f1_score': make_scorer(f1_score)
  }
  cross_val_results = cross_validate(clf, x, y, cv=cv, scoring=scoring)
  print("Average Precision:", cross_val_results['test_precision'].mean())
  print("Average Recall:", cross_val_results['test_recall'].mean())
  print("Average F1 Score:", cross_val_results['test_f1_score'].mean())
  print("Average Accuracy Score:", cross_val_results['test_accuracy'].mean())

In [None]:
# function to print evaluation metrics (accuracy, precision, recall, f1 score, confusion matrix, classification report) for a hold-out split
def metric_calculation(y_test,y_pred):
  """Print hold-out metrics comparing ``y_pred`` against ``y_test``.

  Prints, in order: accuracy, precision, recall, F1, the confusion matrix
  and the classification report, all computed via ``sklearn.metrics`` (``mt``).
  Returns ``None``.
  """
  print("train_test split cross validation\n")
  scalar_metrics = (
      ("Accuracy  :  ", mt.accuracy_score),
      ("Precision :  ", mt.precision_score),
      ("Recall    :  ", mt.recall_score),
      ("F1-score  :  ", mt.f1_score),
  )
  for label, metric_fn in scalar_metrics:
    print(label, metric_fn(y_test, y_pred))
  print("Confusion matrix      : \n ", mt.confusion_matrix(y_test, y_pred))
  print("Classification report : \n", mt.classification_report(y_test, y_pred))

In [None]:
# function to train the model
def train_model(data, target, model):
  """Fit ``model`` on an 80/20 hold-out split and print the test-set metrics.

  Args:
      data: feature matrix.
      target: labels, positionally matching the rows of ``data``.
      model: estimator exposing ``fit`` and ``predict``; it is fitted in place.

  The split uses ``test_size=0.2`` and ``random_state=42``. Results are
  printed through ``metric_calculation``; nothing is returned.
  """
  split = train_test_split(data, target, test_size=0.2, random_state=42)
  features_train, features_test, labels_train, labels_test = split
  model.fit(features_train, labels_train)
  predictions = model.predict(features_test)
  metric_calculation(labels_test, predictions)

In [None]:
# objects of different classifiers
rf = RandomForestClassifier()
nb = MultinomialNB()
mlp = MLPClassifier()
# "log_loss" is the current name of the logistic loss; the old alias "log" was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
sgd = SGDClassifier(loss="log_loss")
# probability=True enables predict_proba, which the soft-voting ensemble below needs
svc = SVC(probability=True)
lr = LogisticRegression()

In [None]:
# extracting features and target using protparam approach1 on the full dataset
# (binds the notebook-level names `data` and `target`, which later cells reuse)
data, target = extract_feature(df)
# wrap the ndarray in a DataFrame so it can be displayed and concatenated with the labels
data = pd.DataFrame(data)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,415,416,417,418,419,420,421,422,423,424
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.066667,0.066667,0.000000,0.000000,0.000000,0.066667,0.066667,0.000000,0.000000,0.066667
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.066667,0.066667,0.066667,0.066667,0.200000,0.133333,0.066667,0.000000,0.066667
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.133333,0.066667,0.066667,0.066667,0.000000,0.200000,0.000000,0.133333
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210399,...,0.066667,0.066667,0.000000,0.000000,0.066667,0.000000,0.000000,0.066667,0.000000,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.133333,0.066667,0.133333,0.066667,0.000000,0.000000,0.000000,0.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57230,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.243195,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.200000,0.200000,0.066667,0.000000,0.066667
57231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.066667,0.000000,0.066667,0.000000,0.133333,0.066667,0.000000,0.000000,0.000000,0.000000
57232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.200000,0.000000,0.000000,0.200000,0.133333,0.000000,0.133333,0.000000,0.000000
57233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.200000,0.066667,0.266667,0.133333,0.000000,0.000000,0.066667,0.000000


In [None]:
# saving dataset: features and labels joined column-wise, written to the working directory
protparam_dataset_1 = pd.concat([data,target], axis=1)
# to_csv is called with defaults, so the row index is written as the first column
protparam_dataset_1.to_csv('protparam_features_dataset_1.csv')

In [None]:
# extracting features from first approach (on sample data1) and training with random forest
# NOTE: each classifier below is evaluated on a different random sample (data1..data6),
# so the printed scores are not measured on the same rows.
data, target = extract_feature(data1)
data = pd.DataFrame(data)
train_model(data,target, rf)

train_test split cross validation

Accuracy  :   0.7480349344978166
Precision :   0.7437137330754352
Recall    :   0.7113783533765032
F1-score  :   0.7271867612293145
Confusion matrix      : 
  [[944 265]
 [312 769]]
Classification report : 
               precision    recall  f1-score   support

           0       0.75      0.78      0.77      1209
           1       0.74      0.71      0.73      1081

    accuracy                           0.75      2290
   macro avg       0.75      0.75      0.75      2290
weighted avg       0.75      0.75      0.75      2290



In [None]:
# extracting features from first approach (on sample data2) and training with naive bayes
# NOTE: rebinds the notebook-level `data` / `target`
data, target = extract_feature(data2)
train_model(data,target, nb)

train_test split cross validation

Accuracy  :   0.6576419213973799
Precision :   0.625
Recall    :   0.6044487427466151
F1-score  :   0.6145526057030483
Confusion matrix      : 
  [[881 375]
 [409 625]]
Classification report : 
               precision    recall  f1-score   support

           0       0.68      0.70      0.69      1256
           1       0.62      0.60      0.61      1034

    accuracy                           0.66      2290
   macro avg       0.65      0.65      0.65      2290
weighted avg       0.66      0.66      0.66      2290



In [None]:
# extracting features from first approach (on sample data3) and training with MLP
# NOTE: rebinds the notebook-level `data` / `target`
data, target = extract_feature(data3)
train_model(data,target, mlp)

train_test split cross validation

Accuracy  :   0.6877729257641921
Precision :   0.6859099804305284
Recall    :   0.6401826484018265
F1-score  :   0.6622579121398205
Confusion matrix      : 
  [[874 321]
 [394 701]]
Classification report : 
               precision    recall  f1-score   support

           0       0.69      0.73      0.71      1195
           1       0.69      0.64      0.66      1095

    accuracy                           0.69      2290
   macro avg       0.69      0.69      0.69      2290
weighted avg       0.69      0.69      0.69      2290



In [None]:
# extracting features from first approach (on sample data4) and training with SGD
# NOTE: rebinds the notebook-level `data` / `target`
data, target = extract_feature(data4)
train_model(data,target, sgd)

train_test split cross validation

Accuracy  :   0.5899563318777292
Precision :   0.5386638611257233
Recall    :   0.9429097605893186
F1-score  :   0.6856377636424507
Confusion matrix      : 
  [[ 327  877]
 [  62 1024]]
Classification report : 
               precision    recall  f1-score   support

           0       0.84      0.27      0.41      1204
           1       0.54      0.94      0.69      1086

    accuracy                           0.59      2290
   macro avg       0.69      0.61      0.55      2290
weighted avg       0.70      0.59      0.54      2290



In [None]:
# extracting features from first approach (on sample data5) and training with SVM
# NOTE: rebinds the notebook-level `data` / `target`
data, target = extract_feature(data5)
train_model(data,target, svc)

train_test split cross validation

Accuracy  :   0.5580786026200873
Precision :   0.7180327868852459
Recall    :   0.19126637554585152
F1-score  :   0.3020689655172414
Confusion matrix      : 
  [[1059   86]
 [ 926  219]]
Classification report : 
               precision    recall  f1-score   support

           0       0.53      0.92      0.68      1145
           1       0.72      0.19      0.30      1145

    accuracy                           0.56      2290
   macro avg       0.63      0.56      0.49      2290
weighted avg       0.63      0.56      0.49      2290



In [None]:
# extracting features from first approach (on sample data6) and training with Logistic Regression
# NOTE: rebinds the notebook-level `data` / `target`; the voting-classifier cell further down reuses them
data, target = extract_feature(data6)
train_model(data,target, lr)

train_test split cross validation

Accuracy  :   0.6777292576419214
Precision :   0.6932849364791288
Recall    :   0.6563573883161512
F1-score  :   0.6743159752868491
Confusion matrix      : 
  [[788 338]
 [400 764]]
Classification report : 
               precision    recall  f1-score   support

           0       0.66      0.70      0.68      1126
           1       0.69      0.66      0.67      1164

    accuracy                           0.68      2290
   macro avg       0.68      0.68      0.68      2290
weighted avg       0.68      0.68      0.68      2290



In [None]:
from sklearn.ensemble import VotingClassifier  # importing voting classifier

In [None]:
# (name, estimator) pairs used as base learners of the voting ensemble
estimator = [
    ('rf', rf),
    ('svc', svc),
    ('mlp', mlp),
    ('sgd', sgd),
    ('lr', lr),
    ('nb', nb),
]

In [None]:
# train a soft-voting ensemble over the base estimators
# NOTE(review): `data` / `target` are whatever the most recently executed feature-extraction
# cell left behind — in top-to-bottom order that is the data6 sample (the report's support of
# 2290 test rows matches it), not the complete dataset.
vc= VotingClassifier(estimators = estimator, voting ='soft')
train_model(data,target, vc)

train_test split cross validation

Accuracy  :   0.7362445414847162
Precision :   0.7417962003454232
Recall    :   0.7379725085910653
F1-score  :   0.7398794142980188
Confusion matrix      : 
  [[827 299]
 [305 859]]
Classification report : 
               precision    recall  f1-score   support

           0       0.73      0.73      0.73      1126
           1       0.74      0.74      0.74      1164

    accuracy                           0.74      2290
   macro avg       0.74      0.74      0.74      2290
weighted avg       0.74      0.74      0.74      2290



# **DL**

In [None]:
# importing libraries for DL
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn import decomposition as decom
from keras.metrics import Precision, Recall

In [None]:
# function to train DL model and print the evaluation metrics
def train_model_dl(data, target, opt, epoch, batch):
  """Train a dense binary classifier on an 80/20 hold-out split and print test metrics.

  Args:
      data: feature matrix; its column count sets the input width of the network.
      target: binary labels, positionally matching the rows of ``data``.
      opt: optimizer passed to ``model.compile`` (e.g. ``"adam"``).
      epoch: number of training epochs.
      batch: batch size used for training.

  Architecture: Dense(256, relu) -> tanh layers of 128, 64, 32, 16, 8, 4, 2
  units -> Dense(1, sigmoid), trained with binary cross-entropy.

  Prints accuracy, precision, recall and F1 on the test split; returns ``None``.
  """
  x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)
  model = Sequential()
  model.add(Dense(256, input_shape=(x_train.shape[1],), activation='relu'))
  # progressively narrower tanh layers, same widths and order as before
  for units in (128, 64, 32, 16, 8, 4, 2):
    model.add(Dense(units, activation='tanh'))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy', Precision(), Recall()])
  model.fit(x_train, y_train, epochs=epoch, batch_size=batch)
  # evaluate returns the loss followed by the metrics in the order given to compile
  _, accuracy, precision, recall = model.evaluate(x_test, y_test, verbose=0)
  # F1 is undefined when precision + recall is 0; report 0.0 explicitly instead of
  # relying on a bare except (local name `f1` avoids shadowing sklearn's f1_score).
  denominator = precision + recall
  f1 = (2 * precision * recall) / denominator if denominator > 0 else 0.0
  print("Accuracy  : ", accuracy)
  print("Precision : ", precision)
  print("Recall    : ", recall)
  print("F1-Score  : ", f1)

In [None]:
# extracting features from first approach (full dataset) and training with DL model:
# adadelta optimizer, 200 epochs, batch size 512
# NOTE: rebinds the notebook-level `data` / `target`
data, target = extract_feature(df)
train_model_dl(data, target, "adadelta", 200, 512)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [None]:
# function to split amino_acid_composition feature to multiple features
def expand_amino_acid_composition(data):
    """Return the ``'amino_acid_composition'`` entry of ``data`` as a ``pd.Series``.

    Intended for ``DataFrame.apply(..., axis=1)``: ``data`` is one row (or any
    mapping) whose ``'amino_acid_composition'`` value becomes the Series.
    """
    composition = data['amino_acid_composition']
    return pd.Series(composition)
# function to split secondary_structure_fraction feature to multiple features
def expand_secondary_structure_fraction(data):
    """Return the ``'secondary_structure_fraction'`` entry of ``data`` as a ``pd.Series``.

    Intended for ``DataFrame.apply(..., axis=1)``: ``data`` is one row (or any
    mapping) whose ``'secondary_structure_fraction'`` value becomes the Series.
    """
    fractions = data['secondary_structure_fraction']
    return pd.Series(fractions)

In [None]:
# function to extract features using protparam (approach2)
def get_features(data):
  """Compute the approach-2 ProtParam feature table.

  For each sequence in ``data['Extracted_Sequence']`` one ``ProteinAnalysis``
  object supplies, in column order: molecular weight, aromaticity,
  instability index, isoelectric point, the amino-acid composition (one
  column per key of ``get_amino_acids_percent()``) and the three values of
  ``secondary_structure_fraction()`` (columns
  ``secondary_structure_fraction_x`` / ``_y`` / ``_z``, in the order returned).

  If any ProtParam call raises for a sequence, that sequence gets a row of
  NaN so rows stay aligned with the labels, and a summary line is printed.
  (Previously the failure was swallowed and the preceding sequence's features
  were appended again, or a NameError occurred if the first sequence failed.)

  Returns:
      (features, labels): a DataFrame with a fresh positional index and one
      row per input sequence, and ``data['Target']``.
  """
  fraction_columns = (
      'secondary_structure_fraction_x',
      'secondary_structure_fraction_y',
      'secondary_structure_fraction_z',
  )
  records = []
  failed = 0
  for seq in data['Extracted_Sequence']:
    try:
      # one analysis object per sequence instead of one per feature
      analysis = ProtParam.ProteinAnalysis(str(seq))
      aa_comp = analysis.get_amino_acids_percent()
      mol_wt = analysis.molecular_weight()
      aroma = analysis.aromaticity()
      insta_ind = analysis.instability_index()
      isoelec_pnt = analysis.isoelectric_point()
      sec_struc_frac = analysis.secondary_structure_fraction()
    except Exception:
      # best effort: keep the row (as all-NaN) rather than reuse stale features
      failed += 1
      records.append({})
      continue
    record = {
        "molecular_weight": mol_wt,
        "aromaticity": aroma,
        "instability_index": insta_ind,
        "isoelectric_point": isoelec_pnt,
    }
    record.update(aa_comp)
    record.update(zip(fraction_columns, sec_struc_frac))
    records.append(record)
  if failed:
    # printed rather than warned: warnings are globally silenced in this notebook
    print("get_features: " + str(failed) + " of " + str(len(records)) + " sequences could not be analysed; their rows are NaN")
  features = pd.DataFrame(records)
  return features, data['Target']

In [None]:
# extracting features and target using protparam approach2 on the full dataset
# NOTE: rebinds the notebook-level `data` / `target`; all cells below operate on these
data, target  = get_features(df)
data

Unnamed: 0,molecular_weight,aromaticity,instability_index,isoelectric_point,A,C,D,E,F,G,...,Q,R,S,T,V,W,Y,secondary_structure_fraction_x,secondary_structure_fraction_y,secondary_structure_fraction_z
0,1776.0817,0.133333,43.886667,6.061162,0.066667,0.066667,0.066667,0.066667,0.066667,0.000000,...,0.000000,0.000000,0.066667,0.066667,0.000000,0.000000,0.066667,0.533333,0.200000,0.400000
1,1738.8498,0.066667,9.213333,6.005972,0.000000,0.000000,0.000000,0.133333,0.000000,0.000000,...,0.066667,0.066667,0.200000,0.133333,0.066667,0.000000,0.066667,0.266667,0.333333,0.333333
2,1677.8972,0.133333,11.880000,8.902649,0.000000,0.000000,0.066667,0.000000,0.000000,0.133333,...,0.066667,0.066667,0.066667,0.000000,0.200000,0.000000,0.133333,0.133333,0.400000,0.400000
3,1710.1163,0.000000,6.240000,8.231146,0.066667,0.066667,0.000000,0.066667,0.000000,0.066667,...,0.000000,0.066667,0.000000,0.000000,0.066667,0.000000,0.000000,0.466667,0.133333,0.400000
4,1858.0646,0.200000,76.100000,8.042447,0.000000,0.133333,0.066667,0.000000,0.000000,0.066667,...,0.066667,0.133333,0.066667,0.000000,0.000000,0.000000,0.200000,0.000000,0.333333,0.266667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57230,1640.8289,0.266667,24.740000,5.184876,0.066667,0.000000,0.000000,0.000000,0.200000,0.066667,...,0.000000,0.000000,0.200000,0.200000,0.066667,0.000000,0.066667,0.200000,0.266667,0.666667
57231,1875.2688,0.200000,88.020000,10.352349,0.066667,0.066667,0.000000,0.000000,0.200000,0.000000,...,0.000000,0.133333,0.066667,0.000000,0.000000,0.000000,0.000000,0.200000,0.133333,0.400000
57232,1782.0995,0.000000,24.740000,11.999968,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.200000,0.133333,0.000000,0.133333,0.000000,0.000000,0.133333,0.333333,0.400000
57233,1794.9280,0.066667,189.500000,11.523289,0.133333,0.000000,0.133333,0.000000,0.000000,0.000000,...,0.066667,0.266667,0.133333,0.000000,0.000000,0.066667,0.000000,0.133333,0.466667,0.066667


In [None]:
# saving dataset: approach-2 features and labels joined column-wise, written to the working directory
protparam_dataset_2 = pd.concat([data,target], axis=1)
# to_csv is called with defaults, so the row index is written as the first column
protparam_dataset_2.to_csv('protparam_features_dataset_2.csv')

In [None]:
# scaling the dataset: standardise every approach-2 feature column
# NOTE(review): the scaler is fitted on all rows, i.e. before train_model / train_model_dl
# perform their train/test split, so test rows influence the scaling statistics —
# consider fitting on the training split only.
sc = StandardScaler()
# the scaled array replaces `data`; column labels become positional integers
data = pd.DataFrame(sc.fit_transform(data))
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,0.848501,0.972899,-0.202420,-0.448158,-0.028373,1.983244,0.242326,-0.003878,0.988494,-0.909672,...,-0.729270,-0.967121,-0.566084,-0.025716,-0.890686,-0.302629,0.669395,1.766328,-0.965916,0.735952
1,0.569682,0.030539,-1.026712,-0.469666,-0.935374,-0.346494,-0.834830,0.893617,-0.602830,-0.909672,...,0.540207,-0.002245,0.657754,0.940067,0.218443,-0.302629,0.669395,-0.195765,-0.080520,0.226248
2,0.113224,0.972899,-0.963317,0.659175,-0.935374,-0.346494,0.242326,-0.901372,-0.602830,0.826194,...,0.540207,-0.002245,-0.566084,-0.991499,2.436699,-0.302629,1.990636,-1.176812,0.362178,0.735952
3,0.354504,-0.911822,-1.097398,0.397489,-0.028373,1.983244,-0.834830,-0.003878,-0.602830,-0.041739,...,-0.729270,-0.002245,-1.178003,-0.991499,0.218443,-0.302629,-0.651847,1.275804,-1.408613,0.735952
4,1.462449,1.915260,0.563391,0.323953,-0.935374,4.312982,0.242326,-0.901372,-0.602830,-0.041739,...,0.540207,0.962631,-0.566084,-0.991499,-0.890686,-0.302629,3.311878,-2.157859,-0.080520,-0.283457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57230,-0.164370,2.857620,-0.657595,-0.789649,-0.028373,-0.346494,-0.834830,-0.901372,4.171143,-0.041739,...,-0.729270,-0.967121,0.657754,1.905851,0.218443,-0.302629,0.669395,-0.686289,-0.523218,2.774769
57231,1.591286,1.915260,0.846766,1.224126,-0.028373,1.983244,-0.834830,-0.901372,4.171143,-0.909672,...,-0.729270,0.962631,-0.566084,-0.991499,-0.890686,-0.302629,-0.651847,-0.686289,-1.408613,0.735952
57232,0.893567,-0.911822,-0.657595,1.866207,-0.935374,-0.346494,-0.834830,-0.901372,-0.602830,-0.909672,...,-0.729270,1.927507,0.045835,-0.991499,1.327571,-0.302629,-0.651847,-1.176812,-0.080520,0.735952
57233,0.989636,0.030539,3.259260,1.680444,0.878628,-0.346494,1.319482,-0.901372,-0.602830,-0.909672,...,0.540207,2.892383,0.045835,-0.991499,-0.890686,2.869210,-0.651847,-1.176812,0.804876,-1.812569


In [None]:
# training random forest on the scaled features obtained from approach 2
train_model(data,target, rf)

In [None]:
# training MLP on the scaled features obtained from approach 2
train_model(data,target, mlp)

In [None]:
# training Logistic regression on the scaled features obtained from approach 2
train_model(data,target, lr)

train_test split cross validation

Accuracy  :   0.6372848781340089
Precision :   0.6249302325581395
Recall    :   0.6112829845313922
F1-score  :   0.61803127874885
Confusion matrix      : 
  [[3936 2016]
 [2136 3359]]
Classification report : 
               precision    recall  f1-score   support

           0       0.65      0.66      0.65      5952
           1       0.62      0.61      0.62      5495

    accuracy                           0.64     11447
   macro avg       0.64      0.64      0.64     11447
weighted avg       0.64      0.64      0.64     11447



In [None]:
# training SGD on the scaled features obtained from approach 2
train_model(data,target, sgd)

train_test split cross validation

Accuracy  :   0.6204245653883114
Precision :   0.5984926344638575
Recall    :   0.6358507734303913
F1-score  :   0.6166063707756111
Confusion matrix      : 
  [[3608 2344]
 [2001 3494]]
Classification report : 
               precision    recall  f1-score   support

           0       0.64      0.61      0.62      5952
           1       0.60      0.64      0.62      5495

    accuracy                           0.62     11447
   macro avg       0.62      0.62      0.62     11447
weighted avg       0.62      0.62      0.62     11447



In [None]:
# training the DL model with different optimizers, batch sizes and numbers of epochs

In [None]:
# rmsprop optimizer, 200 epochs, batch size 512 (argument order: opt, epoch, batch)
train_model_dl(data, target, "rmsprop", 200, 512)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [None]:
# Remaining (optimizer, epochs, batch size) combinations, executed in this fixed order.
dl_runs = [
    ("rmsprop", 200, 256),
    ("rmsprop", 200, 128),
    ("rmsprop", 200, 64),
    ("rmsprop", 100, 512),
    ("rmsprop", 100, 256),
    ("rmsprop", 100, 128),
    ("rmsprop", 100, 64),
    ("adam", 200, 512),
    ("adam", 200, 256),
    ("adam", 200, 128),
    ("adam", 200, 64),
    ("adam", 100, 512),
    ("adam", 100, 256),
    ("adam", 100, 128),
    ("adam", 100, 64),
]
for opt_name, n_epochs, batch_size in dl_runs:
  train_model_dl(data, target, opt_name, n_epochs, batch_size)