### Pre-Processing

In [37]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm.notebook import tqdm
import pickle
import warnings
import spacy
import nltk
import regex as re
from nltk.corpus import stopwords
from collections import Counter
from string import punctuation
from nltk.tokenize import word_tokenize
warnings.filterwarnings('ignore')
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_imbal_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score, recall_score,plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV, CalibrationDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score
%matplotlib inline


In [38]:
# Read df
df = pd.read_csv('mbti_1.csv')

# Global Vars
nltk.download('stopwords')
nltk.download('punkt')
nlp = spacy.load("en_core_web_sm")
cachedStopWords = stopwords.words("english")
types = df['type'].tolist()
set_types = set([i.lower() for i in types])
print(set_types)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


{'esfp', 'esfj', 'isfj', 'intj', 'enfj', 'infp', 'entp', 'enfp', 'istj', 'entj', 'intp', 'estj', 'istp', 'estp', 'isfp', 'infj'}


#### Text Pre-processing & Cleaning

In [39]:
def remove_stop(row):
  global cachedStopWords
  global set_types

  row = ' '.join([word for word in row.split() if word not in cachedStopWords])
  row = ' '.join([word for word in row.split() if word not in set_types])
  return row

In [40]:
def lemmatize(row):
  doc = nlp(row)
  return ' '.join([token.lemma_ for token in doc])

In [41]:
def get_keywords(text):
    keywords = []
    pos_tag = ['PROPN', 'ADJ', 'NOUN'] 
    doc = nlp(text) 
    for token in doc:

        if(token.text in punctuation):
            keywords.append(token.text)

        if(token.pos_ in pos_tag):
            keywords.append(token.text)
    return ' '.join(word for word in keywords)

In [42]:
def remove_unwanted_space(text):
    val1 = '.'
    sentences = text.split('.')
    updated_sentences = []
    for sentence in sentences:
        updated_sentences.append(sentence.strip())
    try:
        while True:
            updated_sentences.remove(val1)
    except ValueError:
        pass
    val2 = ''
    try:
        while True:
            updated_sentences.remove(val2)
    except ValueError:
        pass
    updated_text = ". ".join(updated_sentences)
    return updated_text

In [43]:
def process_text(df):

  df['posts'] = df['posts'].apply(lambda x: x.lower())
  df['posts'] = df['posts'].apply(lambda x: re.sub(r'http\S+', '', x))
  df['posts'] = df['posts'].apply(lambda x: x.replace("'", ""))

  df['posts'] = df['posts'].apply(lambda x: re.sub(r'[^ a-z\.]+', '', x))
  df['posts'] = df['posts'].apply(lambda x: remove_stop(x))
  df['posts'] = df['posts'].apply(lambda x: lemmatize(x))
  df['posts'] = df['posts'].apply(lambda x: get_keywords(x))
  df['posts'] = df['posts'].apply(lambda x: remove_unwanted_space(x))
  return df

In [44]:
df = process_text(df)

In [45]:
df.to_csv('preprocessed.csv')

#### Word Vector Embeddings

In [46]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
!ls
!pwd
print('Indexing word vectors.')

embeddings_index = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

--2022-12-04 16:25:37--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-12-04 16:25:37--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-12-04 16:25:38--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [47]:
def do_embedding(row):
  vector_list = []
  for word in row:
    try:
      vector_list.append(embeddings_index[word])
    except:
      pass
  return vector_list

def word_embeddings(df):
  df['vectors'] = df['posts'].apply(lambda x: do_embedding(x))
  return df

In [48]:
# Save df
vectorized_df = word_embeddings(df)
vectorized_df.to_pickle("vectorized.pkl")

In [49]:
print(len(df), len(vectorized_df))
vectorized_df

8675 8675


Unnamed: 0,type,posts,vectors
0,INFJ,moment sportscenter top prankswhat lifechangin...,"[[0.29492, 0.56874, -0.20245, 0.50244, -0.6829..."
1,ENTP,lack post alarming. sex bore position. example...,"[[-0.45433, 1.0234, 0.024278, -0.086367, -0.69..."
2,INTP,good course blessing positive good friend amaz...,"[[-0.37628, 0.37102, 0.32594, -0.085084, -0.55..."
3,INTJ,dear conversation day. esoteric gabbing nature...,"[[-0.91091, 0.50459, 0.058175, -0.78618, 0.088..."
4,ENTJ,silly misconception. approach key unlocking en...,"[[0.13739, 0.77891, 0.80054, 0.13819, -0.49792..."
...,...,...,...
8670,ISFP,cat fi dom reason. website neo nazis perc. im ...,"[[-0.11752, 0.97272, -0.29021, 0.25914, -0.426..."
8671,ENFP,thread someplace hereooop hard movie watch thr...,"[[0.13482, 0.40224, -0.42266, -0.055631, -0.55..."
8672,INTP,many question thing. purple pill. win lottery ...,"[[0.29492, 0.56874, -0.20245, 0.50244, -0.6829..."
8673,INFP,conflict right child. maternal instinct. none ...,"[[-0.11752, 0.97272, -0.29021, 0.25914, -0.426..."


In [52]:
# Re-Read Saved File (used when memory is a bottleneck)
df = pd.read_pickle("vectorized.pkl")  

In [53]:
df

Unnamed: 0,type,posts,vectors
0,INFJ,moment sportscenter top prankswhat lifechangin...,"[[0.29492, 0.56874, -0.20245, 0.50244, -0.6829..."
1,ENTP,lack post alarming. sex bore position. example...,"[[-0.45433, 1.0234, 0.024278, -0.086367, -0.69..."
2,INTP,good course blessing positive good friend amaz...,"[[-0.37628, 0.37102, 0.32594, -0.085084, -0.55..."
3,INTJ,dear conversation day. esoteric gabbing nature...,"[[-0.91091, 0.50459, 0.058175, -0.78618, 0.088..."
4,ENTJ,silly misconception. approach key unlocking en...,"[[0.13739, 0.77891, 0.80054, 0.13819, -0.49792..."
...,...,...,...
8670,ISFP,cat fi dom reason. website neo nazis perc. im ...,"[[-0.11752, 0.97272, -0.29021, 0.25914, -0.426..."
8671,ENFP,thread someplace hereooop hard movie watch thr...,"[[0.13482, 0.40224, -0.42266, -0.055631, -0.55..."
8672,INTP,many question thing. purple pill. win lottery ...,"[[0.29492, 0.56874, -0.20245, 0.50244, -0.6829..."
8673,INFP,conflict right child. maternal instinct. none ...,"[[-0.11752, 0.97272, -0.29021, 0.25914, -0.426..."


In [56]:
# Get average of vectors
df1 = pd.DataFrame([],columns=range(100))
for i in tqdm(range(len(df))):
    vec = sum(df['vectors'][i])/len(df['vectors'][1])
    df1.loc[len(df1)] = vec

  0%|          | 0/8675 [00:00<?, ?it/s]

In [57]:
df1['type'] = df['type']

In [58]:
display(df1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,type
0,-0.209376,0.198854,0.056219,-0.085437,-0.194152,0.085088,0.143819,0.097970,-0.229019,0.098643,...,0.250383,0.195343,0.252029,-0.029919,-0.023972,-0.048325,-0.315152,0.112291,-0.086202,INFJ
1,-0.361201,0.379274,0.118784,-0.133162,-0.385487,0.121015,0.249944,0.183359,-0.400977,0.177658,...,0.423233,0.311505,0.420498,-0.053132,-0.063658,-0.073184,-0.561471,0.213000,-0.138010,ENTP
2,-0.292335,0.288878,0.102278,-0.116338,-0.282279,0.101500,0.188891,0.134673,-0.329650,0.144942,...,0.344705,0.252406,0.348347,-0.046779,-0.040439,-0.040919,-0.442984,0.155134,-0.109500,INTP
3,-0.318508,0.332027,0.115262,-0.138126,-0.339136,0.126303,0.222977,0.166778,-0.336413,0.153932,...,0.363218,0.286102,0.376946,-0.066860,-0.045982,-0.075894,-0.501995,0.188028,-0.108340,INTJ
4,-0.331746,0.340676,0.107578,-0.137671,-0.359856,0.116754,0.237006,0.172544,-0.363693,0.155316,...,0.378910,0.299281,0.402857,-0.061917,-0.049199,-0.078518,-0.526566,0.177255,-0.119679,ENTJ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8670,-0.247700,0.268816,0.084432,-0.099864,-0.275837,0.096442,0.175194,0.134008,-0.280876,0.123045,...,0.314773,0.236080,0.299501,-0.037386,-0.047437,-0.059379,-0.408427,0.146552,-0.088635,ISFP
8671,-0.420970,0.374257,0.130425,-0.155434,-0.380822,0.151617,0.272093,0.179081,-0.418315,0.197848,...,0.487675,0.360578,0.481101,-0.051194,-0.056815,-0.055378,-0.618139,0.201325,-0.156850,ENFP
8672,-0.355597,0.342990,0.119931,-0.139155,-0.340479,0.129696,0.238590,0.177745,-0.358638,0.177998,...,0.394450,0.292837,0.406758,-0.047173,-0.063197,-0.081574,-0.526089,0.222697,-0.133529,INTP
8673,-0.460168,0.445299,0.137463,-0.184071,-0.455202,0.175211,0.311825,0.222498,-0.501991,0.226518,...,0.533003,0.402545,0.544176,-0.073735,-0.084394,-0.095109,-0.706582,0.251552,-0.169031,INFP


### Split data by categories

In [59]:
### SPLIT BY CATEGORIES
ie = ['I', 'E']
sn = ['S', 'N']
tf = ['T', 'F']
jp = ['J', 'P']

In [60]:
print(len(df1))

iedf = df1.copy()
iedf['type'] = df1['type'].apply(lambda x: [i for i in x if i in ie][0])


sndf = df1.copy()
sndf['type'] = df1['type'].apply(lambda x: [i for i in x if i in sn][0])


tfdf = df1.copy()
tfdf['type'] = df1['type'].apply(lambda x: [i for i in x if i in tf][0])


jpdf = df1.copy()
jpdf['type'] = df1['type'].apply(lambda x: [i for i in x if i in jp][0])

8675


In [61]:
# Example Output
jpdf.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,type
0,-0.209376,0.198854,0.056219,-0.085437,-0.194152,0.085088,0.143819,0.09797,-0.229019,0.098643,...,0.250383,0.195343,0.252029,-0.029919,-0.023972,-0.048325,-0.315152,0.112291,-0.086202,J
1,-0.361201,0.379274,0.118784,-0.133162,-0.385487,0.121015,0.249944,0.183359,-0.400977,0.177658,...,0.423233,0.311505,0.420498,-0.053132,-0.063658,-0.073184,-0.561471,0.213,-0.13801,P
2,-0.292335,0.288878,0.102278,-0.116338,-0.282279,0.1015,0.188891,0.134673,-0.32965,0.144942,...,0.344705,0.252406,0.348347,-0.046779,-0.040439,-0.040919,-0.442984,0.155134,-0.1095,P


In [62]:
iedf.to_csv('ie.csv')
sndf.to_csv('sn.csv')
tfdf.to_csv('tf.csv')
jpdf.to_csv('jp.csv')

In [63]:
# Convert chars to 1 or 0 to allow for classification
iedf['type'] = iedf['type'].replace(list(set(iedf['type'])),range(len(list(set(iedf['type'])))))
sndf['type'] = sndf['type'].replace(list(set(sndf['type'])),range(len(list(set(sndf['type'])))))
tfdf['type'] = tfdf['type'].replace(list(set(tfdf['type'])),range(len(list(set(tfdf['type'])))))
jpdf['type'] = jpdf['type'].replace(list(set(jpdf['type'])),range(len(list(set(jpdf['type'])))))

In [64]:
iedf.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,type
0,-0.209376,0.198854,0.056219,-0.085437,-0.194152,0.085088,0.143819,0.09797,-0.229019,0.098643,...,0.250383,0.195343,0.252029,-0.029919,-0.023972,-0.048325,-0.315152,0.112291,-0.086202,0
1,-0.361201,0.379274,0.118784,-0.133162,-0.385487,0.121015,0.249944,0.183359,-0.400977,0.177658,...,0.423233,0.311505,0.420498,-0.053132,-0.063658,-0.073184,-0.561471,0.213,-0.13801,1
2,-0.292335,0.288878,0.102278,-0.116338,-0.282279,0.1015,0.188891,0.134673,-0.32965,0.144942,...,0.344705,0.252406,0.348347,-0.046779,-0.040439,-0.040919,-0.442984,0.155134,-0.1095,0


In [65]:
sndf.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,type
0,-0.209376,0.198854,0.056219,-0.085437,-0.194152,0.085088,0.143819,0.09797,-0.229019,0.098643,...,0.250383,0.195343,0.252029,-0.029919,-0.023972,-0.048325,-0.315152,0.112291,-0.086202,0
1,-0.361201,0.379274,0.118784,-0.133162,-0.385487,0.121015,0.249944,0.183359,-0.400977,0.177658,...,0.423233,0.311505,0.420498,-0.053132,-0.063658,-0.073184,-0.561471,0.213,-0.13801,0
2,-0.292335,0.288878,0.102278,-0.116338,-0.282279,0.1015,0.188891,0.134673,-0.32965,0.144942,...,0.344705,0.252406,0.348347,-0.046779,-0.040439,-0.040919,-0.442984,0.155134,-0.1095,0


In [66]:
tfdf.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,type
0,-0.209376,0.198854,0.056219,-0.085437,-0.194152,0.085088,0.143819,0.09797,-0.229019,0.098643,...,0.250383,0.195343,0.252029,-0.029919,-0.023972,-0.048325,-0.315152,0.112291,-0.086202,0
1,-0.361201,0.379274,0.118784,-0.133162,-0.385487,0.121015,0.249944,0.183359,-0.400977,0.177658,...,0.423233,0.311505,0.420498,-0.053132,-0.063658,-0.073184,-0.561471,0.213,-0.13801,1
2,-0.292335,0.288878,0.102278,-0.116338,-0.282279,0.1015,0.188891,0.134673,-0.32965,0.144942,...,0.344705,0.252406,0.348347,-0.046779,-0.040439,-0.040919,-0.442984,0.155134,-0.1095,1


In [67]:
jpdf.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,type
0,-0.209376,0.198854,0.056219,-0.085437,-0.194152,0.085088,0.143819,0.09797,-0.229019,0.098643,...,0.250383,0.195343,0.252029,-0.029919,-0.023972,-0.048325,-0.315152,0.112291,-0.086202,1
1,-0.361201,0.379274,0.118784,-0.133162,-0.385487,0.121015,0.249944,0.183359,-0.400977,0.177658,...,0.423233,0.311505,0.420498,-0.053132,-0.063658,-0.073184,-0.561471,0.213,-0.13801,0
2,-0.292335,0.288878,0.102278,-0.116338,-0.282279,0.1015,0.188891,0.134673,-0.32965,0.144942,...,0.344705,0.252406,0.348347,-0.046779,-0.040439,-0.040919,-0.442984,0.155134,-0.1095,0


### Train ML Models

In [68]:
def get_SVC(X_dev, y_dev, X_test, y_test):
  
  svc_model1=LinearSVC(loss='hinge',  random_state=42)
  smote = SMOTE(random_state=42)
  smote.fit_resample(X_dev, y_dev)
  pipe_svc = make_imbal_pipeline(smote, GridSearchCV(LinearSVC(random_state = 42),
                                  param_grid={'max_iter': [25, 50, 100, 150],
                                              'C': [0.01, 0.1, 0.5, 0.7, 1, 3]},
                                  cv=5,
                                  scoring='f1_micro',
                                  refit=True))
    
  results = pipe_svc.fit(X_dev, y_dev)
  preds = pipe_svc.predict(X_test)
  print(f1_score(y_test, preds, average='micro'))
  

  return preds

In [69]:
def get_RF(X_dev, y_dev, X_test, y_test):
  smote = SMOTE(random_state=42)
  smote.fit_resample(X_dev, y_dev)
  pipe_rfc = make_imbal_pipeline(smote, GridSearchCV(RandomForestClassifier(random_state = 42),
                                  param_grid={'ccp_alpha': [0.1, 0.2, 0.3],
                                              'n_estimators': [25, 50, 75],
                                              'max_depth': [3, 5, 7]},
                                  cv = 5,
                                  scoring='f1_micro',
                                  refit = True))
  results = pipe_rfc.fit(X_dev, y_dev)
  preds = pipe_rfc.predict(X_test)
  print(f1_score(y_test, preds, average='micro'))

  
  return preds

In [70]:
def get_HGB(X_dev, y_dev, X_test, y_test):

  smote = SMOTE(random_state=42)
  smote.fit_resample(X_dev, y_dev)
  pipe_hgbc = make_imbal_pipeline(smote, GridSearchCV(HistGradientBoostingClassifier(random_state=33),
                                  param_grid={'learning_rate': [0.1, 0.2, 0.3],
                                              'max_iter': [25, 50, 100],
                                              'max_depth': [3, 5, 7]},
                                  cv=5,
                                  scoring='f1_micro',
                                  refit=True))
    
  results = pipe_hgbc.fit(X_dev, y_dev)
  preds = pipe_hgbc.predict(X_test)
  print(f1_score(y_test, preds, average='micro'))

  X_train, X_calib, y_train, y_calib = train_test_split(X_dev, y_dev, test_size=0.2, random_state=19)

  cal_hgb_platt = CalibratedClassifierCV(pipe_hgbc, cv='prefit', method='sigmoid')
  cal_hgb_platt.fit(X_calib, y_calib)
  
  cal_preds = cal_hgb_platt.predict(X_test)
  print(f1_score(y_test, cal_preds, average='micro'))
  return preds, cal_preds


In [71]:
def get_KNN(X_dev, y_dev, X_test, y_test):
  smote = SMOTE(random_state=42)
  smote.fit_resample(X_dev, y_dev)

  pipe_KNN = make_imbal_pipeline(smote, GridSearchCV(KNeighborsClassifier(n_jobs=-1),
                                  param_grid={'n_neighbors': list(range(1, 31))},
                                  cv = 10,
                                  scoring='f1_micro',
                                  refit = True))

  results = pipe_KNN.fit(X_dev, y_dev)

  preds = pipe_KNN.predict(X_test)
  print(f1_score(y_test, preds, average='micro'))


  return preds

In [72]:
def get_LR(X_dev, y_dev, X_test, y_test):

  smote = SMOTE(random_state=42)
  smote.fit_resample(X_dev, y_dev)
  lr=LogisticRegression(random_state=0)
  solvers = ['newton-cg', 'lbfgs', 'liblinear']
  penalty = ['l2']
  c_values = [100, 10, 1.0, 0.1, 0.01]
  grid = dict(solver=solvers,penalty=penalty,C=c_values)
  cv = 10
  pipe_lr = make_imbal_pipeline(smote,  GridSearchCV(estimator=lr, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1_micro'))  
  results = pipe_lr.fit(X_dev, y_dev)
  preds = pipe_lr.predict(X_test)
  print(f1_score(y_test, preds, average='micro'))
  return preds

In [73]:
def get_NB(X_dev, y_dev, X_test, y_test):

  smote = SMOTE(random_state=42)
  smote.fit_resample(X_dev, y_dev)
  gnb = GaussianNB()
  params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}
  pipe_nb = make_imbal_pipeline(smote, GridSearchCV(gnb,
                                  param_grid=params_NB,
                                  cv=5,
                                  scoring='f1_micro',
                                  refit=True))
    
  results = pipe_nb.fit(X_dev, y_dev)
  preds = pipe_nb.predict(X_test)
  print(f1_score(y_test, preds, average='micro'))
  return preds

In [74]:
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Activation, Flatten
# from keras.utils import np_utils
# import tensorflow as tf
# from tensorflow import keras

# def get_CNN(X_dev, y_dev, X_test, y_test):
#   X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, 
#                                                   test_size = 0.25,
#                                                   stratify=y,
#                                                   random_state = 42)


#   cnn = Sequential()
#   cnn.add(Dense(128, input_shape=(100,), activation="relu"))
#   cnn.add(Dense(64, activation="relu"))
#   cnn.add(Dense(36, activation="relu"))

#   cnn.add(Dense(101, activation="softmax"))
#   cnn.compile("adam", "sparse_categorical_crossentropy", metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
#   history = cnn.fit(X_train, y_train, batch_size=120, epochs=43, verbose=0, validation_data=(X_val, y_val))
#   print(cnn.evaluate(X_test, y_test))


In [75]:
# Global Scoring variable


In [76]:
 # Run Models for each 

char_list = ['IE', 'SN', 'TF', 'JP']

df_li = [iedf, sndf, tfdf, jpdf]
df_dict = dict()

# For each pair of characteristics
for i in range(len(df_li)):

  # subdf_dict = dict()

  # Get the raw df
  dataframe = df_li[i]

  X = dataframe.drop('type', axis=1)
  y = dataframe['type']

  # First split into dev and test
  X_dev, X_test, y_dev, y_test = train_test_split(X, y, 
                                                    test_size = 0.25,
                                                    stratify=y,
                                                    random_state = 42)

  # Run Classification Models
  svc_preds = get_SVC(X_dev, y_dev, X_test, y_test)
  rf_preds = get_RF(X_dev, y_dev, X_test, y_test)
  hgb_preds, cal_hgb_preds = get_HGB(X_dev, y_dev, X_test, y_test)
  KNN_preds = get_KNN(X_dev, y_dev, X_test, y_test)
  lr_preds=get_LR(X_dev, y_dev, X_test, y_test)
  nb_preds=get_NB(X_dev, y_dev, X_test, y_test)
  # cnn_preds = get_CNN(X_dev, y_dev, X_test, y_test)
  
  # Print Results of Classification models
  print(char_list[i])
  print("SVC")
  print(classification_report(y_test,svc_preds))
  print("Random Forest")
  print(classification_report(y_test,rf_preds))
  print("HistGradientBoost")
  print(classification_report(y_test,hgb_preds))
  print("CalibratedHistGB")
  print(classification_report(y_test,cal_hgb_preds))
  print('KNN')
  print(classification_report(y_test,KNN_preds))
  print("Logistic Regression")
  print(classification_report(y_test,lr_preds))
  print("Naive Bayes")
  print(classification_report(y_test,nb_preds))
  print("CNN")
  # print(classification_report(y_test,cnn_preds))

0.598893499308437
0.23052097740894423
0.6602120792992162
0.698017519594283
0.6302443522360535
0.6108805901337022
0.43614568925772246
IE
SVC
              precision    recall  f1-score   support

           0       0.84      0.59      0.69      1669
           1       0.32      0.64      0.42       500

    accuracy                           0.60      2169
   macro avg       0.58      0.61      0.56      2169
weighted avg       0.72      0.60      0.63      2169

Random Forest
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1669
           1       0.23      1.00      0.37       500

    accuracy                           0.23      2169
   macro avg       0.12      0.50      0.19      2169
weighted avg       0.05      0.23      0.09      2169

HistGradientBoost
              precision    recall  f1-score   support

           0       0.79      0.77      0.78      1669
           1       0.28      0.30      0.29       500

    accura

### Train Deep Learning Models: RNN (LSTM)

In [77]:
from keras.models import Sequential
import tensorflow as tf
from keras.layers import *
# import tensorflow_addons as tfa

char_list = ['IE', 'SN', 'TF', 'JP']

df_li = [iedf, sndf, tfdf, jpdf]
df_dict = dict()

# For each pair of characteristics
for i in range(len(df_li)):

  # subdf_dict = dict()

  # Get the raw df
  dataframe = df_li[i]

  X = dataframe.drop('type', axis=1)
  y = dataframe['type']


  # First split into dev and test
  X_dev, X_test, y_dev, y_test = train_test_split(X, y, 
                                                    test_size = 0.25,
                                                    stratify=y,
                                                    random_state = 42)
  
  X_train, X_val, y_train, y_val = train_test_split(X, y, 
                                                    test_size = 0.25,
                                                    random_state = 42)
  




  model = Sequential()
  model.add(LSTM(100, input_shape=(100,1), return_sequences=True))
  model.add(Bidirectional(LSTM(64, return_sequences=True)))
  model.add(Bidirectional(LSTM(64)))
  model.add(Dense(32, activation='relu'))
  model.add(Dense(1, activation='sigmoid'))
  model.compile("adam", "binary_crossentropy", metrics=['accuracy'])
  model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20,batch_size=100, verbose=0)

  y_pred = model.predict(X_test, batch_size=64, verbose=1)
  y_pred_bool = np.argmax(y_pred, axis=1)

  print(classification_report(y_test, y_pred_bool))
  print(f1_score(y_test, y_pred_bool, average='micro'))
  # print(f1_score(y_test, p, average='micro'))
  # print(classification_report(y_test, p))

              precision    recall  f1-score   support

           0       0.77      1.00      0.87      1669
           1       0.00      0.00      0.00       500

    accuracy                           0.77      2169
   macro avg       0.38      0.50      0.43      2169
weighted avg       0.59      0.77      0.67      2169

0.7694790225910558
              precision    recall  f1-score   support

           0       0.86      1.00      0.93      1870
           1       0.00      0.00      0.00       299

    accuracy                           0.86      2169
   macro avg       0.43      0.50      0.46      2169
weighted avg       0.74      0.86      0.80      2169

0.8621484555094514
              precision    recall  f1-score   support

           0       0.54      1.00      0.70      1174
           1       0.00      0.00      0.00       995

    accuracy                           0.54      2169
   macro avg       0.27      0.50      0.35      2169
weighted avg       0.29      0.54   