In [1]:
from google.colab import drive
drive.mount('/content/drive')
% cd /content/drive/MyDrive/HASOC Project Folder/Notebooks/3. English Hindi Codemix/Proposed Approach

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/1k3bnVHPHn8aD_7JqJ3TirKGko7aJVa9r/HASOC Project Folder/Notebooks/3. English Hindi Codemix/Proposed Approach


In [2]:
!pip install transformers --quiet
!pip install bert-for-tf2 --quiet
!pip install tensorflow-text --quiet

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
import numpy as np
import torch
import pandas as pd
import transformers
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text as text
from transformers import pipeline
from bert import bert_tokenization
from scipy.spatial import distance
from sklearn.metrics import classification_report
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from transformers import DistilBertTokenizer, DistilBertModel, BertTokenizer, BertModel, AutoTokenizer, AutoModelForMaskedLM

In [2]:
class model:
    def __init__(self, df, model_name, avg_pooling=False):
        self.tokenizer = None
        self.model = None
        self.tokenized_padded_text = None
        self.attention_mask = None
        self.textip = None
        self.pooledOp = None
        self.input_dfs = None
        self.data_frame = df
        self.feature_df = None
        self.model_name = None
        self.InitModel(model_name, avg_pooling)

    def InitModel(self, model_name, avg_pooling):
        

        if model_name == 'distilBert':
            model_class, tokenizer_class, pretrained_weights = (
                DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased')
            self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
            self.model = model_class.from_pretrained(pretrained_weights)
            self.model_name = 'distilBert'

        if model_name == 'mBert':
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
            self.model = BertModel.from_pretrained("bert-base-multilingual-cased")
            self.max_seq_length = 512
            self.model_name = 'mBert'

        if model_name == 'muril':
            self.textip = tf.keras.layers.Input(shape=(), dtype=tf.string)
            self.max_seq_length = 128
            muril_model, muril_layer = self.init_muril(
                model_url="https://tfhub.dev/google/MuRIL/1", max_seq_length=self.max_seq_length,
                avg_pooling=avg_pooling)
            vocab_file = muril_layer.resolved_object.vocab_file.asset_path.numpy()
            do_lower_case = muril_layer.resolved_object.do_lower_case.numpy()
            self.tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)
            self.model_name = 'muril'
            self.model = muril_model
            self.avg_pooling = avg_pooling
        if model_name=='xlmr':    
            self.model_name = 'xlmr'
            self.avg_pooling = avg_pooling 
            
    def tokenize(self, column):
        tokenized_text = column.apply((lambda x: self.tokenizer.encode(x,truncation=True,add_special_tokens=True)))
  
        max_len = 0
        for i in tokenized_text.values:
            if len(i) > max_len:
                max_len = len(i)
        self.tokenized_padded_text = np.array([i + [0]*(max_len-len(i)) for i in tokenized_text.values])
        self.create_attention_mask()

    def create_attention_mask(self):
        self.attention_mask = np.where(self.tokenized_padded_text != 0, 1, 0)
        print(type(self.tokenized_padded_text))
        self.input_ids = torch.tensor(self.tokenized_padded_text)
        self.attention_mask = torch.tensor(self.attention_mask)

    def init_muril(self, model_url, max_seq_length, avg_pooling):
        inputs = dict(
            input_word_ids=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
            input_mask=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
            input_type_ids=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
        )

        muril_layer = hub.KerasLayer(model_url, trainable=True)
        outputs = muril_layer(inputs)
        print(outputs)
        assert 'sequence_output' in outputs
        assert 'pooled_output' in outputs
        assert 'encoder_outputs' in outputs
        assert 'default' in outputs
        if avg_pooling:
            return tf.keras.Model(inputs=inputs, outputs=outputs["encoder_outputs"]), muril_layer
        else:
            return tf.keras.Model(inputs=inputs, outputs=outputs["pooled_output"]), muril_layer

    def create_input(self, input_strings, tokenizer, max_seq_length):
        input_ids_all, input_mask_all, input_type_ids_all = [], [], []
        for input_string in input_strings:
            input_tokens = ["[CLS]"] + tokenizer.tokenize(input_string) + ["[SEP]"]
            input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
            sequence_length = min(len(input_ids), max_seq_length)

            if len(input_ids) >= max_seq_length:
                input_ids = input_ids[:max_seq_length]
            else:
                input_ids = input_ids + [0] * (max_seq_length - len(input_ids))

            input_mask = [1] * sequence_length + [0] * (max_seq_length - sequence_length)

            input_ids_all.append(input_ids)
            input_mask_all.append(input_mask)
            input_type_ids_all.append([0] * max_seq_length)

        return np.array(input_ids_all), np.array(input_mask_all), np.array(input_type_ids_all)

    def encode(self, input_text):
        input_ids, input_mask, input_type_ids = self.create_input(input_text,
                                                                  self.tokenizer,
                                                                  self.max_seq_length)
        inputs = dict(
            input_word_ids=input_ids,
            input_mask=input_mask,
            input_type_ids=input_type_ids,
        )
        return self.model(inputs)

    def GetFeatures(self, input=None):
        if self.model_name == 'mBert':
            with torch.no_grad():
                last_hidden_states = self.model(self.input_ids, attention_mask=self.attention_mask)
            last_hidden_states['last_hidden_state'].size()
            self.features = last_hidden_states[0][:, 0, :].numpy()
            self.features = pd.DataFrame(self.features)
        elif self.model_name == 'muril':
            embeddings = self.encode(input)
            if not self.avg_pooling:
                self.features = pd.DataFrame(embeddings.numpy())
            else:
                f1 = embeddings[7][:, 0, :].numpy()
                f2 = embeddings[6][:, 0, :].numpy()
                f3 = embeddings[5][:, 0, :].numpy()
                self.features = pd.DataFrame((f1+f2+f3)/3)
        elif self.model_name == 'xlmr':
            sentences=input.values.tolist()
            nlp = pipeline(task ="feature-extraction", model = 'xlm-roberta-base', tokenizer='xlm-roberta-base', framework='pt', device=0)
            features = nlp(sentences, truncation=True) 
            featurelist=list()
            for i in features:
               featurelist.append(i[0][0])
            self.features=pd.DataFrame(featurelist)        
        return self.features

In [3]:
class classifiers:

  def __init__(self,features_train,label_train,features_test,label_test):
    self.train_features=features_train
    self.train_labels=label_train
    self.test_features=features_test
    self.test_labels=label_test
    self.accuracy=list()
    self.f1score=list()
    self.models=list()
    self.y_pred=list()

  def classify(self,svm=True,random_forest=True,xgboost=True,logistic_regression=True,ann=True)  :
      if svm==True:
        acc,f1_score=self.CreateSVMClassifier()
        self.accuracy.append(acc)
        self.f1score.append(f1_score)
        
        self.models.append('svm')
      if random_forest==True:
        acc,f1_score,y=self.RandomForestClassifier()
        self.accuracy.append(acc)
        self.f1score.append(f1_score)
        self.y_pred.append(y)
        self.models.append('random_forest')
      if xgboost==True:
        acc,f1_score,y=self.XGBClassifier(2)
        self.accuracy.append(acc)
        self.f1score.append(f1_score)
        self.y_pred.append(y)
        self.models.append('xgboost')
      if logistic_regression==True:
        acc,f1_score,y=self.LogisticRegression()
        self.accuracy.append(acc)
        self.f1score.append(f1_score)
        self.y_pred.append(y)
        self.models.append('lr')
      if ann==True:
        acc,f1_score=self.annClassifier()
        self.accuracy.append(acc)
        self.f1score.append(f1_score)
        self.models.append('ann')
      return self.accuracy,self.f1score,self.models,self.y_pred

  def MajorityVotingClassifier(self, num_class):
        acc_xg, f1_xg, y_xgboost = self.XGBClassifier(num_class)
        acc_rf, f1_rf, y_rf = self.RandomForestClassifier()
        acc_rf, f1_rf, y_lr = self.LogisticRegression()
        y_pred = list()
        for i in range(len(y_xgboost)):
            preds = list()
            preds.append(y_xgboost[i])
            preds.append(y_rf[i])
            preds.append(y_lr[i])
            y_pred.append(max(set(preds), key=preds.count))
        cm=confusion_matrix(self.test_labels,y_pred)    
        return accuracy_score(self.test_labels, y_pred), f1_score(self.test_labels, y_pred, average='macro') ,cm,y_pred 

  def XGBClassifier(self,num_class):
     from xgboost import XGBClassifier
     classifier = XGBClassifier(n_estimators=500,learning_rate=1, max_depth=2,objective='multi:softmax',num_class=num_class)
     classifier.fit(self.train_features, self.train_labels)
     
     y_pred = classifier.predict(self.test_features)
     return print(accuracy_score(self.test_labels, y_pred)),print(f1_score(self.test_labels, y_pred,average='macro')),y_pred

  def CreateSVMClassifier(self):
    from sklearn.svm import SVC
    classifier = SVC(kernel = 'poly',decision_function_shape='ovr', random_state = 0)
    classifier.fit(self.train_features, self.train_labels)
  
    y_pred = classifier.predict(self.test_features)
    return accuracy_score(self.test_labels, y_pred),f1_score(self.test_labels, y_pred,average='macro')

  def RandomForestClassifier(self):
     from sklearn.ensemble import RandomForestClassifier
     classifier = RandomForestClassifier(n_estimators = 500, criterion = 'entropy', random_state = 0)
     classifier.fit(self.train_features, self.train_labels)

     y_pred = classifier.predict(self.test_features)
     return print(accuracy_score(self.test_labels, y_pred)),print(f1_score(self.test_labels, y_pred,average='macro')),y_pred

  def Createstaticsplit(self,features,labels,split_per=0.8):
   num=np.shape(features)[0]
   self.train_features=features.head(int(split_per*num))
   self.train_labels=labels.head(int(split_per*num))
   self.test_features=features.tail(num-int(split_per*num))
   self.test_labels=labels.tail(num-int(split_per*num))
  def annClassifier(self):
      import tensorflow as tf
      from sklearn.compose import ColumnTransformer
      from sklearn.preprocessing import OneHotEncoder
      ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1])], remainder='passthrough')
      feature_set = np.array(ct.fit_transform(self.features_set))
      train_features, test_features, train_labels, test_labels = train_test_split(feature_set, self.labels)

      ann = tf.keras.models.Sequential()
      ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
      ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
      ann.add(tf.keras.layers.Dense(units=1, activation='softmax'))
      ann.add(tf.keras.layers.Dense(units=1, activation='softmax'))
      ann.add(tf.keras.layers.Dense(units=1, activation='softmax'))
      ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
      ann.fit(train_features, train_labels, batch_size = 32, epochs = 200)

      y_pred = ann.predict(test_features)
      return accuracy_score(test_labels, y_pred),f1_score(test_labels, y_pred,average='macro')

  def LogisticRegression(self):
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression 
    lr_clf = LogisticRegression(multi_class='multinomial')
    lr_clf.fit(self.train_features, self.train_labels) 

    y_pred=lr_clf.predict(self.test_features)
    return print(accuracy_score(self.test_labels, y_pred)),print(f1_score(self.test_labels, y_pred,average='macro')),y_pred

In [4]:
def classify(train_dataframe,test_dataframe,classifier):
   classifier_class=classifiers(features_train=train_dataframe.iloc[:, :-1],label_train=train_dataframe.label,features_test=test_dataframe.iloc[:, :-1],label_test=test_dataframe.label)
   test_labels=test_dataframe.label
   if classifier == 'A':
    accuracies,f1_scores,models,y=classifier_class.classify(svm=True,random_forest=True,xgboost=True,logistic_regression=True,ann=False)
    for i in range(len(y)) :
      print(models[i])
      print(classification_report(test_labels, y[i], labels=[0,1]))
    a,f,cm,y_pred=classifier_class.MajorityVotingClassifier(2)  
    print("VC")
    print(classification_report(test_labels, y_pred, labels=[0,1]))   
   elif classifier == 'LR':
      accuracies,f1_scores,models,y=classifier_class.classify(svm=False,random_forest=False,xgboost=False,logistic_regression=True,ann=False)
      print(classification_report(test_labels, y[0], labels=[0,1]))
   elif classifier == 'RF':  
      accuracies,f1_scores,models,y=classifier_class.classify(svm=False,random_forest=True,xgboost=False,logistic_regression=False,ann=False)
      print(classification_report(test_labels, y[0], labels=[0,1]))
   elif classifier == 'XGBOOST':
      accuracies,f1_scores,models,y=classifier_class.classify(svm=False,random_forest=False,xgboost=True,logistic_regression=False,ann=False)
      print(classification_report(test_labels, y[0], labels=[0,1]))
   elif classifier == 'VC': 
      a,f,cm,y_pred=classifier_class.MajorityVotingClassifier(2)  
      print(classification_report(test_labels, y_pred, labels=[0,1])) 

## Proposed Approach for Codemix Dataset

<center>
<figure>
<img src="https://docs.google.com/uc?export=download&id=12VQfzJqg-2BuV78XVRG1tMVNdQ5BQZAW" alt="neural network with activations" >

</figure>
</center>

In [14]:
import pandas as pd
train_df = pd.read_pickle("p1_codemix_flat.pkl")
test_df  = pd.read_pickle("p1_codemix_flat_test.pkl")

In [15]:
model_pipeline=model(train_df,model_name='muril')
sentences=train_df.text
text_embeddings_train=model_pipeline.GetFeatures(sentences)

{'default': <KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'keras_layer_2')>, 'pooled_output': <KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'keras_layer_2')>, 'sequence_output': <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_2')>, 'encoder_outputs': [<KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_2')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_2')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_2')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_2')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_2')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_2')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_2')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by

In [16]:
model_pipeline=model(test_df,model_name='muril')
sentences=test_df.text
text_embeddings_test=model_pipeline.GetFeatures(sentences)

{'pooled_output': <KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'keras_layer_3')>, 'default': <KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'keras_layer_3')>, 'encoder_outputs': [<KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_3')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_3')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_3')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_3')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_3')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_3')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_3')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_3')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer

In [12]:
text_embeddings_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767,label
0,0.124043,0.134500,0.077004,-0.006100,0.030966,0.003861,0.003580,-0.038835,0.083688,-0.141268,0.000900,0.162258,-0.018547,0.027238,0.064985,0.078726,-0.045628,0.002016,0.122310,0.173132,0.012728,0.065381,0.102238,0.108588,-0.129151,0.068079,-0.093908,0.042122,0.104435,-0.025672,0.121862,0.003795,0.009220,0.114219,0.068306,0.031336,-0.173430,-0.089982,0.067608,0.175083,...,0.102404,0.046486,-0.061268,0.112938,0.081822,-0.026468,-0.090627,0.091159,0.061857,0.046192,-0.160477,-0.029153,18.301804,-0.094302,0.031746,0.035544,0.211877,0.047897,0.005340,0.009197,-0.182982,0.017418,0.189941,0.187655,-0.262543,-0.085892,0.065533,0.061839,0.103139,0.060257,-0.046887,0.121142,-0.056982,0.151524,0.010863,0.239621,-0.035025,0.072145,-0.026578,0
1,0.103912,0.129216,0.079946,-0.006812,0.013627,0.005165,0.017293,-0.031988,0.076895,-0.128542,0.022182,0.144193,-0.025026,0.032681,0.061776,0.064545,-0.051740,-0.005357,0.097701,0.136607,0.001346,0.044541,0.085434,0.108654,-0.109453,0.048841,-0.094196,0.040239,0.088144,-0.016620,0.109496,-0.001447,0.008799,0.111598,0.071740,0.010741,-0.168244,-0.086326,0.069905,0.156258,...,0.086189,0.038601,-0.057336,0.103024,0.083780,-0.023895,-0.095780,0.070471,0.063626,0.021082,-0.138694,-0.030977,18.382227,-0.069506,0.040675,0.041080,0.199880,0.051277,0.013676,0.004652,-0.168276,0.016467,0.168069,0.160629,-0.243432,-0.057015,0.080053,0.083991,0.106630,0.056991,-0.052893,0.102562,-0.063866,0.136391,-0.016734,0.213089,-0.037850,0.075687,-0.030219,1
2,0.097483,0.138531,0.075124,-0.008615,0.015715,-0.010813,0.005962,-0.033391,0.071632,-0.129085,0.019851,0.118769,-0.018736,0.034177,0.060296,0.053425,-0.053422,-0.017230,0.098766,0.124551,-0.001812,0.040865,0.074529,0.102140,-0.099850,0.051899,-0.087086,0.028740,0.084635,-0.002010,0.118879,0.001099,-0.002183,0.110045,0.067384,0.019513,-0.164688,-0.088658,0.063892,0.151088,...,0.068392,0.036431,-0.047228,0.095719,0.077383,-0.024014,-0.076258,0.065612,0.067915,0.026717,-0.113725,-0.032501,18.427412,-0.062404,0.040231,0.040680,0.176982,0.040472,-0.003573,0.009970,-0.167311,0.024985,0.153942,0.146650,-0.216273,-0.058870,0.076591,0.068274,0.090050,0.064449,-0.045407,0.100736,-0.046089,0.123311,-0.023094,0.208430,-0.030262,0.078311,-0.021009,0
3,0.093685,0.135121,0.076026,-0.014441,0.017810,-0.004321,-0.001132,-0.032551,0.072353,-0.117568,0.027541,0.118370,-0.030415,0.034471,0.053387,0.045935,-0.048423,-0.012203,0.095227,0.132745,-0.008005,0.037492,0.073282,0.113403,-0.098933,0.050052,-0.082886,0.029319,0.081387,-0.001721,0.120756,0.007371,0.004575,0.102982,0.068699,0.015648,-0.159536,-0.091485,0.066567,0.148538,...,0.063682,0.029673,-0.051477,0.085224,0.068106,-0.024956,-0.064720,0.060227,0.065952,0.033485,-0.097498,-0.026536,18.446928,-0.061242,0.044071,0.037866,0.177447,0.035713,-0.003016,0.008047,-0.157669,0.039303,0.147216,0.141184,-0.202188,-0.051448,0.075030,0.060876,0.089672,0.075908,-0.044726,0.092703,-0.043816,0.117879,-0.027002,0.203763,-0.028239,0.073115,-0.024267,0
4,0.081732,0.137634,0.074013,-0.013349,0.026619,-0.000204,0.004555,-0.035932,0.072349,-0.099782,0.025857,0.112873,-0.036032,0.026509,0.052716,0.044559,-0.045269,-0.018861,0.084008,0.135195,0.001938,0.031022,0.066383,0.107007,-0.088364,0.045159,-0.089889,0.024115,0.071011,-0.002410,0.108812,0.010995,0.006311,0.108382,0.059545,0.007473,-0.158940,-0.090274,0.069083,0.144610,...,0.065125,0.027264,-0.051614,0.080170,0.064870,-0.014066,-0.060917,0.057635,0.066322,0.045959,-0.086471,-0.020951,18.460798,-0.060596,0.046888,0.038498,0.164577,0.021758,0.000996,0.009869,-0.151841,0.037901,0.140723,0.135336,-0.194800,-0.055494,0.060161,0.064224,0.089300,0.082228,-0.038420,0.094584,-0.036309,0.106170,-0.019433,0.190627,-0.026108,0.069979,-0.020451,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2814,0.096928,0.087999,0.083502,-0.017315,0.057610,-0.027918,0.008292,-0.048910,0.078285,-0.126205,-0.008840,0.124814,-0.056382,0.007186,0.056894,0.026536,-0.044364,-0.008206,0.063060,0.143834,0.007958,0.027951,0.057646,0.079330,-0.068050,0.035910,-0.086067,0.010796,0.089748,-0.008609,0.099890,0.036903,0.044890,0.102589,0.098602,0.049017,-0.159808,-0.090061,0.062344,0.121671,...,0.063737,0.010577,-0.050661,0.058276,0.092074,-0.051997,-0.019162,0.074732,0.078621,0.049824,-0.066644,-0.067002,18.480732,-0.076648,0.063609,0.072378,0.198154,0.010227,-0.017589,-0.036938,-0.122935,0.065028,0.125806,0.106075,-0.235589,-0.079053,0.018025,0.048028,0.100394,0.075246,-0.013941,0.102912,-0.039320,0.136144,0.025000,0.175661,-0.046162,0.071201,0.000617,0
2815,0.096928,0.087999,0.083502,-0.017315,0.057610,-0.027918,0.008292,-0.048910,0.078285,-0.126205,-0.008840,0.124814,-0.056382,0.007186,0.056894,0.026536,-0.044364,-0.008206,0.063060,0.143834,0.007958,0.027951,0.057646,0.079330,-0.068050,0.035910,-0.086067,0.010796,0.089748,-0.008609,0.099890,0.036903,0.044890,0.102589,0.098602,0.049017,-0.159808,-0.090061,0.062344,0.121671,...,0.063737,0.010577,-0.050661,0.058276,0.092074,-0.051997,-0.019162,0.074732,0.078621,0.049824,-0.066644,-0.067002,18.480732,-0.076648,0.063609,0.072378,0.198154,0.010227,-0.017589,-0.036938,-0.122935,0.065028,0.125806,0.106075,-0.235589,-0.079053,0.018025,0.048028,0.100394,0.075246,-0.013941,0.102912,-0.039320,0.136144,0.025000,0.175661,-0.046162,0.071201,0.000617,1
2816,0.096928,0.087999,0.083502,-0.017315,0.057610,-0.027918,0.008292,-0.048910,0.078285,-0.126205,-0.008840,0.124814,-0.056382,0.007186,0.056894,0.026536,-0.044364,-0.008206,0.063060,0.143834,0.007958,0.027951,0.057646,0.079330,-0.068050,0.035910,-0.086067,0.010796,0.089748,-0.008609,0.099890,0.036903,0.044890,0.102589,0.098602,0.049017,-0.159808,-0.090061,0.062344,0.121671,...,0.063737,0.010577,-0.050661,0.058276,0.092074,-0.051997,-0.019162,0.074732,0.078621,0.049824,-0.066644,-0.067002,18.480732,-0.076648,0.063609,0.072378,0.198154,0.010227,-0.017589,-0.036938,-0.122935,0.065028,0.125806,0.106075,-0.235589,-0.079053,0.018025,0.048028,0.100394,0.075246,-0.013941,0.102912,-0.039320,0.136144,0.025000,0.175661,-0.046162,0.071201,0.000617,1
2817,0.096928,0.087999,0.083502,-0.017315,0.057610,-0.027918,0.008292,-0.048910,0.078285,-0.126205,-0.008840,0.124814,-0.056382,0.007186,0.056894,0.026536,-0.044364,-0.008206,0.063060,0.143834,0.007958,0.027951,0.057646,0.079330,-0.068050,0.035910,-0.086067,0.010796,0.089748,-0.008609,0.099890,0.036903,0.044890,0.102589,0.098602,0.049017,-0.159808,-0.090061,0.062344,0.121671,...,0.063737,0.010577,-0.050661,0.058276,0.092074,-0.051997,-0.019162,0.074732,0.078621,0.049824,-0.066644,-0.067002,18.480732,-0.076648,0.063609,0.072378,0.198154,0.010227,-0.017589,-0.036938,-0.122935,0.065028,0.125806,0.106075,-0.235589,-0.079053,0.018025,0.048028,0.100394,0.075246,-0.013941,0.102912,-0.039320,0.136144,0.025000,0.175661,-0.046162,0.071201,0.000617,1


In [17]:
LAMBDA1 = 0.4
LAMBDA2 = 0.6

In [18]:
text_embeddings_train=text_embeddings_train.apply(lambda x: x*LAMBDA2)
text_embeddings_test=text_embeddings_test.apply(lambda x: x*LAMBDA2)

In [19]:
model_pipeline=model(train_df,model_name='muril')
sentences=train_df.context
context_embeddings_train=model_pipeline.GetFeatures(sentences)

{'pooled_output': <KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'keras_layer_4')>, 'encoder_outputs': [<KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_4')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_4')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_4')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_4')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_4')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_4')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_4')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_4')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_4')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_4')>,

In [20]:
model_pipeline=model(test_df,model_name='muril')
sentences=test_df.context
context_embeddings_test=model_pipeline.GetFeatures(sentences)

{'sequence_output': <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_5')>, 'encoder_outputs': [<KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_5')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_5')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_5')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_5')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_5')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_5')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_5')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_5')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_layer_5')>, <KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'keras_laye

In [21]:
context_embeddings_train=context_embeddings_train.apply(lambda x: x*LAMBDA1)
context_embeddings_test=context_embeddings_test.apply(lambda x: x*LAMBDA1)

In [22]:
c = text_embeddings_train.columns.intersection(context_embeddings_train.columns)
text_embeddings_train[c] = text_embeddings_train[c].add(context_embeddings_train[c], fill_value=0)

In [23]:
c = text_embeddings_test.columns.intersection(context_embeddings_test.columns)
text_embeddings_test[c] = text_embeddings_test[c].add(context_embeddings_test[c], fill_value=0)

In [24]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
train_labels=le.fit_transform(train_df.label)
test_labels=le.fit_transform(test_df.label)
text_embeddings_train['label']=train_labels
text_embeddings_test['label']=test_labels

In [25]:
text_embeddings_train.to_pickle('muril_p1_train.pkl')
text_embeddings_test.to_pickle('muril_p1_test.pkl')

In [31]:
text_embeddings_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,label
0,0.009320,0.018788,-0.006944,0.014851,-0.007412,-0.008032,-0.002020,-0.003956,-0.002895,-0.009430,...,0.004398,0.018517,0.007334,-0.006915,-0.007013,0.010595,-0.020595,-0.010204,-0.011525,0
1,0.009693,0.019082,-0.007050,0.014757,-0.007401,-0.008406,-0.001513,-0.004865,-0.003080,-0.009609,...,0.004467,0.018359,0.007848,-0.006932,-0.007210,0.010530,-0.020342,-0.009935,-0.011653,0
2,0.009817,0.018897,-0.007150,0.014665,-0.007768,-0.008434,-0.001923,-0.004281,-0.002980,-0.009764,...,0.004326,0.017714,0.008157,-0.006632,-0.007226,0.010198,-0.020446,-0.010112,-0.011219,0
3,0.009555,0.019307,-0.007532,0.014876,-0.008311,-0.008466,-0.001562,-0.004689,-0.003314,-0.009309,...,0.004344,0.017759,0.008120,-0.006646,-0.007538,0.010343,-0.020388,-0.009483,-0.010941,0
4,0.008976,0.018704,-0.007940,0.014907,-0.008260,-0.008548,-0.001805,-0.004777,-0.003248,-0.009455,...,0.003704,0.017948,0.008766,-0.006480,-0.007337,0.010101,-0.020240,-0.009327,-0.010839,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
824,0.009882,0.018428,-0.007269,0.015056,-0.008370,-0.007507,-0.000907,-0.005144,-0.003452,-0.009447,...,0.004132,0.018240,0.008273,-0.006302,-0.007345,0.010546,-0.020918,-0.009462,-0.010791,0
825,0.009678,0.018656,-0.007296,0.014917,-0.007714,-0.007868,-0.001052,-0.004730,-0.003112,-0.009143,...,0.004286,0.018168,0.007424,-0.006519,-0.007601,0.010523,-0.020473,-0.009347,-0.010476,0
826,0.009442,0.018250,-0.007730,0.015203,-0.008223,-0.008089,-0.001090,-0.004674,-0.003427,-0.009694,...,0.004464,0.017973,0.007826,-0.006306,-0.007334,0.010439,-0.020612,-0.010367,-0.011484,0
827,0.009734,0.018620,-0.007899,0.015370,-0.008010,-0.008377,-0.001115,-0.004773,-0.003074,-0.010235,...,0.003950,0.017673,0.009020,-0.005998,-0.006740,0.010329,-0.020431,-0.010188,-0.011081,0


In [32]:
classify(text_embeddings_train,text_embeddings_test,'RF')

0.5910735826296744
0.5472186016237108
              precision    recall  f1-score   support

           0       0.63      0.76      0.69       495
           1       0.49      0.35      0.41       334

    accuracy                           0.59       829
   macro avg       0.56      0.55      0.55       829
weighted avg       0.57      0.59      0.57       829



In [33]:
classify(text_embeddings_train,text_embeddings_test,'VC')



0.5428226779252111
0.5285991853024359
0.5910735826296744
0.5472186016237108
0.4028950542822678
0.2871883061049011
              precision    recall  f1-score   support

           0       0.63      0.51      0.56       495
           1       0.43      0.54      0.48       334

    accuracy                           0.53       829
   macro avg       0.53      0.53      0.52       829
weighted avg       0.55      0.53      0.53       829



In [34]:
classify(text_embeddings_train,text_embeddings_test,'XGBOOST')



0.5428226779252111
0.5285991853024359
              precision    recall  f1-score   support

           0       0.62      0.60      0.61       495
           1       0.44      0.46      0.45       334

    accuracy                           0.54       829
   macro avg       0.53      0.53      0.53       829
weighted avg       0.55      0.54      0.54       829



In [35]:
classify(text_embeddings_train,text_embeddings_test,'LR')

0.4028950542822678
0.2871883061049011
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       495
           1       0.40      1.00      0.57       334

    accuracy                           0.40       829
   macro avg       0.20      0.50      0.29       829
weighted avg       0.16      0.40      0.23       829



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
text_embeddings.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
9,0.054789,0.03235,0.045132,0.011432,0.027945,0.038859,0.066295,-0.041368,-0.185318,-0.056669,-0.292449,-0.205675,0.019871,-0.026477,0.017495,0.053846,0.338711,-0.089767,0.260465,-0.034204,0.109621,-0.301455,0.0197,0.038465,0.116193,-0.027735,0.197362,-0.117103,0.179585,0.27538,0.131396,0.005902,-0.157576,-0.012792,-0.001288,0.039859,0.195096,0.005216,-0.025014,0.021636,...,-0.110827,0.164867,0.039663,-0.072552,0.029156,-0.025753,-0.138112,-0.007269,-0.050354,-0.057618,-0.159216,-0.077627,0.072693,-0.167441,-0.162593,0.153871,0.085087,-0.116403,0.187941,0.067932,0.080626,0.055101,-0.026296,-0.563438,0.051438,-0.010499,0.203566,-0.078566,-0.028023,-0.024542,-0.031975,-0.098373,0.031791,-0.100841,0.008252,-0.062264,0.118103,0.123684,0.062878,0.021144
1,0.030615,0.000425,0.009215,0.017589,0.025521,0.036795,0.020033,0.0155,-0.011112,-0.016256,-0.742299,-0.048801,-0.010824,0.006499,-0.008644,0.032625,-0.014337,-0.020743,0.043686,0.026987,0.007414,-0.031631,-0.015576,-0.012913,0.018424,0.048199,0.094767,-0.047641,0.074676,-0.076122,-0.035171,-0.003875,0.325535,0.050646,-0.015413,0.020753,0.008974,0.012997,-0.024538,0.002755,...,0.013028,-0.031336,-0.02236,0.020344,0.021743,-0.011524,0.502648,-0.059547,-0.018335,-0.013033,-0.010071,0.006326,0.093462,-0.051765,-0.054987,0.001253,0.00024,-0.025298,-0.049617,0.018624,-0.007285,0.0382,0.017888,-0.512702,0.02341,-0.01516,0.05404,-0.013286,-0.028545,-0.036693,-0.066163,0.066957,0.037755,-0.006277,0.002566,-0.024979,-0.015172,0.064008,0.01943,-0.011093
5,0.058386,0.038379,0.046785,0.022875,0.033437,0.031639,0.060722,-0.054776,-0.171845,-0.05659,-0.402166,-0.189891,0.026178,-0.014966,0.02566,0.052457,0.30628,-0.082665,0.234964,-0.009989,0.105262,-0.298171,0.024785,0.04352,0.110536,-0.040086,0.199149,-0.108274,0.163617,0.244227,0.137974,0.001222,-0.064172,-0.00521,-0.009182,0.059573,0.19636,-5.5e-05,-0.025231,0.024885,...,-0.114002,0.186371,0.039573,-0.070455,0.025634,-0.024492,-0.101712,-0.006715,-0.040781,-0.048724,-0.135694,-0.080228,0.086642,-0.153589,-0.144842,0.120273,0.103267,-0.119322,0.210446,0.066195,0.075881,0.062632,-0.009801,-0.545824,0.046806,-0.000202,0.160817,-0.086686,-0.033568,-0.015791,-0.030575,-0.081538,0.049051,-0.113465,0.003869,-0.030815,0.121997,0.136461,0.061591,0.027793
2,0.026898,-0.002393,0.01564,-0.009956,0.024871,0.066524,0.015866,0.015512,-0.014693,-0.009622,-0.677379,-0.068384,-0.018065,0.01891,-0.027899,0.030876,0.008662,-0.026182,0.054194,0.022646,0.018135,-0.042962,-0.026303,-0.013244,0.026777,0.04214,0.129166,-0.059768,0.079539,-0.055233,-0.039829,-0.002597,0.282113,0.046398,-0.011717,0.044923,-0.003875,0.015572,-0.023935,0.004102,...,-0.007906,-0.054381,-0.017028,0.022343,0.016226,-0.006639,0.480743,-0.065336,-0.029165,-0.018925,-0.025419,0.010185,0.101086,-0.062012,-0.060424,0.030442,-0.016882,-0.025692,-0.037573,0.018329,-0.014027,0.016671,0.013768,-0.540666,0.030673,-0.018268,0.106783,-0.018245,-0.023604,-0.025987,-0.076718,0.055182,0.035663,0.004863,0.009669,-0.046479,-0.014747,0.061432,0.025644,-0.016558
3,0.042112,0.04114,0.045888,0.008132,0.040857,0.042683,0.055735,-0.054126,-0.171168,-0.065786,-0.392952,-0.190238,0.037667,-0.026607,0.001633,0.049261,0.316525,-0.088052,0.23584,-0.03,0.102268,-0.303481,0.021353,0.045782,0.104935,-0.064716,0.200895,-0.111766,0.161379,0.244836,0.14665,0.009169,-0.105845,0.005706,-0.007549,0.05929,0.182715,-0.002263,-0.027198,0.025549,...,-0.111764,0.188012,0.040718,-0.062839,0.031462,-0.022642,-0.134687,-0.006754,-0.044974,-0.049168,-0.146804,-0.075206,0.075917,-0.162001,-0.138051,0.150444,0.096129,-0.119541,0.215922,0.066686,0.069825,0.045278,-0.034506,-0.557004,0.053655,-0.00838,0.206,-0.086752,-0.025194,-0.015104,-0.017672,-0.089062,0.057492,-0.108781,0.004007,-0.046486,0.11824,0.132683,0.053824,0.029758
0,0.044679,0.005139,0.023171,0.00028,0.058296,0.008275,-0.003685,0.012268,-0.01361,-0.009217,-0.78958,-0.060901,-0.010747,0.003784,-0.008705,0.031506,0.003667,-0.013011,0.043982,0.0145,0.016973,-0.037468,-0.010291,-0.019118,0.01712,0.034042,0.110823,-0.052322,0.064933,-0.062549,-0.043773,-0.014179,0.352271,0.021404,-0.013019,0.027321,0.041756,0.00504,-0.023326,0.009231,...,0.024719,-0.01137,-0.018519,0.019588,0.016314,-0.004371,0.501307,-0.053468,-0.026467,-0.010316,-0.00162,0.006701,0.062119,-0.036003,-0.053956,0.017058,-0.005807,-0.011,-0.033628,0.01695,-0.01179,0.000746,0.011845,-0.519149,0.031052,-0.033941,0.073028,-0.015569,-0.022644,-0.062854,-0.093992,0.061201,0.011064,-0.008204,0.003134,-0.01372,-0.029201,0.047148,0.023302,-0.011958
8,0.042661,0.032416,0.043848,0.014519,0.029697,0.029468,0.062236,-0.041971,-0.176675,-0.0672,-0.294354,-0.192183,0.025911,-0.044151,0.015569,0.04484,0.328591,-0.093737,0.254722,-0.023572,0.108023,-0.303909,0.018566,0.04658,0.114638,-0.032281,0.194326,-0.118068,0.166667,0.275669,0.131843,0.006226,-0.204879,-0.011505,-0.005706,0.035417,0.184435,0.00405,-0.026407,0.021223,...,-0.105877,0.176662,0.036209,-0.067053,0.019317,-0.023238,-0.134961,-0.018132,-0.048401,-0.053966,-0.172296,-0.076202,0.074106,-0.161361,-0.155599,0.167263,0.077978,-0.117817,0.182707,0.067821,0.080071,0.056296,-0.032632,-0.543811,0.054311,-0.01549,0.188578,-0.078505,-0.02986,-0.02082,-0.026671,-0.095451,0.036665,-0.103524,0.009047,-0.066344,0.118753,0.120908,0.053179,0.019612
7,0.042125,0.032824,0.041815,0.032828,0.018703,0.035636,0.055852,-0.034822,-0.167416,-0.061652,-0.321887,-0.191876,0.023302,-0.040002,0.019919,0.04619,0.322125,-0.098029,0.247625,0.000201,0.107681,-0.297889,0.019881,0.054533,0.113876,-0.025065,0.177331,-0.121781,0.161498,0.229722,0.137471,0.006727,-0.120268,-0.000193,-0.004895,0.042666,0.178823,0.009277,-0.024975,0.013973,...,-0.111317,0.186475,0.035657,-0.060701,0.024149,-0.015697,-0.136402,-0.0165,-0.044313,-0.048686,-0.16679,-0.077813,0.072708,-0.158453,-0.14945,0.149597,0.072237,-0.116016,0.19619,0.075337,0.074213,0.072819,-0.022753,-0.519965,0.051669,-0.008504,0.18628,-0.085965,-0.032715,-0.01088,-0.03044,-0.090603,0.035355,-0.10404,0.007078,-0.057798,0.120996,0.129047,0.049651,0.018413
6,0.064131,0.035566,0.041097,0.027898,0.016829,0.027581,0.056745,-0.046991,-0.160387,-0.062424,-0.382482,-0.193301,0.023788,-0.023374,0.022188,0.05038,0.315568,-0.087106,0.246113,0.007014,0.106431,-0.300071,0.025347,0.047757,0.106411,-0.03042,0.174448,-0.110407,0.176249,0.233292,0.147166,0.003939,-0.093552,-0.017154,-0.009639,0.057054,0.189015,0.002751,-0.025019,0.021346,...,-0.107411,0.188995,0.04093,-0.067507,0.031807,-0.02332,-0.097167,-0.006114,-0.038423,-0.046701,-0.14208,-0.076388,0.076405,-0.154208,-0.142076,0.118969,0.095604,-0.115537,0.208653,0.068406,0.073743,0.079628,-0.000887,-0.543876,0.048384,-0.008137,0.182747,-0.082661,-0.033104,-0.011738,-0.035078,-0.082021,0.037162,-0.108933,0.005075,-0.039884,0.115923,0.129636,0.062387,0.027629
4,0.030185,0.040556,0.047657,0.017882,0.036454,0.040781,0.054377,-0.059566,-0.174687,-0.070598,-0.380887,-0.18416,0.037108,-0.027798,0.015255,0.045514,0.311646,-0.085453,0.237503,-0.030873,0.102513,-0.300265,0.023769,0.047939,0.114979,-0.062029,0.210021,-0.109475,0.164836,0.24003,0.138143,0.003937,-0.089233,0.009076,-0.009075,0.051915,0.175018,-0.001471,-0.024623,0.020314,...,-0.107438,0.180413,0.037187,-0.068141,0.038305,-0.021564,-0.125241,-0.007987,-0.042105,-0.047862,-0.149507,-0.075414,0.080551,-0.162144,-0.141398,0.143754,0.097401,-0.12174,0.206579,0.066215,0.071276,0.050355,-0.035673,-0.546413,0.053983,-0.008446,0.180187,-0.091834,-0.024001,-0.011985,-0.012244,-0.087697,0.049721,-0.110793,0.001434,-0.047632,0.123288,0.135607,0.051256,0.027934


In [None]:
features = df[['num_bad_words', 'dale_chall', 'num_words', 'total_length',
       'num_unique_words', 'words_vs_unique', 'capitals', 'caps_vs_length',
       'emoji', 'num_urls', 'exclamations', 'question_marks',
       'encoded_labels_label', 'Profane_Score', 'Bad_word_count',
       'Sentiment_label', 'Sentiment_Score']]
features = features.head(10)

In [None]:
final_df = pd.concat([text_embeddings,features], axis=1)

In [None]:
final_df.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767,num_bad_words,dale_chall,num_words,total_length,num_unique_words,words_vs_unique,capitals,caps_vs_length,emoji,num_urls,exclamations,question_marks,encoded_labels_label,Profane_Score,Bad_word_count,Sentiment_label,Sentiment_Score
9,0.054789,0.03235,0.045132,0.011432,0.027945,0.038859,0.066295,-0.041368,-0.185318,-0.056669,-0.292449,-0.205675,0.019871,-0.026477,0.017495,0.053846,0.338711,-0.089767,0.260465,-0.034204,0.109621,-0.301455,0.0197,0.038465,0.116193,-0.027735,0.197362,-0.117103,0.179585,0.27538,0.131396,0.005902,-0.157576,-0.012792,-0.001288,0.039859,0.195096,0.005216,-0.025014,0.021636,...,-0.116403,0.187941,0.067932,0.080626,0.055101,-0.026296,-0.563438,0.051438,-0.010499,0.203566,-0.078566,-0.028023,-0.024542,-0.031975,-0.098373,0.031791,-0.100841,0.008252,-0.062264,0.118103,0.123684,0.062878,0.021144,0,11.85,71,427,63,0.887324,18,0.042155,0,0,1,0,1,0,0,-1,0.460804
8,0.042661,0.032416,0.043848,0.014519,0.029697,0.029468,0.062236,-0.041971,-0.176675,-0.0672,-0.294354,-0.192183,0.025911,-0.044151,0.015569,0.04484,0.328591,-0.093737,0.254722,-0.023572,0.108023,-0.303909,0.018566,0.04658,0.114638,-0.032281,0.194326,-0.118068,0.166667,0.275669,0.131843,0.006226,-0.204879,-0.011505,-0.005706,0.035417,0.184435,0.00405,-0.026407,0.021223,...,-0.117817,0.182707,0.067821,0.080071,0.056296,-0.032632,-0.543811,0.054311,-0.01549,0.188578,-0.078505,-0.02986,-0.02082,-0.026671,-0.095451,0.036665,-0.103524,0.009047,-0.066344,0.118753,0.120908,0.053179,0.019612,1,8.15,75,427,64,0.853333,17,0.039813,0,0,1,0,0,0,0,-1,0.548178
4,0.030185,0.040556,0.047657,0.017882,0.036454,0.040781,0.054377,-0.059566,-0.174687,-0.070598,-0.380887,-0.18416,0.037108,-0.027798,0.015255,0.045514,0.311646,-0.085453,0.237503,-0.030873,0.102513,-0.300265,0.023769,0.047939,0.114979,-0.062029,0.210021,-0.109475,0.164836,0.24003,0.138143,0.003937,-0.089233,0.009076,-0.009075,0.051915,0.175018,-0.001471,-0.024623,0.020314,...,-0.12174,0.206579,0.066215,0.071276,0.050355,-0.035673,-0.546413,0.053983,-0.008446,0.180187,-0.091834,-0.024001,-0.011985,-0.012244,-0.087697,0.049721,-0.110793,0.001434,-0.047632,0.123288,0.135607,0.051256,0.027934,0,8.58,20,136,14,0.7,6,0.044118,0,0,0,0,1,0,0,0,0.565855
3,0.042112,0.04114,0.045888,0.008132,0.040857,0.042683,0.055735,-0.054126,-0.171168,-0.065786,-0.392952,-0.190238,0.037667,-0.026607,0.001633,0.049261,0.316525,-0.088052,0.23584,-0.03,0.102268,-0.303481,0.021353,0.045782,0.104935,-0.064716,0.200895,-0.111766,0.161379,0.244836,0.14665,0.009169,-0.105845,0.005706,-0.007549,0.05929,0.182715,-0.002263,-0.027198,0.025549,...,-0.119541,0.215922,0.066686,0.069825,0.045278,-0.034506,-0.557004,0.053655,-0.00838,0.206,-0.086752,-0.025194,-0.015104,-0.017672,-0.089062,0.057492,-0.108781,0.004007,-0.046486,0.11824,0.132683,0.053824,0.029758,0,6.86,24,154,18,0.75,6,0.038961,0,0,0,0,0,0,0,0,0.412697
2,0.026898,-0.002393,0.01564,-0.009956,0.024871,0.066524,0.015866,0.015512,-0.014693,-0.009622,-0.677379,-0.068384,-0.018065,0.01891,-0.027899,0.030876,0.008662,-0.026182,0.054194,0.022646,0.018135,-0.042962,-0.026303,-0.013244,0.026777,0.04214,0.129166,-0.059768,0.079539,-0.055233,-0.039829,-0.002597,0.282113,0.046398,-0.011717,0.044923,-0.003875,0.015572,-0.023935,0.004102,...,-0.025692,-0.037573,0.018329,-0.014027,0.016671,0.013768,-0.540666,0.030673,-0.018268,0.106783,-0.018245,-0.023604,-0.025987,-0.076718,0.055182,0.035663,0.004863,0.009669,-0.046479,-0.014747,0.061432,0.025644,-0.016558,0,7.32,26,168,22,0.846154,8,0.047619,0,0,0,0,1,0,0,-1,0.396173
0,0.044679,0.005139,0.023171,0.00028,0.058296,0.008275,-0.003685,0.012268,-0.01361,-0.009217,-0.78958,-0.060901,-0.010747,0.003784,-0.008705,0.031506,0.003667,-0.013011,0.043982,0.0145,0.016973,-0.037468,-0.010291,-0.019118,0.01712,0.034042,0.110823,-0.052322,0.064933,-0.062549,-0.043773,-0.014179,0.352271,0.021404,-0.013019,0.027321,0.041756,0.00504,-0.023326,0.009231,...,-0.011,-0.033628,0.01695,-0.01179,0.000746,0.011845,-0.519149,0.031052,-0.033941,0.073028,-0.015569,-0.022644,-0.062854,-0.093992,0.061201,0.011064,-0.008204,0.003134,-0.01372,-0.029201,0.047148,0.023302,-0.011958,0,13.01,7,59,7,1.0,7,0.118644,0,0,0,0,0,0,0,1,0.623605
6,0.064131,0.035566,0.041097,0.027898,0.016829,0.027581,0.056745,-0.046991,-0.160387,-0.062424,-0.382482,-0.193301,0.023788,-0.023374,0.022188,0.05038,0.315568,-0.087106,0.246113,0.007014,0.106431,-0.300071,0.025347,0.047757,0.106411,-0.03042,0.174448,-0.110407,0.176249,0.233292,0.147166,0.003939,-0.093552,-0.017154,-0.009639,0.057054,0.189015,0.002751,-0.025019,0.021346,...,-0.115537,0.208653,0.068406,0.073743,0.079628,-0.000887,-0.543876,0.048384,-0.008137,0.182747,-0.082661,-0.033104,-0.011738,-0.035078,-0.082021,0.037162,-0.108933,0.005075,-0.039884,0.115923,0.129636,0.062387,0.027629,1,11.91,34,245,31,0.911765,10,0.040816,1,0,0,1,0,0,0,-1,0.659131
1,0.030615,0.000425,0.009215,0.017589,0.025521,0.036795,0.020033,0.0155,-0.011112,-0.016256,-0.742299,-0.048801,-0.010824,0.006499,-0.008644,0.032625,-0.014337,-0.020743,0.043686,0.026987,0.007414,-0.031631,-0.015576,-0.012913,0.018424,0.048199,0.094767,-0.047641,0.074676,-0.076122,-0.035171,-0.003875,0.325535,0.050646,-0.015413,0.020753,0.008974,0.012997,-0.024538,0.002755,...,-0.025298,-0.049617,0.018624,-0.007285,0.0382,0.017888,-0.512702,0.02341,-0.01516,0.05404,-0.013286,-0.028545,-0.036693,-0.066163,0.066957,0.037755,-0.006277,0.002566,-0.024979,-0.015172,0.064008,0.01943,-0.011093,0,8.87,10,74,10,1.0,4,0.054054,0,0,0,0,1,0,0,0,0.49492
7,0.042125,0.032824,0.041815,0.032828,0.018703,0.035636,0.055852,-0.034822,-0.167416,-0.061652,-0.321887,-0.191876,0.023302,-0.040002,0.019919,0.04619,0.322125,-0.098029,0.247625,0.000201,0.107681,-0.297889,0.019881,0.054533,0.113876,-0.025065,0.177331,-0.121781,0.161498,0.229722,0.137471,0.006727,-0.120268,-0.000193,-0.004895,0.042666,0.178823,0.009277,-0.024975,0.013973,...,-0.116016,0.19619,0.075337,0.074213,0.072819,-0.022753,-0.519965,0.051669,-0.008504,0.18628,-0.085965,-0.032715,-0.01088,-0.03044,-0.090603,0.035355,-0.10404,0.007078,-0.057798,0.120996,0.129047,0.049651,0.018413,1,9.51,47,299,41,0.87234,11,0.036789,1,0,0,0,1,0,0,-1,0.77744
5,0.058386,0.038379,0.046785,0.022875,0.033437,0.031639,0.060722,-0.054776,-0.171845,-0.05659,-0.402166,-0.189891,0.026178,-0.014966,0.02566,0.052457,0.30628,-0.082665,0.234964,-0.009989,0.105262,-0.298171,0.024785,0.04352,0.110536,-0.040086,0.199149,-0.108274,0.163617,0.244227,0.137974,0.001222,-0.064172,-0.00521,-0.009182,0.059573,0.19636,-5.5e-05,-0.025231,0.024885,...,-0.119322,0.210446,0.066195,0.075881,0.062632,-0.009801,-0.545824,0.046806,-0.000202,0.160817,-0.086686,-0.033568,-0.015791,-0.030575,-0.081538,0.049051,-0.113465,0.003869,-0.030815,0.121997,0.136461,0.061591,0.027793,1,12.78,24,161,20,0.833333,8,0.049689,0,0,0,1,1,0,0,-1,0.379103


In [None]:
final_df.to_pickle("{}:{}_features_codemix.pkl".format(LAMBDA1*100, LAMBDA2*100))