In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Mask-aware mean of the token embeddings, followed by normalisation.

    Args:
        model_output: Indexable whose first element holds the token embeddings.
        attention_mask: Mask that is broadcast to the size of the token
            embeddings; positions with value 0 contribute nothing to the mean.

    Returns:
        The masked mean over dimension 1, passed through
        ``torch.nn.functional.normalize`` with its default arguments.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = torch.sum(hidden_states * mask, 1)
    # Clamp the per-row token count so a fully masked row never divides by zero.
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    pooled = masked_sum / token_counts
    return torch.nn.functional.normalize(pooled)

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
stops = stopwords.words('english')
from sklearn.svm import LinearSVC,SVC
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import cross_validate,StratifiedKFold
from collections import Counter

def inference_sbert(sentences, batch_size=64):
    """Embed sentences with the multi-qa-mpnet-base-dot-v1 checkpoint.

    Sentences are encoded in chunks of ``batch_size`` instead of one giant
    padded batch, so memory use no longer grows with the size of the corpus.

    Args:
        sentences: A list of strings (a single string is treated as a
            one-element list).
        batch_size: Number of sentences sent through the model at once.

    Returns:
        A 2-D tensor with one L2-normalised embedding row per sentence. An
        empty input yields a tensor with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    if isinstance(sentences, str):
        sentences = [sentences]
    sentences = list(sentences)

    checkpoint = 'sentence-transformers/multi-qa-mpnet-base-dot-v1'
    # NOTE(review): tokenizer and weights are reloaded on every call; hoist
    # them out if this function is called repeatedly.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)
    model.eval()

    if not sentences:
        # The tokenizer cannot encode an empty batch; return an empty matrix.
        return torch.empty((0, model.config.hidden_size))

    # NOTE(review): the model card for this "-dot-v1" checkpoint appears to
    # use CLS pooling without normalisation; confirm that mean pooling plus
    # normalisation is what is wanted here.
    chunks = []
    with torch.no_grad():
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            model_output = model(**encoded_input)
            pooled = mean_pooling(model_output, encoded_input['attention_mask'])
            chunks.append(F.normalize(pooled, p=2, dim=1))
    return torch.cat(chunks, dim=0)

def model(df_train,df_test,embeddings_=False,tfidf_=False,model=None):
    """Fit a classifier on one split and predict the other.

    Args:
        df_train: Frame with a ``'Domain'`` label column plus ``'embeddings'``
            and/or ``'Title_lowered'``, depending on the selected features.
        df_test: Frame with the same feature columns as ``df_train``.
        embeddings_: Use the pre-computed ``'embeddings'`` column as features.
        tfidf_: Use TF-IDF n-grams of ``'Title_lowered'`` as features.
        model: Estimator exposing ``fit``/``predict``. When omitted, a fresh
            ``SVC(C=4.238)`` is created per call so that no fitted state is
            shared between calls.

    Returns:
        Tuple ``(y_preds, model)``: predictions for ``df_test`` and the fitted
        estimator.

    Raises:
        ValueError: If neither ``embeddings_`` nor ``tfidf_`` is enabled.
    """
    if not embeddings_ and not tfidf_:
        raise ValueError('At least one of embeddings_ / tfidf_ must be True')
    if model is None:
        model = SVC(C=4.238)

    y_train = df_train['Domain']

    # Build only the feature sets that were asked for.
    if embeddings_:
        embeddings = np.array(list(df_train['embeddings'].values))
        embeddings_test = np.array(list(df_test['embeddings'].values))
    if tfidf_:
        tfidf = TfidfVectorizer(stop_words=stops,ngram_range=(1,3),max_features=900)
        x_train_f = tfidf.fit_transform(df_train['Title_lowered'].values)
        # Reuse the vocabulary learnt on the training split: re-fitting on the
        # test split would produce columns that do not line up with training.
        x_test_f = tfidf.transform(df_test['Title_lowered'].values)

    if embeddings_ and tfidf_:
        # toarray() yields a plain ndarray (todense() would give np.matrix).
        x_to_train = np.hstack((embeddings, x_train_f.toarray()))
        x_to_test = np.hstack((embeddings_test, x_test_f.toarray()))
    elif tfidf_:
        x_to_train = x_train_f
        x_to_test = x_test_f
    else:
        x_to_train = embeddings
        x_to_test = embeddings_test

    model.fit(x_to_train, y_train)
    y_preds = model.predict(x_to_test)
    return y_preds,model

In [3]:
import pickle
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('df_train.pkl', mode='rb') as train_pickle:
    df_train = pickle.load(train_pickle)

In [4]:
# Lower-cased titles are the text input of the TF-IDF features.
df_train['Title_lowered'] = df_train['Title'].map(str).str.lower()
print('ok1')
# The sentence encoder receives the titles in their original casing.
sentences_train = [str(title) for title in df_train['Title']]
print('ok2')
sentences_embeddings = inference_sbert(sentences_train)
print('ok3')
# Store each embedding as a plain Python list, one per row.
df_train['embeddings'] = sentences_embeddings.numpy().tolist()
print('ok4')
# Test-set embeddings are computed in a later cell, once df_test is loaded.

ok1
ok2
ok3
ok4


In [5]:
# Same assignment as in the embedding cell; re-running it gives the same column.
df_train['Title_lowered'] = df_train['Title'].map(str).str.lower()

In [6]:
# Move the current index into a column and install a fresh 0..n-1 index; the
# cross-validation cell assigns with df_train.loc[test_index, ...] using the
# positional indices produced by the splitter.
# NOTE(review): not idempotent; re-running this cell inserts another index column.
df_train = df_train.reset_index()

In [7]:
import warnings
# Silences every warning from here on (no category or module filter is given).
warnings.filterwarnings("ignore")

In [9]:
# Repeated stratified k-fold cross-validation on the sentence embeddings.
# For every repetition the model of the fold with the highest macro-F1 is kept,
# together with the train/test indices of that fold.
N_REPEATS = 20
N_SPLITS = 10
# Seeded generator so that the fold shuffles are reproducible across runs.
seed_rng = np.random.RandomState(42)

train_indexes = []
test_indexes = []
best_models = []
for i in range(N_REPEATS):
    # Start below any attainable score and reset the per-repetition winners,
    # so a repetition can never re-append the winner of the previous one.
    maximum = -1.0
    best_modell = train_indexes_ = test_indexes_ = None
    fold_truth, fold_preds = [], []
    oof_column = 'Domain_{}'.format(i)
    df_train[oof_column] = None
    oof_column_pos = df_train.columns.get_loc(oof_column)
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=seed_rng.randint(9, 999))
    for train_index, test_index in tqdm(skf.split(df_train['Title_lowered'], df_train['Domain']), total=N_SPLITS):
        dftrain, dftest = df_train.iloc[train_index], df_train.iloc[test_index]
        # `model` here is the training helper defined at the top of the notebook.
        dftest_, modell = model(dftrain, dftest, embeddings_=True, tfidf_=False, model=SVC(**{'C': 4.238}))

        fold_truth.append(dftest['Domain'].values)
        fold_preds.append(dftest_)
        # The splitter yields positions, so write the out-of-fold predictions
        # positionally instead of relying on the index labels matching them.
        df_train.iloc[test_index, oof_column_pos] = dftest_
        f1_macro = classification_report(dftest['Domain'].values, dftest_, output_dict=True)['macro avg']['f1-score']

        if f1_macro > maximum:
            maximum = f1_macro
            train_indexes_ = train_index
            test_indexes_ = test_index
            best_modell = modell
    # Out-of-fold labels and predictions of this repetition, pooled over folds.
    ypro_test = np.concatenate(fold_truth)
    y_preds_class = np.concatenate(fold_preds)
    print(maximum)
    best_models.append(best_modell)
    train_indexes.append(train_indexes_)
    test_indexes.append(test_indexes_)

10it [01:40, 10.05s/it]


0.6517928768060973


10it [01:39,  9.91s/it]


0.668692123091317


10it [01:00,  6.08s/it]


0.6738282297588241


10it [00:51,  5.13s/it]


0.6675385706354623


0it [00:05, ?it/s]


KeyboardInterrupt: 

In [10]:
# Load the test titles and attach their sentence embeddings.
df_test = pd.read_csv('data/test.csv')
df_test['Title_lowered'] = df_test['Title'].map(str).str.lower()
sentences_test = [str(title) for title in df_test['Title']]
sentences_embeddings = inference_sbert(sentences_test)
df_test['embeddings'] = sentences_embeddings.numpy().tolist()

In [11]:
def inference(model,dftest):
    """Predict a label for every row of ``dftest``.

    Args:
        model: Object exposing ``predict``.
        dftest: Frame whose ``'embeddings'`` column holds one vector per row.

    Returns:
        Whatever ``model.predict`` returns for the stacked embedding matrix.
    """
    feature_matrix = np.asarray(dftest['embeddings'].tolist())
    return model.predict(feature_matrix)

# Re-score every retained model on the fold it was selected on and pool the
# results for the classification report.
# NOTE(review): each model is scored on the very fold that made it the winner,
# so the pooled report is likely optimistic.
# The loop variable is deliberately not called `model`: that name is the
# training helper, and rebinding it breaks a re-run of the cross-validation
# cell. df_test is no longer re-read here either; this cell never used it and
# the re-read discarded the embeddings computed in the previous cell.
true_parts, pred_parts = [], []
for index, fold_model in tqdm(enumerate(best_models), total=len(best_models)):
    dftest = df_train.iloc[test_indexes[index]]
    preds = inference(fold_model, dftest)
    true_parts.append(dftest['Domain'].values)
    pred_parts.append(preds)
ypro_test = np.concatenate(true_parts)
y_preds_class = np.concatenate(pred_parts)

4it [00:02,  1.61it/s]


In [13]:
# Per-class precision / recall / F1 of y_preds_class against ypro_test.
print(classification_report(ypro_test,y_preds_class))

              precision    recall  f1-score   support

      Career       0.79      0.82      0.81       179
  Hackathons       0.84      0.75      0.79       135
        Misc       0.82      0.24      0.37        38
       Other       0.79      0.46      0.58        74
   Resources       0.65      0.46      0.54        70
  Techniques       0.77      0.86      0.81       670
       Tools       0.75      0.77      0.76       368

    accuracy                           0.77      1534
   macro avg       0.77      0.62      0.67      1534
weighted avg       0.77      0.77      0.76      1534



In [100]:
# NOTE(review): this cell carries execution count 100, out of sequence with its
# neighbours, and the report saved beneath it covers 6901 rows rather than the
# 1534 of the previous cell. It presumably reflects an earlier state of
# ypro_test / y_preds_class; confirm before relying on it.
print(classification_report(ypro_test,y_preds_class))

              precision    recall  f1-score   support

      Career       0.80      0.84      0.82       788
  Hackathons       0.77      0.60      0.67       472
        Misc       0.76      0.24      0.36       134
       Other       0.26      0.05      0.08       220
   Resources       0.48      0.36      0.41       306
  Techniques       0.74      0.85      0.79      3330
       Tools       0.69      0.65      0.67      1651

    accuracy                           0.73      6901
   macro avg       0.64      0.51      0.54      6901
weighted avg       0.71      0.73      0.71      6901



In [14]:
# Reload the test set from disk and rebuild its text and embedding columns.
df_test = pd.read_csv('data/test.csv')
test_titles = df_test['Title'].map(str)
df_test['Title_lowered'] = test_titles.str.lower()
sentences_test = test_titles.tolist()
sentences_embeddings = inference_sbert(sentences_test)
df_test['embeddings'] = sentences_embeddings.numpy().tolist()

In [15]:
def inference(model,dftest):
    """Run ``model.predict`` on the stacked ``'embeddings'`` column of ``dftest``.

    Args:
        model: Object exposing ``predict``.
        dftest: Frame whose ``'embeddings'`` column holds one vector per row.

    Returns:
        The value returned by ``model.predict``.
    """
    stacked = np.asarray(dftest['embeddings'].tolist())
    return model.predict(stacked)


# One prediction vector over the whole test set per retained model.
# A comprehension keeps the loop variable local, so the training helper named
# `model` is no longer overwritten by an estimator instance.
y_preds_class = [inference(fold_model, df_test) for fold_model in tqdm(best_models)]

4it [00:19,  4.84s/it]


In [24]:
# Majority vote: rows of y_preds_class are models, columns are test samples.
y_preds_class = np.array(y_preds_class)
n_test_rows = len(y_preds_class[0])
preds = [
    Counter(list(y_preds_class[:, row])).most_common(1)[0][0]
    for row in range(n_test_rows)
]

In [25]:
# Attach the majority-vote label of each test row (preds is ordered like df_test).
df_test['labels_preds'] = preds

In [26]:
# Keep only the identifier and the predicted label, on a fresh 0..n-1 index.
df_test = df_test.loc[:, ['ID', 'labels_preds']].reset_index(drop=True)

In [27]:
# Only the ID column of the sample submission is needed; it fixes the row order.
df_sample = pd.read_csv('data/sample_submission.csv').loc[:, ['ID']]

In [28]:
# Left join keeps every sample ID, even one without a prediction.
df_sample = pd.merge(df_sample, df_test, how='left', on='ID')

In [29]:
# Keep the ID and the predicted label, renamed to the submission column name.
# The former left-merge with df_train was dropped: none of its columns
# survived the selection below, and duplicate IDs in df_train could only
# multiply submission rows.
df_sample = df_sample[['ID', 'labels_preds']].rename(columns={'labels_preds': 'Domain'})

In [30]:
# Force string labels.
# NOTE(review): an ID left without a prediction by the left join would
# presumably end up as the literal text 'nan'; confirm that cannot happen.
df_sample['Domain'] = df_sample['Domain'].map(str).astype(str)

In [31]:
# Write exactly the two submission columns. index=False leaves the row index
# out of the file; the previous index_label=False only suppressed its header
# field, so every data row had one more field than the header line.
df_sample.to_csv('sample_submission_14.csv', columns=['ID', 'Domain'], index=False)