In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.nn.functional.normalize(torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9))

In [290]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
stops = stopwords.words('english')
from sklearn.svm import LinearSVC,SVC
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import cross_validate,StratifiedKFold
from collections import Counter
from imblearn.over_sampling import SMOTE,ADASYN,SMOTENC
import warnings
warnings.filterwarnings("ignore")
from sklearn.naive_bayes import GaussianNB
def inference_sbert(sentences):
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings

def model(df_train,df_test,embeddings_=False,tfidf_=False,model=SVC(**{'C':4.238})):
    
    y_train = df_train['Domain']
    embeddings = np.array(list(df_train['embeddings'].values))
    smote=SMOTE(k_neighbors=1,random_state = 101)
    embeddings, y_train = smote.fit_resample(embeddings, y_train)
    embeddings_test = np.array(list(df_test['embeddings'].values))
    
    if embeddings_ == True and tfidf_==True:
        x_train = df_train['Title_lowered']
        x_test = df_test['Title_lowered']
        tfidf = TfidfVectorizer(stop_words=stops,ngram_range=(1,3),max_features=900)
        x_train_f = tfidf.fit_transform(x_train.values)
        x_test_f = tfidf.fit_transform(x_test.values)
        x_to_train = np.hstack((embeddings,x_train_f.todense()))
        x_to_test = np.hstack((embeddings_test,x_test_f.todense()))
    if embeddings_ == False and tfidf_ == True:
        x_train = df_train['Title_lowered']
        x_test = df_test['Title_lowered']
        tfidf = TfidfVectorizer(stop_words=stops,ngram_range=(1,3),max_features=900)
        x_train_f = tfidf.fit_transform(x_train.values)
        x_test_f = tfidf.fit_transform(x_test.values)
        x_to_train=x_train_f
        x_to_test=x_test_f
    if embeddings_ == True and tfidf_ == False:
        x_to_train = embeddings
        x_to_test = embeddings_test
    model.fit(x_to_train, y_train)
    y_preds = model.predict(x_to_test)
    return y_preds,model

In [291]:
df_train = pd.read_csv('data/train.csv').sample(frac=1).drop_duplicates('Title')
df_train['Title_lowered'] = df_train['Title'].map(lambda text : str(text).lower())

sentences_train = df_train['Title'].map(str).values.tolist()
sentences_embeddings = inference_sbert(sentences_train)
df_train['embeddings'] = sentences_embeddings.numpy().tolist()
#sentences_test = df_test['Title'].map(str).values.tolist()
#sentences_embeddings = inference_sbert(sentences_test)
#df_test['embeddings'] = sentences_embeddings.numpy().tolist()

In [292]:
df_train['Title_lowered'] = df_train['Title'].map(lambda text : str(text).lower())

In [293]:
df_train = df_train.reset_index()

In [294]:
df = df_train

In [295]:
df_train, df_test = train_test_split(df_train,test_size=0.1)

In [296]:
df_train = df

In [297]:
train_indexes= []
test_indexes = []
best_models = []
maximums = []
means = []
for i in range(40):
    mean = []
    maximum = 0
    y_preds_class = None
    df_train['Domain_{}'.format(i)] = None
    skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=np.random.randint(9,999))
    for train_index, test_index in tqdm(skf.split(df_train['embeddings'], df_train['Domain'])):
        dftrain, dftest = df_train.iloc[train_index], df_train.iloc[test_index]
        dftest_,modell = model(dftrain,dftest,embeddings_=True,tfidf_=False,model=GaussianNB())

        if not isinstance(y_preds_class,type(dftest_)):
            ypro_test = dftest['Domain'].values
            y_preds_class= dftest_
        else:
            ypro_test = np.concatenate((ypro_test,dftest['Domain'].values))
            y_preds_class = np.concatenate((y_preds_class,dftest_))
        df_train.loc[test_index,'Domain_{}'.format(i)] = dftest_
        f1_macro = classification_report(dftest['Domain'].values,dftest_,output_dict='csv')['macro avg']['f1-score']
        mean.append(f1_macro)
        if f1_macro>maximum:
            maximum = f1_macro
            ypro_test_ = dftest['Domain'].values
            train_indexes_ =train_index
            test_indexes_ = test_index
            best_modell = modell
    means.append(np.mean(mean))
    print(np.mean(means),np.std(means))
    best_models.append(best_modell)
    train_indexes.append(train_indexes_)
    test_indexes.append(test_indexes_)

6it [00:02,  2.07it/s]


0.4829542106007931 0.0


6it [00:02,  2.06it/s]


0.48099777720929865 0.0019564333914944276


6it [00:02,  2.52it/s]


0.48143913160259877 0.0017150340554073335


6it [00:02,  2.17it/s]


0.4794727047166948 0.0037157112981531792


6it [00:02,  2.79it/s]


0.481674943334904 0.0055176650937789114


6it [00:02,  2.31it/s]


0.4820700340709101 0.005113805515886498


6it [00:02,  2.15it/s]


0.48323657395542907 0.0055299222910731635


6it [00:02,  2.25it/s]


0.48379849704460426 0.00538217784769154


6it [00:02,  2.41it/s]


0.48401948748721807 0.0051127178565457095


6it [00:02,  2.31it/s]


0.48354753455532756 0.005052776624971466


6it [00:02,  2.30it/s]


0.48382596027028446 0.00489742801590791


6it [00:02,  2.29it/s]


0.48375620641909123 0.00469463410048134


6it [00:02,  2.39it/s]


0.48323424979658947 0.004859372480138725


6it [00:02,  2.52it/s]


0.4827203233444095 0.005035909792899095


6it [00:02,  2.55it/s]


0.4831484082969812 0.0051220395467748816


6it [00:02,  2.89it/s]


0.48305999848807574 0.004971199851491167


6it [00:02,  2.64it/s]


0.4829638247118525 0.004838090793343752


6it [00:02,  2.48it/s]


0.4827267961499088 0.004802273419416587


6it [00:03,  1.88it/s]


0.4830815798264145 0.004910573866347949


6it [00:02,  2.40it/s]


0.48299454876003567 0.004801245946189585


6it [00:02,  2.19it/s]


0.48284217496606857 0.004734828992803614


6it [00:02,  2.25it/s]


0.4829361696875404 0.004645978177432783


6it [00:02,  2.09it/s]


0.4830500771397792 0.00457515879618839


6it [00:02,  2.11it/s]


0.4828340704443519 0.004597071091471188


6it [00:02,  2.57it/s]


0.4826455756855223 0.00459787632432501


6it [00:02,  2.92it/s]


0.4824960566521391 0.004570149926922116


6it [00:03,  1.86it/s]


0.4823922806713113 0.004515828893456392


6it [00:02,  2.21it/s]


0.48234791874139066 0.004440443106786485


6it [00:02,  2.06it/s]


0.4822469339021765 0.004395811897951201


6it [00:02,  2.76it/s]


0.48210666476660846 0.004387441659497521


6it [00:02,  2.16it/s]


0.48197500730651915 0.004375922605710374


6it [00:03,  1.90it/s]


0.4816853176395759 0.0045991113288834165


6it [00:02,  2.03it/s]


0.4817488264602726 0.004543118804396037


6it [00:02,  2.21it/s]


0.4816090070914517 0.0045473073126568835


6it [00:03,  1.72it/s]


0.4815972117969347 0.004482402711980064


6it [00:02,  2.26it/s]


0.48146002575835556 0.004493609363742525


6it [00:02,  2.29it/s]


0.48139465597617853 0.00444978838767692


6it [00:02,  2.68it/s]


0.4811570703637973 0.0046225624921137735


6it [00:03,  1.96it/s]


0.48127376523665977 0.004619270153531952


6it [00:02,  2.28it/s]

0.48128395953403497 0.004561608088907728





In [299]:
def inference(model,dftest):
    embeddings_test = np.array(list(dftest['embeddings'].values))
    preds = model.predict_proba(embeddings_test)
    return preds

y_preds_class = None
liste_ = []
for index,model in tqdm(enumerate(best_models)):
    dftrain, dftest = _, df_train
    preds = inference(model,dftest)
    if not isinstance(y_preds_class,type(preds)):
        ypro_test = dftest['Domain'].values
        y_preds_class= preds
    else:
        ypro_test = np.concatenate((ypro_test,dftest['Domain'].values))
        y_preds_class = np.concatenate((y_preds_class,preds))
    liste_.append(preds)

40it [00:04,  8.21it/s]


In [300]:
from sklearn.preprocessing import OneHotEncoder

In [317]:
features = []
for i in tqdm(range(3835)):
    listesqg = []
    for j in liste_:
        listesqg+=list(j[i])
    features.append(listesqg)

100%|██████████| 3835/3835 [00:00<00:00, 14337.26it/s]


In [321]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
clf = XGBClassifier(scale_pos_weight=5)

In [322]:
df_train['features'] = features

In [325]:
clf.fit(features,df_train['Domain'])

Parameters: { "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [327]:
df_test = pd.read_csv('data/test_.csv')
df_test['Title_lowered'] = df_test['Title'].map(lambda text : str(text).lower())
sentences_test = df_test['Title'].map(str).values.tolist()
sentences_embeddings = inference_sbert(sentences_test)
df_test['embeddings'] = sentences_embeddings.numpy().tolist()

In [330]:
def inference(model,dftest):
    embeddings_test = np.array(list(dftest['embeddings'].values))
    preds = model.predict_proba(embeddings_test)
    return preds

y_preds_class = []
for index,model in tqdm(enumerate(best_models)):
    dftest = df_test
    preds = inference(model,dftest)
    y_preds_class.append(preds)

40it [00:02, 18.60it/s]


In [331]:
len(y_preds_class[0])

1649

In [332]:
features = []
for i in tqdm(range(len(y_preds_class[0]))):
    listesqg = []
    for j in y_preds_class:
        listesqg+=list(j[i])
    features.append(listesqg)

100%|██████████| 1649/1649 [00:00<00:00, 13612.78it/s]


In [334]:
preds = clf.predict(features)

In [339]:
df_test['labels_preds'] = preds

In [340]:
df_test = df_test[['ID','labels_preds']].reset_index()[['ID','labels_preds']]

In [341]:
df_sample = pd.read_csv('data/sample_submission_.csv')[['ID']]

In [342]:
df_sample = df_sample.merge(df_test, on='ID', how='left')

In [343]:
df_sample = df_sample.merge(df_train, on='ID', how='left')#.to_csv('sample_submission_1.csv',columns=['ID','Domain'])
df_sample = df_sample[['ID','labels_preds']]
df_sample.columns= ['ID','Domain']

In [344]:
df_sample['Domain'] = df_sample['Domain'].map(str)
df_sample['Domain'] = df_sample['Domain'].astype(str)

In [345]:
df_sample.to_csv('sample_submission_23.csv',columns=['ID','Domain'],index_label=False)