In [1]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.nn.functional.normalize(torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9))

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
stops = stopwords.words('english')
from sklearn.svm import LinearSVC,SVC
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import cross_validate,StratifiedKFold
from collections import Counter

def inference_sbert(sentences):
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/multi-qa-mpnet-base-dot-v1')
    model = AutoModel.from_pretrained('sentence-transformers/multi-qa-mpnet-base-dot-v1')
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings

def model(dfs ):
    global df_train
    train_index,test_index = dfs
    dftrain, df_test = df_train.iloc[train_index], df_train.iloc[test_index]
    embeddings_ = True
    tfidf_ = False
    global Domains
    model=LinearSVC(**{'C':4.238})
    model.classes_ = Domains
    x_train = dftrain['Title_lowered']
    x_test = df_test['Title_lowered']
    y_train = dftrain['Domain']
    
    embeddings = np.array(list(dftrain['embeddings'].values))
    embeddings_test = np.array(list(df_test['embeddings'].values))
    tfidf = TfidfVectorizer(stop_words=stops,ngram_range=(1,3),max_features=900)
    x_train_f = tfidf.fit_transform(x_train.values)
    x_test_f = tfidf.fit_transform(x_test.values)
    if embeddings_ == True and tfidf_==True:
        x_to_train = np.hstack((embeddings,x_train_f.todense()))
        x_to_test = np.hstack((embeddings_test,x_test_f.todense()))
    if embeddings_ == False and tfidf_ == True:
        x_to_train=x_train_f
        x_to_test=x_test_f
    if embeddings_ == True and tfidf_ == False:
        x_to_train = embeddings
        x_to_test = embeddings_test
    model.fit(x_to_train, y_train)
    y_preds = model.predict(x_to_test)
    return y_preds,model,df_test



LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/home/nikkokks/nltk_data'
    - '/home/nikkokks/anaconda3/envs/doctr3/nltk_data'
    - '/home/nikkokks/anaconda3/envs/doctr3/share/nltk_data'
    - '/home/nikkokks/anaconda3/envs/doctr3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
df_train = pd.read_csv('data/train.csv').sample(frac=1).drop_duplicates('Title')
df_train['Title_lowered'] = df_train['Title'].map(lambda text : str(text).lower())

sentences_train = df_train['Title'].map(str).values.tolist()
sentences_embeddings = inference_sbert(sentences_train)
df_train['embeddings'] = sentences_embeddings.numpy().tolist()
#sentences_test = df_test['Title'].map(str).values.tolist()
#sentences_embeddings = inference_sbert(sentences_test)
#df_test['embeddings'] = sentences_embeddings.numpy().tolist()

In [None]:
df_train['Title_lowered'] = df_train['Title'].map(lambda text : str(text).lower())

In [None]:
df_train = df_train.drop_duplicates(subset='Title_lowered')

In [None]:
df_train = df_train.reset_index()

In [None]:
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
import pickle

In [None]:
Domains = np.unique(df_train['Domain'].values)

In [None]:
from multiprocessing import Pool

In [None]:
def training_crossval_models(df_train):
    best_models = []
    moyenne = 0
    for i in tqdm(range(20)):
        y_preds_class = None
        
        skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=np.random.randint(9,999))
        liste = [(i,j) for i,j in skf.split(df_train['Title_lowered'], df_train['Domain'])]
        p= Pool(31)
        liste = p.map(model,liste)
        p.close()
        
        best_models = []
        y_preds_class = []
        ypro_test = []
        for i,j,k in liste:
            y_preds_class.append(i)
            best_models.append(j)
            ypro_test.append(k['Domain'].values)
            
        y_preds_class = np.concatenate(y_preds_class)
        ypro_test = np.concatenate(ypro_test)
        f1_macro = classification_report(ypro_test,y_preds_class,output_dict='csv')['macro avg']['f1-score']
        moyenne += f1_macro
    moyenne/=20
    with open('best_models.pkl','wb') as f:
        pickle.dump(best_models,f)
    return moyenne

In [None]:
def inference_proba(df_train):
    y_preds_class = None
    with open('best_models.pkl','rb') as f:
        best_models = pickle.load(f)
    embeddings_train = np.array(list(df_train['embeddings'].values))
    liste_predictions= []
    labels = []
    for model in best_models:
        preds = model._predict_proba_lr(embeddings_train)
        enc = OneHotEncoder()
        enc.fit(np.array(Domains).reshape(-1,1))
        y_preds_class =  enc.transform(df_train['Domain'].values.reshape(-1,1))
        labels.append( y_preds_class)
        liste_predictions.append( preds)
    return labels, liste_predictions

In [None]:
def classe_estimation(y_preds_class,y_pro_test,df_train):
    global Domains
    
    confidence_predictions = []
    
    best_index = 0

    best_value = -np.inf
    liste = [list(Domains).index('Misc'), list(Domains).index('Other'),list(Domains).index('Resources')]
        
    for j_to_test in range(7):
    
        labels_to_test = np.zeros((y_preds_class[0].shape[0],7))
        labels_to_test[:,j_to_test] = 1
        labels_to_avoid = np.zeros((y_preds_class[0].shape[0],7))
        confidence_predictions = np.zeros((y_preds_class[0].shape[0],))
        
        for i in range(len(y_preds_class)):
            labels = y_pro_test[i]
            predictions = y_preds_class[i]
            neg_entropy = np.mean(predictions*np.log(predictions),axis=1)
            labels = np.array(labels.todense())
            predictions = np.array(predictions)
            crossentropy = np.mean(labels_to_test*np.log(predictions),axis=1)
            loss = (neg_entropy+crossentropy)/2
            indexes  = labels.argmax(axis=1)
            mask = indexes == j_to_test
            loss[mask] = -np.inf
            confidence_predictions+=loss

        confidence_predictions= np.exp(confidence_predictions/(len(y_preds_class)-1))
        value = np.max(confidence_predictions)
        #if value>best_value:
        #    best_value=value
        #    best_index = np.argmax(confidence_predictions)
        #    new_class = Domains[j_to_test]
        best_value=value
        if best_value >0.2:
            best_index = np.argmax(confidence_predictions)
            new_class = Domains[j_to_test]
            print('{} : {} :  {}'.format(best_value,new_class,df_train['Title_lowered'].values[best_index]))
            labelss = df_train['Domain'].values
            labelss[best_index] = new_class
            df_train['Domain'] = labelss
    return df_train

In [None]:
import copy 
early_stopping= 0
best_f1_macro = -np.inf

while early_stopping<20:
    moyenne = training_crossval_models(df_train)
    print(moyenne)
    if best_f1_macro< moyenne:
        print(moyenne)
        best_f1_macro = moyenne
        df_final = copy.deepcopy(df_train)
        early_stopping= 0
    else:
        early_stopping += 1
    y_pro_test,y_preds_class= inference_proba(df_train)
    df_train = classe_estimation(y_preds_class,y_pro_test,df_train)


In [17]:
import pickle
with open('df_train.pkl','wb') as f:
    pickle.dump(df_train,f)

In [None]:
dico = {'career':['certification','internship','course','data science program'],
        'Hackathons':['hackathon','hackathons'],
        'Resources':['resource','tutorial'],
        
       }
# mettre un meilleur embedding https://www.sbert.net/docs/pretrained_models.html

In [None]:
df_test = pd.read_csv('data/test.csv')
df_test['Title_lowered'] = df_test['Title'].map(lambda text : str(text).lower())
sentences_test = df_test['Title'].map(str).values.tolist()
sentences_embeddings = inference_sbert(sentences_test)
df_test['embeddings'] = sentences_embeddings.numpy().tolist()

index                                                         2491
ID                                                            2492
Title            Need your Valuable Advice on the career shift ...
Domain                                                      Career
Title_lowered    need your valuable advice on the career shift ...
embeddings       [-0.003127791453152895, -0.009716558270156384,...
Name: 396, dtype: object

In [None]:
import pickle
with open('df_train_correct.pkl','wb') as f:
    pickle.dump(df_final,f)

In [None]:
df_test = pd.read_csv('data/test_.csv')
df_test['Title_lowered'] = df_test['Title'].map(lambda text : str(text).lower())
sentences_test = df_test['Title'].map(str).values.tolist()
sentences_embeddings = inference_sbert(sentences_test)
df_test['embeddings'] = sentences_embeddings.numpy().tolist()

In [None]:
# alimenter Other et Misc par rapport a Techniques, Tools et Carrer

In [None]:
print(classification_report(ypro_test,y_preds_class))

In [18]:
df_test = pd.read_csv('data/test.csv')
df_test['Title_lowered'] = df_test['Title'].map(lambda text : str(text).lower())
sentences_test = df_test['Title'].map(str).values.tolist()
sentences_embeddings = inference_sbert(sentences_test)
df_test['embeddings'] = sentences_embeddings.numpy().tolist()

In [None]:
def inference(model,dftest):
    embeddings_test = np.array(list(dftest['embeddings'].values))
    preds = model.predict(embeddings_test)
    return preds


y_preds_class = []
for index,model in tqdm(enumerate(best_models)):
    dftest = df_test
    preds = inference(model,dftest)
    y_preds_class.append(preds)

In [None]:
y_preds_class = np.array(y_preds_class)
preds = []
for i in range(len(y_preds_class[0])):
    count = Counter(list(y_preds_class[:,i]))
    preds.append(count.most_common(1)[0][0])

In [None]:
df_test['labels_preds'] = preds

In [None]:
df_test = df_test[['ID','labels_preds']].reset_index()[['ID','labels_preds']]

In [None]:
df_sample = pd.read_csv('data/sample_submission_.csv')[['ID']]

In [None]:
df_sample = df_sample.merge(df_test, on='ID', how='left')

In [None]:
df_sample = df_sample.merge(df_train, on='ID', how='left')#.to_csv('sample_submission_1.csv',columns=['ID','Domain'])
df_sample = df_sample[['ID','labels_preds']]
df_sample.columns= ['ID','Domain']

In [None]:
df_sample['Domain'] = df_sample['Domain'].map(str)
df_sample['Domain'] = df_sample['Domain'].astype(str)

In [None]:
df_sample.to_csv('sample_submission_13.csv',columns=['ID','Domain'],index_label=False)