In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
from sklearn import preprocessing
import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def train_test_classifier(X, Y, model = GradientBoostingClassifier(), random_state = None, test_size = 0.2):
    if test_size > 0:
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=test_size, random_state=random_state)
    else:
        X_train, X_test, y_train, y_test = X, X.iloc[0], Y, Y.iloc[0]

    #Definition and training of the model
    gbc=model
    X_resampled, y_resampled = X_train, y_train

    gbc.fit(X_resampled, y_resampled)

    if test_size > 0:
        score = gbc.score(X_test, y_test)
    else:    
        score = -1
    return gbc, score, X_train, X_test, y_train, y_test

In [3]:
def sp_loc(df, index, columns, val):
    """ Insert data in a DataFrame with SparseDtype format

    Only applicable for pandas version > 0.25

    Args
    ----
    df : DataFrame with series formatted with pd.SparseDtype
    index: str, or list, or slice object
        Same as one would use as first argument of .loc[]
    columns: str, list, or slice
        Same one would normally use as second argument of .loc[]
    val: insert values

    Returns
    -------
    df: DataFrame
        Modified DataFrame

    """

    # Save the original sparse format for reuse later
    spdtypes = df.dtypes[columns]

    # Convert concerned Series to dense format
    df[columns] = df[columns].sparse.to_dense()

    # Ensures the order of the columns is the same
    df = df.reindex(sorted(df.columns), axis=1)
    val = val.reindex(sorted(val.columns), axis=1)
    val_list = val.values.tolist()
    
    # Do a normal insertion with .loc[]
    df.loc[index, columns] = val_list

    # Back to the original sparse format
    df[columns] = df[columns].astype(spdtypes)

    return df

def one_hot_encoding(df, cat_cols):
    df_cat = df[cat_cols]
    #df_cat = df_cat.dropna()
    df_cat[pd.isnull(df_cat)]  = 'NaN'
    
    num_cols = [c for c in df.columns if c not in cat_cols]

    # print(df_cat)
    # le = preprocessing.LabelEncoder()
    # X_2 = df_cat.apply(le.fit_transform)
    # print(X_2)
    enc = preprocessing.OneHotEncoder()
    enc.fit(df_cat)
    onehotlabels = enc.transform(df_cat)
    # transformed_df = pd.DataFrame(onehotlabels, columns=enc.get_feature_names_out())
    transformed_df = pd.DataFrame.sparse.from_spmatrix(onehotlabels, columns=enc.get_feature_names_out(), index=df_cat.index)

    info = {'cat_cols': cat_cols}
    # Replace nans for distribution
    for cat in cat_cols:
        oh_name = [x for x in transformed_df.columns if cat in x and 'NaN' not in x]
        # nan_df = pd.DataFrame(columns=oh_name)
        counts = df[cat].dropna().groupby(df[cat].dropna()).count()
        percentage = counts/len(counts)
        percentage_df = pd.DataFrame(percentage).transpose()
        percentage_df = percentage_df.add_prefix(cat + '_')
        transformed_df = sp_loc(transformed_df, transformed_df[cat + '_NaN'] == 1.0, oh_name, percentage_df)
        transformed_df = transformed_df.drop([cat + '_NaN'], axis = 1)
        info[cat] =  percentage_df.reset_index().drop('index', axis=1)
        #info[cat].index.name = 'index'
    return pd.concat([df[num_cols],transformed_df], axis = 1), info


In [4]:
df_orig = pd.read_csv("../data/train.csv", index_col='id')
df_orig.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
cat_cols = ['keyword']
ignore_cols = ['location', 'target', 'text']
target_col = 'target'

df = df_orig.copy()
df['keyword'] = df_orig['keyword'].str.replace('%20','_')
df, info = one_hot_encoding(df,cat_cols)

data_cols = [x for x in df.columns if x not in ignore_cols]
joblib.dump(info, "../data/preproc_info.pkl")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat[pd.isnull(df_cat)]  = 'NaN'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat[pd.isnull(df_cat)]  = 'NaN'


['../data/preproc_info.pkl']

In [6]:
# sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
sbert_model = SentenceTransformer('all-mpnet-base-v2')

In [7]:
# sentence_embeddings = sbert_model.encode(df['text'].values.tolist())
# joblib.dump(sentence_embeddings, "../data/embedings_all.pkl")

In [8]:
# sentences = ["I ate dinner.", 
#        "We had a three-course meal.", 
#        "Brad came to dinner with us.",
#        "He loves fish tacos.",
#        "In the end, we all felt like we ate too much.",
#        "We all agreed; it was a magnificent evening."]



# print("Original Sentence = ", query)
# for sent in sentences:
#   sim = 1 - cosine(query_vec, sbert_model.encode([sent])[0])
#   print("Sentence = ", sent, "; similarity = ", sim)
sentence_embeddings = joblib.load("../data/embedings_all.pkl")

In [10]:
X = df[data_cols].to_numpy()
y = df[target_col].to_numpy()
X = np.concatenate((X,sentence_embeddings), axis = 1)

joblib.dump(data_cols,"../data/data_cols.pkl")
joblib.dump(target_col,"../data/target_col.pkl")

In [11]:
# gbc, score, X_train, X_test, y_train, y_test = train_test_classifier(X, y)
# joblib.dump(gbc, "../data/trained_model.pkl")
# joblib.dump(X_train, "../data/train_dataset.pkl")
# joblib.dump(X_test, "../data/test_dataset.pkl")
# joblib.dump(y_train, "../data/train_target.pkl")
# joblib.dump(y_test, "../data/test_target.pkl")

['../data/preproc_info.pkl']

In [None]:
# Load the trained model
gbc = joblib.load("../data/trained_model.pkl")

# Load the train and test datasets
X_train = joblib.load("../data/train_dataset.pkl")
X_test = joblib.load("../data/test_dataset.pkl")

# Load the train and test targets
y_train = joblib.load("../data/train_target.pkl")
y_test = joblib.load("../data/test_target.pkl")

In [13]:
y_pred = gbc.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[760,  96],
       [173, 494]], dtype=int64)

In [14]:
from tpot import TPOTClassifier

In [15]:
pipeline_optimizer = TPOTClassifier()
pipeline_optimizer = TPOTClassifier(generations=20, population_size=100, cv=5,
                                    random_state=0, verbosity=2, n_jobs=-1)
pipeline_optimizer.fit(X_train, y_train)

                                                                     
                                                                                
TPOT closed during evaluation in one generation.
                                                                                
                                                                                
TPOT closed prematurely. Will use the current best pipeline.
                                                                                

RuntimeError: A pipeline has not yet been optimized. Please call fit() first.