In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
from sklearn import preprocessing
import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
def sp_loc(df, index, columns, val):
    """ Insert data in a DataFrame with SparseDtype format

    Only applicable for pandas version > 0.25

    Args
    ----
    df : DataFrame with series formatted with pd.SparseDtype
    index: str, or list, or slice object
        Same as one would use as first argument of .loc[]
    columns: str, list, or slice
        Same one would normally use as second argument of .loc[]
    val: insert values

    Returns
    -------
    df: DataFrame
        Modified DataFrame

    """

    # Save the original sparse format for reuse later
    spdtypes = df.dtypes[columns]

    # Convert concerned Series to dense format
    df[columns] = df[columns].sparse.to_dense()

    # Ensures the order of the columns is the same
    df = df.reindex(sorted(df.columns), axis=1)
    val = val.reindex(sorted(val.columns), axis=1)
    val_list = val.values.tolist()
    
    # Do a normal insertion with .loc[]
    df.loc[index, columns] = val_list

    # Back to the original sparse format
    df[columns] = df[columns].astype(spdtypes)

    return df

def one_hot_encoding_predict(df, info):
    cat_cols = info['cat_cols']
    df_cat = df[cat_cols]
    df_cat[pd.isnull(df_cat)]  = 'NaN'
    
    num_cols = [c for c in df.columns if c not in cat_cols]

    ln_df = len(df_cat)
    for cat in cat_cols:
        single_cat_cols = info[cat].columns
        zr = np.zeros((ln_df,len(single_cat_cols)))
        oh_df = pd.DataFrame(zr, columns = single_cat_cols, index=df_cat.index)
        df_cat = pd.concat([df_cat, oh_df], axis=1)

        unq = [x for x in df_cat[cat].unique() if x != 'NaN']
        for u in unq:
            df_cat.loc[df_cat[cat]==u, cat + '_' + u] = 1

        df_cat.loc[df_cat[cat]=='NaN',single_cat_cols] = info[cat].values.tolist()
        df_cat = df_cat.drop(cat, axis=1)
        return df_cat

In [24]:
df_test = pd.read_csv("../data/test.csv", index_col='id')
df_test.head()

Unnamed: 0_level_0,keyword,location,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,,Just happened a terrible car crash
2,,,"Heard about #earthquake is different cities, s..."
3,,,"there is a forest fire at spot pond, geese are..."
9,,,Apocalypse lighting. #Spokane #wildfires
11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [27]:
# Load the preprocessing information
info = joblib.load("../data/preproc_info.pkl")
data_cols = joblib.load("../data/data_cols.pkl")
target_col = joblib.load("../data/target_col.pkl")
gbc = joblib.load("../data/trained_model.pkl")

In [32]:
df = df_test.copy()
df['keyword'] = df_test['keyword'].str.replace('%20','_')
df = one_hot_encoding_predict(df, info)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat[pd.isnull(df_cat)]  = 'NaN'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat[pd.isnull(df_cat)]  = 'NaN'


In [30]:
sbert_model = SentenceTransformer('all-mpnet-base-v2')
# sentence_embeddings = sbert_model.encode(df['text'].values.tolist())
# joblib.dump(sentence_embeddings, "../data/embedings_test.pkl")
sentence_embeddings = joblib.load("../data/embedings_test.pkl")

['../data/embedings_test.pkl']

In [34]:
X = df[data_cols].to_numpy()
X = np.concatenate((X,sentence_embeddings), axis = 1)
y_pred = gbc.predict(X)

In [41]:
subm = pd.DataFrame(y_pred, columns = ['target'], index= df.index)
subm.to_csv("../data/submission.csv")

In [40]:
#score 0.8224

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0
2,1
3,1
9,1
11,1
...,...
10861,1
10865,1
10868,1
10874,1
