In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
from sklearn import preprocessing
import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def sp_loc(df, index, columns, val):
    """ Insert data in a DataFrame with SparseDtype format

    Only applicable for pandas version > 0.25

    The targeted columns are converted to dense, written with a regular
    ``.loc[]`` assignment, and converted back to their original sparse
    dtypes. The caller's frame is left untouched; the modified frame is
    returned.

    Args
    ----
    df : DataFrame with series formatted with pd.SparseDtype
    index: str, or list, or slice object
        Same as one would use as first argument of .loc[]
    columns: str, list, or slice
        Same as one would normally use as second argument of .loc[]
    val: DataFrame holding the values to insert
        Must contain one column per targeted column, matched by label.
        Extra columns are ignored; column order does not matter.

    Returns
    -------
    df: DataFrame
        Modified copy of the input, with its columns sorted by label.

    Raises
    ------
    ValueError
        If `val` lacks one of the targeted columns.

    """

    # Resolve `columns` into an explicit list of labels so that the dtype
    # lookup, the dense conversion and the assignment all agree on the order.
    if isinstance(columns, slice):
        col_labels = list(df.loc[:, columns].columns)
    elif pd.api.types.is_list_like(columns) and not isinstance(columns, tuple):
        col_labels = list(columns)
    else:
        col_labels = [columns]

    # Fail loudly instead of silently writing values into the wrong columns.
    missing = [label for label in col_labels if label not in val.columns]
    if missing:
        raise ValueError(
            "val is missing the target column(s): {}".format(missing)
        )

    # reindex returns a new frame, so everything below works on a copy and the
    # caller's frame is never left half-converted. The returned frame keeps
    # the historical layout of this helper: columns sorted by label.
    df = df.reindex(sorted(df.columns), axis=1)

    # Save the original sparse format for reuse later
    spdtypes = df.dtypes[col_labels]

    # Convert concerned Series to dense format
    df[col_labels] = df[col_labels].sparse.to_dense()

    # Align the values with the target columns by label, in target order.
    # (Sorting `val` alone misplaces values when `columns` is not sorted.)
    val_list = val.reindex(columns=col_labels).values.tolist()

    # Do a normal insertion with .loc[]
    df.loc[index, col_labels] = val_list

    # Back to the original sparse format
    df[col_labels] = df[col_labels].astype(spdtypes)

    return df

def one_hot_encoding_predict(df, info):
    """ Work in progress: one-hot encode categorical columns at predict time

    In its current state the function only builds, and prints, one all-zero
    frame per categorical column, using the column labels found in
    ``info[cat].columns``. Nothing is filled in and nothing is returned yet.
    The commented-out block at the end is kept as reference for the pending
    implementation.

    Args
    ----
    df : DataFrame to encode; must contain every column in info['cat_cols']
    info: mapping
        Holds 'cat_cols' (the categorical column names) and, for each of
        those names, an object exposing ``.columns``.

    Returns
    -------
    None

    """
    cat_cols = info['cat_cols']
    # Take an explicit copy: masking the column subset returned by
    # df[cat_cols] in place triggers SettingWithCopyWarning on pandas < 3.
    df_cat = df[cat_cols].copy()
    df_cat[pd.isnull(df_cat)]  = 'NaN'
    
    # Only referenced by the commented-out concat below for now.
    num_cols = [c for c in df.columns if c not in cat_cols]

    ln_df = len(df)
    for cat in cat_cols:
        single_cat_cols = info[cat].columns
        zr = np.zeros((ln_df,len(single_cat_cols)))
        oh_df = pd.DataFrame(zr, columns = single_cat_cols)
        # Placeholder output until the encoding itself is implemented.
        print(oh_df)
    # enc = preprocessing.OneHotEncoder()
    # enc.fit(df_cat)
    # onehotlabels = enc.transform(df_cat)
    # # transformed_df = pd.DataFrame(onehotlabels, columns=enc.get_feature_names_out())
    # transformed_df = pd.DataFrame.sparse.from_spmatrix(onehotlabels, columns=enc.get_feature_names_out(), index=df_cat.index)

    # # Replace nans for distribution
    # for cat in cat_cols:
    #     oh_name = [x for x in transformed_df.columns if cat in x and 'NaN' not in x]
    #     # nan_df = pd.DataFrame(columns=oh_name)
    #     counts = df[cat].dropna().groupby(df[cat].dropna()).count()
    #     percentage = counts/len(counts)
    #     percentage_df = pd.DataFrame(percentage).transpose()
    #     percentage_df = percentage_df.add_prefix(cat + '_')
    #     print(oh_name)
    #     transformed_df = sp_loc(transformed_df, transformed_df[cat + '_NaN'] == 1.0, oh_name, percentage_df)
    #     transformed_df = transformed_df.drop([cat + '_NaN'], axis = 1)
    # return pd.concat([df[num_cols],transformed_df], axis = 1)

In [3]:
# Read ../data/test.csv into a DataFrame, using its `id` column as the row index.
df_test = pd.read_csv("../data/test.csv", index_col='id')
# Display the first rows as a quick check of what was loaded.
df_test.head()

In [5]:
# Load the preprocessing information
# `info` is the mapping consumed by one_hot_encoding_predict, which reads
# info['cat_cols'] and info[<column>].columns.
# NOTE(review): joblib.load unpickles its input and can therefore execute
# arbitrary code; only load artifacts that come from a trusted source.
info = joblib.load("../data/preproc_info.pkl")
# presumably the feature column names used at training time — TODO confirm
data_cols = joblib.load("../data/data_cols.pkl")
# presumably the name of the label column — TODO confirm
target_col = joblib.load("../data/target_col.pkl")

In [9]:
# Inspect the column labels stored under the 'keyword' entry of `info`.
info['keyword'].columns

Index(['keyword_ablaze', 'keyword_accident', 'keyword_aftershock',
       'keyword_airplane_accident', 'keyword_ambulance', 'keyword_annihilated',
       'keyword_annihilation', 'keyword_apocalypse', 'keyword_armageddon',
       'keyword_army',
       ...
       'keyword_weapons', 'keyword_whirlwind', 'keyword_wild_fires',
       'keyword_wildfire', 'keyword_windstorm', 'keyword_wounded',
       'keyword_wounds', 'keyword_wreck', 'keyword_wreckage',
       'keyword_wrecked'],
      dtype='object', name='keyword', length=221)