In [191]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer

In [71]:
# Load the training set; the path is relative to the notebook's working directory.
data = pd.read_csv("../dataset/train.csv")

In [138]:
# Preview the first rows of the raw training data.
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [188]:
# Summarise column dtypes and non-null counts to see which columns have missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
id          7613 non-null int64
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [161]:
def flags_for_nan(df, imputer="none", dtype="object"):
    """
    Impute missing values and add a binary missing-value flag per affected column.

    Only columns that contain at least one missing value AND whose dtype equals
    ``dtype`` are processed. For each such column ``col`` a new column
    ``col + "_missing"`` is appended and the missing entries of ``col`` are
    replaced with ``imputer``.

    Parameters
    ----------
    df: pandas dataframe
        Input frame. It is not modified; a copy is returned.
    imputer: value used to fill missing values (default: the string "none")
    dtype: data type a column must have to be processed (default: "object")

    Returns
    -------
    Pandas dataframe with values filled and integer flag columns

    1 = value was missing
    0 = value was present
    """
    # Copy the argument (not a notebook-level global) so the function works on
    # whatever frame the caller passes and leaves that frame untouched.
    filled = df.copy()
    # Iterate over the input's columns so the flag columns added below are
    # never themselves inspected.
    for col in df.columns:
        column = filled[col]
        if column.isna().any() and column.dtypes == dtype:
            # Record missingness before the values are overwritten.
            filled[col + "_missing"] = column.isna().astype("int")
            filled[col] = column.fillna(value=imputer)
    return filled

In [175]:
# Build a new frame in which missing object-column values are filled and a
# "<column>_missing" 0/1 flag is added for each column that had missing values.
new_df = flags_for_nan(data)
new_df.head()

Unnamed: 0,id,keyword,location,text,target,keyword_missing,location_missing
0,1,none,none,Our Deeds are the Reason of this #earthquake M...,1,1,1
1,4,none,none,Forest fire near La Ronge Sask. Canada,1,1,1
2,5,none,none,All residents asked to 'shelter in place' are ...,1,1,1
3,6,none,none,"13,000 people receive #wildfires evacuation or...",1,1,1
4,7,none,none,Just got sent this photo from Ruby #Alaska as ...,1,1,1


In [176]:
# Look at a longer slice (first 100 rows) to also see rows where keyword/location are present.
new_df.head(100)

Unnamed: 0,id,keyword,location,text,target,keyword_missing,location_missing
0,1,none,none,Our Deeds are the Reason of this #earthquake M...,1,1,1
1,4,none,none,Forest fire near La Ronge Sask. Canada,1,1,1
2,5,none,none,All residents asked to 'shelter in place' are ...,1,1,1
3,6,none,none,"13,000 people receive #wildfires evacuation or...",1,1,1
4,7,none,none,Just got sent this photo from Ruby #Alaska as ...,1,1,1
...,...,...,...,...,...,...,...
95,137,accident,Charlotte,9 Mile backup on I-77 South...accident blockin...,1,0,0
96,138,accident,"Baton Rouge, LA",Has an accident changed your life? We will hel...,0,0,0
97,139,accident,"Hagerstown, MD",#BREAKING: there was a deadly motorcycle car a...,1,0,0
98,141,accident,"Gloucestershire , UK",@flowri were you marinading it or was it an ac...,0,0,0


In [186]:
def smoothed_target_encoding(df, feature, target, weight=0, test=None):
    """
    Encode a categorical feature by the smoothed mean of the target per category.

    For each category the encoding is

        (count * category_mean + weight * global_mean) / (count + weight)

    so ``weight=0`` yields the plain per-category mean and larger weights pull
    categories with few rows towards the global target mean.

    Parameters
    ----------
    df: pandas dataframe used to compute the encoding (not modified)
    feature: name of the categorical column to encode
    target: name of the numeric target column
    weight: smoothing strength towards the global mean (default 0 = no smoothing)
    test: optional pandas dataframe containing ``feature`` to encode with the
        statistics learned from ``df``

    Returns
    -------
    Pandas series with the encoded feature for ``df`` when ``test`` is None,
    otherwise a tuple ``(encoded_df_series, encoded_test_series)``.
    Test values with no encoding learned from ``df`` (unseen or missing
    categories) are encoded with the global target mean.
    """
    global_mean = df[target].mean()

    stats = df.groupby(feature)[target].agg(["count", "mean"])
    counts = stats["count"]
    means = stats["mean"]

    smooth = (counts * means + weight * global_mean) / (counts + weight)

    train_encoded = df[feature].map(smooth)

    # Identity check: `test == None` on a DataFrame is an element-wise
    # comparison whose truth value is ambiguous and raises inside `if`.
    if test is None:
        return train_encoded

    # Categories absent from `df` have no entry in `smooth`; fall back to the
    # global mean rather than propagating NaN into the test features.
    test_encoded = test[feature].map(smooth).fillna(global_mean)
    return train_encoded, test_encoded

In [190]:
# Encode each keyword by its mean target value. With the default weight=0 the
# formula reduces to the plain per-keyword mean (no pull towards the global mean).
# NOTE(review): the encoding is computed from the same rows it is attached to,
# so each row's own target contributes to its feature — confirm this is refit on
# training folds only before it is used for modelling.
new_df["keyword_enc"] = smoothed_target_encoding(new_df, "keyword", "target")
new_df.head()

Unnamed: 0,id,keyword,location,text,target,keyword_missing,location_missing,keyword_enc
0,1,none,none,Our Deeds are the Reason of this #earthquake M...,1,1,1,0.688525
1,4,none,none,Forest fire near La Ronge Sask. Canada,1,1,1,0.688525
2,5,none,none,All residents asked to 'shelter in place' are ...,1,1,1,0.688525
3,6,none,none,"13,000 people receive #wildfires evacuation or...",1,1,1,0.688525
4,7,none,none,Just got sent this photo from Ruby #Alaska as ...,1,1,1,0.688525


In [None]:
# Instantiate a TF-IDF vectorizer with scikit-learn's default settings (not fitted here).
tfidf = TfidfVectorizer()