In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scripts import remove_links, find_hashtags

In [2]:
# Load the training data from the project's data directory.
df_train = pd.read_csv('./data/train.csv')

### Preprocessing

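I'd like to remove links and append each row's hashtags several times in order to increase the weight of those words within each text block; words that contain the row's keyword are then stripped out. The link-removal function, `remove_links`, is stored in `scripts.py`: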

```Python
def remove_links(text):
    '''
    Lowercase `text` and drop every whitespace-separated word that
    contains 'http'. The remaining words are re-joined with single
    spaces, so the original letter case and spacing are not preserved.
    No other HTML clean-up is performed.
    '''
    
    # Getting rid of links: any word containing 'http' counts as a link
    text = [word for word in text.lower().split() if not 'http' in word]
    text = ' '.join(text)
    
    return text
```

In [26]:
# Replace missing keywords with a placeholder string so the later
# `x['keyword'] in word` substring test always receives a str (it would
# raise TypeError on NaN).
df_train['keyword'] = df_train['keyword'].fillna('NO KEYWORD')

In [3]:
# Add a lowercased, link-free copy of each text using `remove_links`.
df_train['text_nolinks'] = df_train['text'].apply(remove_links)

In [4]:
# Preview the first rows to check the new `text_nolinks` column.
df_train.head()

Unnamed: 0,id,keyword,location,text,target,text_nolinks
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this #earthquake m...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask. canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby #alaska as ...


In [89]:
# Number of times the string returned by `find_hashtags` is appended to each
# row's text to up-weight those words.
HASHTAG_REPEATS = 8

# Build the column in a single assignment that reads only `text_nolinks`, so
# re-running this cell always yields the same result instead of prepending
# another copy of the text to an already-built column.
hashtag_suffix = (df_train['text_nolinks'].apply(find_hashtags) + ' ') * HASHTAG_REPEATS
df_train['text_nl_hashtag'] = df_train['text_nolinks'] + ' ' + hashtag_suffix

In [91]:
# Show the weighted text of the first row: pick the column, then the position.
df_train['text_nl_hashtag'].iloc[0]

'our deeds are the reason of this #earthquake may allah forgive us all earthquake earthquake earthquake earthquake earthquake earthquake earthquake earthquake '

In [92]:
def remove_keyword(text, keyword=None):
    '''
    Drop every whitespace-separated word of `text` that contains `keyword`.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace.
    keyword : str, optional
        Substring to look for. When it is missing (None, NaN or any other
        non-string value) or empty, no word is removed.

    Returns
    -------
    str
        The surviving words joined with single spaces.
    '''

    words = text.split()

    # Only a non-empty string can be used on the left of `in` here; a NaN
    # keyword would raise TypeError and '' would match every word.
    if isinstance(keyword, str) and keyword:
        words = [word for word in words if keyword not in word]

    return ' '.join(words)

In [93]:
# Row by row, keep only the whitespace-separated tokens of `text_nl_hashtag`
# that do not contain the row's keyword as a substring; the appended hashtag
# copies are filtered by the same test.
df_train['text_nl_ht_keyword'] = df_train.apply(
    lambda row: ' '.join(
        token
        for token in row['text_nl_hashtag'].split()
        if row['keyword'] not in token
    ),
    axis=1,
)

In [94]:
# Inspect a row that has a real keyword ('ablaze') to confirm that words
# containing it were dropped from `text_nl_ht_keyword`.
df_train.iloc[50]

id                                                                   73
keyword                                                          ablaze
location                                       Sheffield Township, Ohio
text                  Deputies: Man shot before Brighton home set ab...
target                                                                1
text_nolinks          deputies: man shot before brighton home set ab...
text_nl_hashtag       deputies: man shot before brighton home set ab...
text_nl_ht_keyword          deputies: man shot before brighton home set
Name: 50, dtype: object

In [95]:
# Save the frame, including the derived text columns, without the row index.
df_train.to_csv('./data/train_nolinks.csv', index=False)