In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

In [24]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [25]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


# Preprocessing

## Tokenization

In [26]:
from nltk.tokenize import word_tokenize

In [27]:
def tokenize(df):
    df['tokenized_text'] =  df['text'].apply(word_tokenize)

In [28]:
tokenize(train)
tokenize(test)

In [29]:
train['tokenized_text']

0       [Our, Deeds, are, the, Reason, of, this, #, ea...
1        [Forest, fire, near, La, Ronge, Sask, ., Canada]
2       [All, residents, asked, to, 'shelter, in, plac...
3       [13,000, people, receive, #, wildfires, evacua...
4       [Just, got, sent, this, photo, from, Ruby, #, ...
                              ...                        
7608    [Two, giant, cranes, holding, a, bridge, colla...
7609    [@, aria_ahrary, @, TheTawniest, The, out, of,...
7610    [M1.94, [, 01:04, UTC, ], ?, 5km, S, of, Volca...
7611    [Police, investigating, after, an, e-bike, col...
7612    [The, Latest, :, More, Homes, Razed, by, North...
Name: tokenized_text, Length: 7613, dtype: object

## Removing punctuation 

In [30]:
from string import punctuation
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [31]:
def remove_punctuation(df):
    df['only_text'] = df['tokenized_text'].apply(
        lambda row: [word for word in row if word not in punctuation]
    )

In [32]:
remove_punctuation(train)
remove_punctuation(test)

In [33]:
train['only_text'] 

0       [Our, Deeds, are, the, Reason, of, this, earth...
1           [Forest, fire, near, La, Ronge, Sask, Canada]
2       [All, residents, asked, to, 'shelter, in, plac...
3       [13,000, people, receive, wildfires, evacuatio...
4       [Just, got, sent, this, photo, from, Ruby, Ala...
                              ...                        
7608    [Two, giant, cranes, holding, a, bridge, colla...
7609    [aria_ahrary, TheTawniest, The, out, of, contr...
7610    [M1.94, 01:04, UTC, 5km, S, of, Volcano, Hawai...
7611    [Police, investigating, after, an, e-bike, col...
7612    [The, Latest, More, Homes, Razed, by, Northern...
Name: only_text, Length: 7613, dtype: object

## Removing stopwords

In [34]:
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english'))

In [35]:
def remove_stopwords(df):
    df['cleaned_text'] = df['only_text'].apply(
        lambda row: [word for word in row if word not in stop_words]
    ) 

In [36]:
remove_stopwords(train)
remove_stopwords(test)

In [37]:
train['cleaned_text']

0       [Our, Deeds, Reason, earthquake, May, ALLAH, F...
1           [Forest, fire, near, La, Ronge, Sask, Canada]
2       [All, residents, asked, 'shelter, place, notif...
3       [13,000, people, receive, wildfires, evacuatio...
4       [Just, got, sent, photo, Ruby, Alaska, smoke, ...
                              ...                        
7608    [Two, giant, cranes, holding, bridge, collapse...
7609    [aria_ahrary, TheTawniest, The, control, wild,...
7610    [M1.94, 01:04, UTC, 5km, S, Volcano, Hawaii, h...
7611    [Police, investigating, e-bike, collided, car,...
7612    [The, Latest, More, Homes, Razed, Northern, Ca...
Name: cleaned_text, Length: 7613, dtype: object

# Extracting more information from the tweets

In [38]:
train

Unnamed: 0,id,keyword,location,text,target,tokenized_text,only_text,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[Our, Deeds, are, the, Reason, of, this, #, ea...","[Our, Deeds, are, the, Reason, of, this, earth...","[Our, Deeds, Reason, earthquake, May, ALLAH, F..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[Forest, fire, near, La, Ronge, Sask, ., Canada]","[Forest, fire, near, La, Ronge, Sask, Canada]","[Forest, fire, near, La, Ronge, Sask, Canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[All, residents, asked, to, 'shelter, in, plac...","[All, residents, asked, to, 'shelter, in, plac...","[All, residents, asked, 'shelter, place, notif..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13,000, people, receive, #, wildfires, evacua...","[13,000, people, receive, wildfires, evacuatio...","[13,000, people, receive, wildfires, evacuatio..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[Just, got, sent, this, photo, from, Ruby, #, ...","[Just, got, sent, this, photo, from, Ruby, Ala...","[Just, got, sent, photo, Ruby, Alaska, smoke, ..."
...,...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,"[Two, giant, cranes, holding, a, bridge, colla...","[Two, giant, cranes, holding, a, bridge, colla...","[Two, giant, cranes, holding, bridge, collapse..."
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,"[@, aria_ahrary, @, TheTawniest, The, out, of,...","[aria_ahrary, TheTawniest, The, out, of, contr...","[aria_ahrary, TheTawniest, The, control, wild,..."
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,"[M1.94, [, 01:04, UTC, ], ?, 5km, S, of, Volca...","[M1.94, 01:04, UTC, 5km, S, of, Volcano, Hawai...","[M1.94, 01:04, UTC, 5km, S, Volcano, Hawaii, h..."
7611,10872,,,Police investigating after an e-bike collided ...,1,"[Police, investigating, after, an, e-bike, col...","[Police, investigating, after, an, e-bike, col...","[Police, investigating, e-bike, collided, car,..."


In [39]:
def count_hashtags(df):
    df['#_amount'] = df['tokenized_text'].apply(lambda row: len([word for word in row if word=='#']))

In [40]:
count_hashtags(train)
count_hashtags(test)

In [42]:
def count_mentions(df):
    df['@_amount'] = df['tokenized_text'].apply(lambda row: len([word for word in row if word=='@']))

In [43]:
count_mentions(train)
count_mentions(test)

In [44]:
def count_punctuation(df):
    df['punctuation_amount'] = df['tokenized_text'].apply(
        lambda row: len([word for word in row if word in punctuation])
    )

In [45]:
count_punctuation(train)
count_punctuation(test)

In [46]:
def count_stopwords(df):
    df['stopword_amount'] = df['tokenized_text'].apply(
        lambda row: len([word for word in row if word in stop_words])
    )

In [47]:
count_stopwords(train)
count_stopwords(test)

In [48]:
def count_urls(df):
    df['url_amount'] = df['tokenized_text'].apply(
        lambda row: len([word for word in row if 'http' in word or 'https' in word])
    )

In [49]:
count_urls(train)
count_urls(test)

In [50]:
def mean_word_length(df):
    df['mean_word_length'] = df['only_text'].apply(
        lambda row: round(np.mean([len(word) for word in row]), 3)
    )

In [51]:
mean_word_length(train)
mean_word_length(test)

In [53]:
def word_amount(df):
    df['word_amount'] = df['only_text'].apply(
        lambda row: len(row)
    )

In [54]:
word_amount(train)
word_amount(test)

In [55]:
def unique_word_amount(df):
    df['unique_word_amount'] = df['only_text'].apply(
        lambda row: len(list(set(row)))
    )

In [56]:
unique_word_amount(train)
unique_word_amount(test)

In [68]:
from pandas_profiling import ProfileReport
profile = ProfileReport(train.sample(frac=0.1), title="Twitter Profiling Report", explorative=True)
profile.to_file("twitter.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Random Forest

In [79]:
y_train = train['target']
X_train = train.drop(['tokenized_text', 'only_text', 'cleaned_text',
                      'keyword', 'location', 'text', 'target', 'id'], axis=1)
X_test = test.drop(['tokenized_text', 'only_text', 'cleaned_text',
                    'keyword', 'location', 'text', 'id'], axis=1)

In [105]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=120)
rf_model.fit(X_train, y_train)
RandomForestClassifier(n_estimators=120)
y_pred = rf_model.predict(X_test)

In [107]:
indices = list(test['id'])
y_pred = pd.DataFrame ({'id':indices,
                        'target':y_pred})

In [109]:
y_pred.to_csv('prediction.csv', index=False)