## 0. Loading the libraries and data

In [125]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.metrics import f1_score, roc_auc_score, classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

In [126]:
# Colab-only step: mount Google Drive so the CSV files read below
# (under /content/gdrive/) are visible to the kernel.
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [127]:
# Single place to change when the data lives somewhere else.
DATA_DIR = '/content/gdrive/MyDrive/Colab Notebooks/Work'

train_path = f'{DATA_DIR}/train.csv'
test_path = f'{DATA_DIR}/test.csv'
sample_sub_path = f'{DATA_DIR}/sample_submission.csv'

# All three files are indexed by their `id` column.
train_df = pd.read_csv(train_path, index_col='id')
test_df = pd.read_csv(test_path, index_col='id')
sample_sub = pd.read_csv(sample_sub_path, index_col='id')

## 1. Exploratory data analysis

In [128]:
# Peek at the first rows of the training data.
train_df.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [129]:
# Column dtypes and non-null counts, used to spot columns with missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7613 entries, 1 to 10873
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   keyword   7552 non-null   object
 1   location  5080 non-null   object
 2   text      7613 non-null   object
 3   target    7613 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 297.4+ KB


- There are missing values in keyword and location columns
- The target class is slightly imbalanced

In [130]:
# Class balance: share of each target value in the training data.
train_df['target'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
target,Unnamed: 1_level_1
0,0.57034
1,0.42966


In [131]:
# Number of distinct (non-missing) values in the keyword and location columns.
print('Unique keywords count:', train_df['keyword'].nunique())
print('Unique locations count:', train_df['location'].nunique())

Unique keywords count: 221
Unique locations count: 3341


- The target distribution for tweets with and without hashtags shows that tweets containing a hashtag are slightly more likely to be about a real disaster
- Additionally, tweets that do not contain an `https` link are more likely to be about a real disaster (target = 1) than tweets that do

In [132]:
# Target distribution for tweets that contain a '#' (first) and for those that do not (second).
has_hashtag = train_df['text'].str.contains('#')
print(train_df[has_hashtag]['target'].value_counts(normalize=True))
print(train_df[~has_hashtag]['target'].value_counts(normalize=True))

target
0    0.503123
1    0.496877
Name: proportion, dtype: float64
target
0    0.590567
1    0.409433
Name: proportion, dtype: float64


In [133]:
# Target distribution for tweets that contain 'https' (first) and for those that do not (second).
has_https = train_df['text'].str.contains('https')
print(train_df[has_https]['target'].value_counts(normalize=True))
print(train_df[~has_https]['target'].value_counts(normalize=True))

target
0    0.670762
1    0.329238
Name: proportion, dtype: float64
target
0    0.564668
1    0.435332
Name: proportion, dtype: float64


- The top 5 and bottom 5 keywords by their share of real-disaster tweets are displayed below. The keyword is clearly a potentially very useful feature for separating real from fake disaster tweets

In [134]:
# Per-keyword mean target and tweet count; show the 5 keywords with the highest mean.
keyword_stats = train_df.groupby(["keyword"])['target'].agg(["mean", "count"])
keyword_stats.sort_values(by="mean", ascending=False).head(5)

Unnamed: 0_level_0,mean,count
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1
derailment,1.0,39
debris,1.0,37
wreckage,1.0,39
outbreak,0.975,40
typhoon,0.973684,38


In [135]:
# Per-keyword mean target and tweet count; show the 5 keywords with the lowest mean.
keyword_stats = train_df.groupby(["keyword"])['target'].agg(["mean", "count"])
keyword_stats.sort_values(by="mean", ascending=False).tail(5)

Unnamed: 0_level_0,mean,count
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1
body%20bag,0.030303,33
blazing,0.029412,34
ruin,0.027027,37
body%20bags,0.02439,41
aftershock,0.0,34


- Below are displayed some of the locations, their count and a mean target value for each group

In [136]:
# Per-location mean target and tweet count, most frequent locations first
# (ties broken by the higher mean).
location_stats = train_df.groupby(["location"])['target'].agg(["mean", "count"])
location_stats.sort_values(by=["count", "mean"], ascending=False)

Unnamed: 0_level_0,mean,count
location,Unnamed: 1_level_1,Unnamed: 2_level_1
USA,0.644231,104
New York,0.225352,71
United States,0.540000,50
London,0.355556,45
Canada,0.448276,29
...,...,...
"ÌøåÀå_T: 40.736324,-73.990062",0.000000,1
å_: ?? ÌÑ ? : ?,0.000000,1
å_å_Los Mina Cityã¢,0.000000,1
å¡å¡Midwest Û¢Û¢,0.000000,1


## 2. Data preprocessing and feature engineering

In [138]:
def preprocess_data(train, val, drop_cols, target):
  """Split the train and validation frames into features and a 1-D target.

  Args:
    train: Training DataFrame holding both feature and target columns.
    val: Validation DataFrame with the same columns.
    drop_cols: Column name, or sequence of column names, to exclude from the
      features.
    target: Target column name, or sequence of target column names.

  Returns:
    Tuple `(X_train, y_train, X_val, y_val)`. The X's are DataFrames without
    the dropped and target columns; the y's are flattened NumPy arrays.
  """
  # Accept a bare string or any sequence: plain `drop_cols + target` only
  # works when both arguments are lists.
  drop_cols = [drop_cols] if isinstance(drop_cols, str) else list(drop_cols)
  target = [target] if isinstance(target, str) else list(target)
  excluded = drop_cols + target

  X_train = train.drop(columns=excluded)
  y_train = train[target].values.ravel()

  X_val = val.drop(columns=excluded)
  y_val = val[target].values.ravel()

  return X_train, y_train, X_val, y_val

In [139]:

# Stratified 80/20 hold-out split. A fixed random_state makes the split, and
# therefore every score computed from it, reproducible across kernel restarts.
train, val = train_test_split(
    train_df,
    test_size=0.2,
    shuffle=True,
    stratify=train_df['target'],
    random_state=0,
)

In [140]:
# `keyword` and `location` are dropped, so `text` is the only feature column left.
drop_cols = ['keyword', 'location']
target = ['target']

# Separate features and labels for the training and validation splits.
X_train, y_train, X_val, y_val = preprocess_data(train, val, drop_cols, target)

## 3. Modeling and evaluation

- As a baseline model, a dummy classifier that generates predictions at random will be used

In [142]:
# Baseline: labels drawn uniformly at random, ignoring the features.
dummy_clf = DummyClassifier(strategy="uniform", random_state=0)
dummy_clf.fit(X_train, y_train)

dummy_train_pred = dummy_clf.predict(X_train)
print('Train F1 score:', f1_score(y_train, dummy_train_pred))

dummy_val_pred = dummy_clf.predict(X_val)
print('Validation F1 score:', f1_score(y_val, dummy_val_pred))

Train F1 score: 0.4695164681149264
Validation F1 score: 0.47865640307907625


- Next, the logistic regression model with the tf-idf vectorizer will be tuned

In [143]:
# TF-IDF features feeding a logistic regression; both stages are tuned jointly.
# The vectorizer's `encoding` is left at its default ('utf-8'): it only matters
# for bytes input, and a non-codec name such as 'unicode' would raise
# LookupError as soon as bytes had to be decoded.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(random_state=0)),
])

parameters = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2), (3, 3)],
    'tfidf__max_features': [1000, 3000, None],
    'clf__C': [0.1, 1.0, 10.0],
    'clf__class_weight': ['balanced', None],
}

# 4 * 3 * 3 * 2 = 72 candidates, each scored by 3-fold cross-validated F1.
lr_search = GridSearchCV(pipeline, parameters, cv=3, verbose=1, scoring='f1')
lr_search.fit(X_train['text'], y_train)

print(f'Best score: {lr_search.best_score_:.3f}')
print(f'Config: {lr_search.best_params_}')

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best score: 0.749
Config: {'clf__C': 10.0, 'clf__class_weight': 'balanced', 'tfidf__max_features': None, 'tfidf__ngram_range': (1, 1)}


In [145]:
# F1 of the tuned search object on the training split and on the hold-out split.
lr_train_pred = lr_search.predict(X_train['text'])
print('Train F1 score:', f1_score(y_train, lr_train_pred))

lr_val_pred = lr_search.predict(X_val['text'])
print('Validation F1 score:', f1_score(y_val, lr_val_pred))

Train F1 score: 0.9796779141104295
Validation F1 score: 0.7553846153846154


## 4. Submission file

In [35]:
def make_prediction(model, filename, X_test, submission_template=None):
  """Predict on X_test and write the result to a submission CSV.

  Args:
    model: Fitted estimator exposing `predict`.
    filename: Destination path of the CSV file.
    X_test: Inputs passed unchanged to `model.predict`.
    submission_template: DataFrame whose index supplies the ids written to the
      file; it is copied, never modified. Defaults to the notebook-level
      `sample_sub`, so existing three-argument calls behave as before.

  Returns:
    None. The submission is written to `filename` with the index labelled 'id'.

  Note:
    Array-like predictions are assigned to the template by position, so the
    rows of `X_test` must be in the same order as the rows of the template.
  """
  if submission_template is None:
    # Fall back to the sample submission loaded at the top of the notebook.
    submission_template = sample_sub

  preds = model.predict(X_test)

  test_sub = submission_template.copy()
  test_sub['target'] = preds
  test_sub.to_csv(filename, index_label='id')

In [36]:
# Predict on the test tweets with the tuned search object and write the submission file.
make_prediction(lr_search, 'submission_2.csv', test_df['text'])