## 0. Loading the libraries and data

In [237]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.metrics import f1_score, roc_auc_score, classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform, randint, rv_discrete, rv_continuous
from sklearn.decomposition import TruncatedSVD

import spacy
nlp = spacy.load('en_core_web_sm')

import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer as wnl
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [238]:
#!pip install gensim

In [368]:
from gensim.models import Word2Vec, FastText
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

In [240]:
train_url = 'https://raw.githubusercontent.com/Dmitrovich-Ivan/datasets/refs/heads/main/disaster_train.csv'
test_url = 'https://raw.githubusercontent.com/Dmitrovich-Ivan/datasets/refs/heads/main/disaster_test.csv'
sample_url = 'https://raw.githubusercontent.com/Dmitrovich-Ivan/datasets/refs/heads/main/disaster_sample_submission.csv'

train_df = pd.read_csv(train_url, index_col='id')
test_df = pd.read_csv(test_url, index_col='id')
sample_sub = pd.read_csv(sample_url, index_col='id')

## 1. Exploratory data analysis

In [241]:
train_df.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [242]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7613 entries, 1 to 10873
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   keyword   7552 non-null   object
 1   location  5080 non-null   object
 2   text      7613 non-null   object
 3   target    7613 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 297.4+ KB


- There are missing values in keyword and location columns
- The target class is slightly imbalanced

In [243]:
train_df['target'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
target,Unnamed: 1_level_1
0,0.57034
1,0.42966


In [244]:
print('Unique keywords count:', train_df['keyword'].nunique())
print('Unique locations count:', train_df['location'].nunique())

Unique keywords count: 221
Unique locations count: 3341


- By the distribution of the target class for tweets with and without hashtags it is visible that tweets that contain hashtags are slighlty more likely to be about a real disaster
- Additionally, tweets without links that start with http are also more likely to be "True"

In [245]:
print(train_df[train_df['text'].str.contains('#')]['target'].value_counts(normalize=True))
print(train_df[~train_df['text'].str.contains('#')]['target'].value_counts(normalize=True))

target
0    0.503123
1    0.496877
Name: proportion, dtype: float64
target
0    0.590567
1    0.409433
Name: proportion, dtype: float64


In [246]:
print(train_df[train_df['text'].str.contains('http')]['target'].value_counts(normalize=True))
print(train_df[~train_df['text'].str.contains('http')]['target'].value_counts(normalize=True))

target
1    0.546965
0    0.453035
Name: proportion, dtype: float64
target
0    0.698243
1    0.301757
Name: proportion, dtype: float64


- Top 5 and bottom 5 keywords by their "likelihood" of indicating a real disaster tweet are displayed below. It is obvious that this feature might be very useful for predicting real/fake disaster tweet

In [247]:
train_df.groupby(["keyword"])['target'].agg(["mean", "count"]).sort_values(by="count", ascending=False).head(5)

Unnamed: 0_level_0,mean,count
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1
fatalities,0.577778,45
deluge,0.142857,42
armageddon,0.119048,42
sinking,0.195122,41
damage,0.463415,41


In [248]:
train_df.groupby(["keyword"])['target'].agg(["mean", "count"]).sort_values(by="mean", ascending=False).head(5)

Unnamed: 0_level_0,mean,count
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1
debris,1.0,37
wreckage,1.0,39
derailment,1.0,39
outbreak,0.975,40
oil%20spill,0.973684,38


In [249]:
train_df.groupby(["keyword"])['target'].agg(["mean", "count"]).sort_values(by="mean", ascending=False).tail(5)

Unnamed: 0_level_0,mean,count
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1
body%20bag,0.030303,33
blazing,0.029412,34
ruin,0.027027,37
body%20bags,0.02439,41
aftershock,0.0,34


- Below are displayed some of the locations, their count and a mean target value for each group

In [250]:
train_df.groupby(["location"])['target'].agg(["mean", "count"]).sort_values(by=["count", "mean"], ascending=False)

Unnamed: 0_level_0,mean,count
location,Unnamed: 1_level_1,Unnamed: 2_level_1
USA,0.644231,104
New York,0.225352,71
United States,0.540000,50
London,0.355556,45
Canada,0.448276,29
...,...,...
"ÌøåÀå_T: 40.736324,-73.990062",0.000000,1
å_: ?? ÌÑ ? : ?,0.000000,1
å_å_Los Mina Cityã¢,0.000000,1
å¡å¡Midwest Û¢Û¢,0.000000,1


In [251]:
# token = nltk.tokenize.RegexpTokenizer(r"\w+")

# sentence=token.tokenize('barbado Bridgetown JAMAICA two car set ablaze SANTA CRUZ Head of the St Elizabeth Police Superintende')
# sentence

## 2. Data preprocessing and feature engineering

- the corresponding functions are defined below

In [385]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text_nltk(text):
    return ' '.join([lemmatizer.lemmatize(w).lower() for w in w_tokenizer.tokenize(text)])

def lemmatize_text_spcy(text):
    return ' '.join([token.lemma_ for token in nlp(text)])

In [253]:
def preprocess_data(df):
  data = df.copy()

  data['haslink'] = data['text'].str.contains('http').astype('int')
  data['hastag'] = data['text'].str.contains('#').astype('int')

  data['text'] = data['text'].replace(r'(http\S+) | (www\S+)|(@\S+) | (\&lt;|\&gt;|\&amp;) | (\x89\w+)', ' ', regex=True)

  data['text_nltk'] = data['text'].apply(lemmatize_text_nltk)
  data['text_spcy'] = data['text'].apply(lemmatize_text_spcy)

  data['text_nltk'] = data['text_nltk'].replace(r'(\W+)', ' ', regex=True)
  data['text_spcy'] = data['text_spcy'].replace(r'(\W+)', ' ', regex=True)

  data['token_nltk'] = data['text_nltk'].map(lambda x: [w for w in w_tokenizer.tokenize(x)])
  data['token_spcy'] = data['text_spcy'].map(lambda x: [token for token in nlp(x)])

  return data

In [339]:
def get_embeddings(model, train, test, token_feature, target):

  model.build_vocab(train[token_feature])
  model.train(train[token_feature], total_examples=model.corpus_count, epochs=model.epochs)

  X_train = np.array([np.mean([model.wv[w] for w in words if w in model.wv] or [np.zeros(model.vector_size)], axis=0) for words in train[token_feature]])
  y_train = train[target].values.ravel()

  X_test = np.array([np.mean([model.wv[w] for w in words if w in model.wv] or [np.zeros(model.vector_size)], axis=0) for words in test[token_feature]])

  return X_train, y_train, X_test

In [255]:
def get_train_val(data, drop_cols, target, test_size=0.2):
  train, val = train_test_split(data, test_size=test_size, random_state=0, shuffle=True, stratify=train_df[target])

  X_train = train.drop(columns=drop_cols + target)
  y_train = train[target].values.ravel()

  X_val = val.drop(columns=drop_cols + target)
  y_val = val[target].values.ravel()

  return X_train, y_train, X_val, y_val

In [256]:
data = preprocess_data(train_df)
test_data = preprocess_data(test_df)

In [257]:
data.head(3)

Unnamed: 0_level_0,keyword,location,text,target,haslink,hastag,text_nltk,text_spcy,token_nltk,token_spcy
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1,0,1,our deeds are the reason of this earthquake ma...,our deed be the Reason of this earthquake may ...,"[our, deeds, are, the, reason, of, this, earth...","[our, deed, be, the, Reason, of, this, earthqu..."
4,,,Forest fire near La Ronge Sask. Canada,1,0,0,forest fire near la ronge sask canada,forest fire near La Ronge Sask Canada,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, La, Ronge, Sask, Canada]"
5,,,All residents asked to 'shelter in place' are ...,1,0,0,all resident asked to shelter in place are bei...,all resident ask to shelter in place be be not...,"[all, resident, asked, to, shelter, in, place,...","[all, resident, ask, to, shelter, in, place, b..."


In [258]:
data.head(3)['text'].values

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"],
      dtype=object)

In [259]:
data.head(3)['token_nltk'].values

array([list(['our', 'deeds', 'are', 'the', 'reason', 'of', 'this', 'earthquake', 'may', 'allah', 'forgive', 'u', 'all']),
       list(['forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada']),
       list(['all', 'resident', 'asked', 'to', 'shelter', 'in', 'place', 'are', 'being', 'notified', 'by', 'officers', 'no', 'other', 'evacuation', 'or', 'shelter', 'in', 'place', 'order', 'are', 'expected'])],
      dtype=object)

In [260]:
data.head(3)['token_spcy'].values

array([list([our, deed, be, the, Reason, of, this, earthquake, may, ALLAH, forgive, we, all]),
       list([forest, fire, near, La, Ronge, Sask, Canada]),
       list([all, resident, ask, to, shelter, in, place, be, be, notify, by, officer, no, other, evacuation, or, shelter, in, place, order, be, expect])],
      dtype=object)

In [261]:
stop_words = list(stopwords.words('english'))

## 3. Modeling and evaluation

### TF-IDF

The logistic regression model will be tuned using the tf-idf vectorized features on the preprocessed text lemmatized using the NLTK library

In [101]:
drop_cols = []
target = ['target']

X_train, y_train, X_val, y_val = get_train_val(data, drop_cols, target)

In [103]:
preprocessor = ColumnTransformer(
    [('tfidf', TfidfVectorizer(encoding='unicode'), 'text_nltk')])

pipeline = Pipeline([
    ('preproc', preprocessor),
    ('clf', LogisticRegression(random_state=0)),
])

parameters = {
    'preproc__tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'preproc__tfidf__max_features': [1000, 3000, None],
    'preproc__tfidf__min_df': [0.0001, 0.001, 0.01],
    'clf__C': [0.1, 1.0, 10.0],
    'clf__class_weight': ['balanced'],
}

nltk_search = GridSearchCV(pipeline, parameters, cv=3, verbose=1, scoring='f1')
nltk_search.fit(X_train, y_train)

print('Best score: %.3f' % nltk_search.best_score_)
print('Config: %s' % nltk_search.best_params_)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best score: 0.747
Config: {'clf__C': 10.0, 'clf__class_weight': 'balanced', 'preproc__tfidf__max_features': None, 'preproc__tfidf__min_df': 0.0001, 'preproc__tfidf__ngram_range': (1, 1)}


In [105]:
print('Train F1 score:', f1_score(y_train, nltk_search.predict(X_train)))
print('Validation F1 score:', f1_score(y_val, nltk_search.predict(X_val)))

Train F1 score: 0.9794822627037392
Validation F1 score: 0.7598455598455598


- Next, a model with the same parameters will be trained on text lemmatized with the spacy library

In [106]:
preprocessor = ColumnTransformer(
    [('tfidf', TfidfVectorizer(encoding='unicode'), 'text_spcy')])

pipeline = Pipeline([
    ('preproc', preprocessor),
    ('clf', LogisticRegression(random_state=0)),
])

parameters = {
    'preproc__tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'preproc__tfidf__max_features': [1000, 3000, None],
    'preproc__tfidf__min_df': [0.0001, 0.001, 0.01],
    'clf__C': [0.1, 1.0, 10.0],
    'clf__class_weight': ['balanced'],
}

spcy_search = GridSearchCV(pipeline, parameters, cv=3, verbose=1, scoring='f1')
spcy_search.fit(X_train, y_train)

print('Best score: %.3f' % spcy_search.best_score_)
print('Config: %s' % spcy_search.best_params_)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best score: 0.749
Config: {'clf__C': 1.0, 'clf__class_weight': 'balanced', 'preproc__tfidf__max_features': 3000, 'preproc__tfidf__min_df': 0.0001, 'preproc__tfidf__ngram_range': (1, 1)}


In [107]:
print('Train F1 score:', f1_score(y_train, spcy_search.predict(X_train)))
print('Validation F1 score:', f1_score(y_val, spcy_search.predict(X_val)))

Train F1 score: 0.8428683385579937
Validation F1 score: 0.7832817337461301


- As a result, model trained on text data lemmatized with Spacy library showed a slightly better performance.

- As a next step, using TruncatedSVD over TF-IDF features in order to lower the dimensionality of the training data. Tuning parameters with RandomisedSearchCV.

In [108]:
preprocessor = ColumnTransformer(
    [('tfidf', TfidfVectorizer(encoding='unicode'), 'text_spcy')]
    )

pipeline = Pipeline([
    ('preproc', preprocessor),
    ('svd', TruncatedSVD(n_components=300, n_iter=5, random_state=0)),
    ('clf', LogisticRegression(random_state=0)),
])

parameters = {
    'preproc__tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    #'preproc__tfidf__min_df': uniform(0.0001, 0.01),
    #'preproc__tfidf__max_features': randint(500, 3000),
    'preproc__tfidf__stop_words': [stop_words, None],

    #'svd__n_iter': randint(5, 50),
    #'svd__n_components': [10, 100, 500],

    'clf__C': uniform(0.01, 10),
    'clf__class_weight': ['balanced'],
    'clf__max_iter': [300]
}

rand_search = RandomizedSearchCV(pipeline, parameters, cv=3, n_iter=30, verbose=1, scoring='f1', random_state=0)
rand_search.fit(X_train, y_train)

print('Best score: %.3f' % rand_search.best_score_)
print('Config: %s' % rand_search.best_params_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best score: 0.736
Config: {'clf__C': 5.298949197529044, 'clf__class_weight': 'balanced', 'clf__max_iter': 300, 'preproc__tfidf__ngram_range': (1, 1), 'preproc__tfidf__stop_words': None}


In [109]:
print('Train F1 score:', f1_score(y_train, rand_search.predict(X_train)))
print('Validation F1 score:', f1_score(y_val, rand_search.predict(X_val)))

Train F1 score: 0.7721859393008068
Validation F1 score: 0.7691131498470948


- In summary, the model that utilized the SVD transormation of the data shows similar to previous models performance and at the same time, it did not overfit training data as much as previous models.

### Word2Vec

The logistic regression model will be tuned using the Word2Vec model embeddings

In [347]:
w2vmodel = Word2Vec(vector_size=300, window=4, min_count=4, sg=1)

X_train, y_train, X_test = get_embeddings(w2vmodel, data, test_data, 'token_nltk', 'target')

X_tr, X_vl, y_tr, y_vl = train_test_split(X_train, y_train, test_size=0.2, random_state=0, shuffle=True, stratify=y_train)

In [348]:
pipeline = Pipeline([
    ('clf', LogisticRegression(random_state=0)),
])

parameters = {
    'clf__C': uniform(0.01, 10),
    'clf__class_weight': ['balanced'],
    'clf__max_iter': [300]
}

w2v_search = RandomizedSearchCV(pipeline, parameters, cv=3, n_iter=30, verbose=1, scoring='f1', random_state=0)
w2v_search.fit(X_tr, y_tr)

print('Best score: %.3f' % w2v_search.best_score_)
print('Config: %s' % w2v_search.best_params_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best score: 0.681
Config: {'clf__C': 8.71012148246819, 'clf__class_weight': 'balanced', 'clf__max_iter': 300}


In [349]:
print('Train F1 score:', f1_score(y_tr, w2v_search.predict(X_tr)))
print('Validation F1 score:', f1_score(y_vl, w2v_search.predict(X_vl)))

Train F1 score: 0.6913907284768211
Validation F1 score: 0.7052551408987052


- The Logistic Regression classifier trained on Word2Vec embeddings performs worse than previous models based on tf-idf vectorized features.

- Let's sample some tweets from the training data and see which words are considered 'similar' to them according to the trained Word2Vec model

In [357]:
data['token_nltk'].sample(3).values

array([list(['perspectives', 'on', 'the', 'grateful', 'dead', 'critical', 'writings', 'contributions', 'to', 'the', 'study', 'http', 't', 'co', 'aggryhvxkr']),
       list(['1', '94', 'earthquake', 'occurred', '5km', 's', 'of', 'volcano', 'hawaii', 'at', '01', '04', 'utc', 'earthquake', 'volcano', 'http', 't', 'co', 'auf4j4owj1']),
       list(['would', 'a', 'paramedic', 'really', 'do', 'that', 'leave', 'someone', 'inside', 'a', 'building', 'that', 's', 'about', 'to', 'collapse', 'blow', 'up', 'halloikbenwill'])],
      dtype=object)

In [359]:
words=["buildings", "storm", "people", "emergency", "city", "weapon", "earthquake"]

print('Top 3 words similar to')
for w in words:
    print(w, ':', list(zip(*w2vmodel.wv.most_similar(w, topn=3)))[0])

Top 3 words similar to
buildings : ('sea', 'whirlwind', 'blaze')
storm : ('called', 'drought', 'hurricane')
people : ('rioting', 'both', 'country')
emergency : ('services', 'apc', 'area')
city : ('potus', 'rì', 'teen')
weapon : ('rioting', 'everything', 'tweet')
earthquake : ('52', 'edt', '09')


### FastText

In [360]:
ftmodel = FastText(vector_size=300, window=4, min_count=4, sg=1)

X_train, y_train, X_test = get_embeddings(ftmodel, data, test_data, 'token_nltk', 'target')

X_tr, X_vl, y_tr, y_vl = train_test_split(X_train, y_train, test_size=0.2, random_state=0, shuffle=True, stratify=y_train)

In [361]:
words=["weather", "storm", "people", "emergency", "city", "weapon", "security"]

print('Top 3 words similar to')
for w in words:
    print(w, ':', list(zip(*ftmodel.wv.most_similar(w, topn=3)))[0])

Top 3 words similar to
weather : ('leather', 'further', 'father')
storm : ('sandstorm', 'abstorm', 'windstorm')
people : ('whether', 'either', 'whats')
emergency : ('aussies', 'icemoon', 'security')
city : ('spaceship', 'richmond', '13')
weapon : ('else', 'able', 'double')
security : ('anchorage', 'ramag', 'soudelor')


In [362]:
pipeline = Pipeline([
    ('clf', LogisticRegression(random_state=0)),
])

parameters = {
    'clf__C': uniform(0.01, 10),
    'clf__class_weight': ['balanced'],
    'clf__max_iter': [300]
}

ft_search = RandomizedSearchCV(pipeline, parameters, cv=3, n_iter=30, verbose=1, scoring='f1', random_state=0)
ft_search.fit(X_tr, y_tr)

print('Best score: %.3f' % ft_search.best_score_)
print('Config: %s' % ft_search.best_params_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best score: 0.674
Config: {'clf__C': 8.927730007820797, 'clf__class_weight': 'balanced', 'clf__max_iter': 300}


In [363]:
print('Train F1 score:', f1_score(y_tr, ft_search.predict(X_tr)))
print('Validation F1 score:', f1_score(y_vl, ft_search.predict(X_vl)))

Train F1 score: 0.6807043020511889
Validation F1 score: 0.6817518248175183


### Glove

In [369]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [370]:
glove_path = '/content/gdrive/MyDrive/Colab Notebooks/Work/glove.6B.100d.txt'
glove_model = KeyedVectors.load_word2vec_format(glove_path, binary=False, no_header=True)

In [381]:
X_train = np.array([np.mean([glove_model[w] for w in words if w in glove_model], axis=0) for words in data['text_nltk']])
y_train = data[target].values.ravel()

X_test = np.array([np.mean([glove_model[w] for w in words if w in glove_model], axis=0) for words in test_data['text_nltk']])

In [382]:
X_tr, X_vl, y_tr, y_vl = train_test_split(X_train, y_train, test_size=0.2, random_state=0, shuffle=True, stratify=y_train)

In [383]:
pipeline = Pipeline([
    ('clf', LogisticRegression(random_state=0)),
])

parameters = {
    'clf__C': uniform(0.01, 10),
    'clf__class_weight': ['balanced'],
    'clf__max_iter': [300]
}

glove_search = RandomizedSearchCV(pipeline, parameters, cv=3, n_iter=30, verbose=1, scoring='f1', random_state=0)
glove_search.fit(X_tr, y_tr)

print('Best score: %.3f' % glove_search.best_score_)
print('Config: %s' % glove_search.best_params_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best score: 0.605
Config: {'clf__C': 8.927730007820797, 'clf__class_weight': 'balanced', 'clf__max_iter': 300}


In [384]:
print('Train F1 score:', f1_score(y_tr, glove_search.predict(X_tr)))
print('Validation F1 score:', f1_score(y_vl, glove_search.predict(X_vl)))

Train F1 score: 0.611180567958564
Validation F1 score: 0.6044191019244476


## 4. Submission file preparation

In [167]:
def train_on_full_data(train, drop_cols, target, model):
  X_train = train.drop(columns=drop_cols + target)
  y_train = train[target].values.ravel()

  model.fit(X_train, y_train)

  print('Train F1 score:', f1_score(y_train, model.predict(X_train)))

In [170]:
def make_prediction(model, filename, X_test):
  preds = model.predict(X_test)

  test_sub = sample_sub.copy()
  test_sub['target'] = preds
  test_sub.to_csv(filename, index_label='id')

In [72]:
drop_cols = []
target = ['target']
test_cols = ['text_spcy']
best_logreg = spcy_search.best_estimator_

train_on_full_data(data, drop_cols, target, best_logreg)

make_prediction(best_logreg, 'submission_spcy_3.csv', test_data[test_cols])

Train F1 score: 0.8378927622322964


In [None]:
drop_cols = ['location', 'text', 'text_spcy']
target = ['target']
test_cols = ['keyword', 'haslink', 'hastag', 'text_nltk']
best_logreg = rand_search.best_estimator_

train_on_full_data(data, drop_cols, target, best_logreg)

Train F1 score: 0.7653828448672703


In [None]:
make_prediction(best_logreg, 'submission_nltk_svd_2.csv', test_data[test_cols])

In [378]:
make_prediction(glove_search.best_estimator_, 'submission_gl.csv', X_test)