In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# TfidfVectorizer + GridSearchCV

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [5]:
X = df_train['text']
y = df_train['target']

In [7]:
tfidf = TfidfVectorizer()
model = XGBClassifier()

In [8]:
pipe = Pipeline([('tfidf', tfidf), ('model', model)])

In [9]:
params = {
    'model__n_estimators': [100, 200, 250, 300],
    'model__max_depth': [3, 5, 7, 11, 13],
    'model__learning_rate': [0.1, 0.01, 0.3]
}

In [10]:
final_model = GridSearchCV(pipe, param_grid = params, cv= 3, verbose = 2 )

In [11]:
final_model.fit(X, y)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV] END model__learning_rate=0.1, model__max_depth=3, model__n_estimators=100; total time=   2.9s
[CV] END model__learning_rate=0.1, model__max_depth=3, model__n_estimators=100; total time=   2.7s
[CV] END model__learning_rate=0.1, model__max_depth=3, model__n_estimators=100; total time=   2.2s
[CV] END model__learning_rate=0.1, model__max_depth=3, model__n_estimators=200; total time=   4.6s
[CV] END model__learning_rate=0.1, model__max_depth=3, model__n_estimators=200; total time=   4.5s
[CV] END model__learning_rate=0.1, model__max_depth=3, model__n_estimators=200; total time=   4.0s
[CV] END model__learning_rate=0.1, model__max_depth=3, model__n_estimators=250; total time=   5.8s
[CV] END model__learning_rate=0.1, model__max_depth=3, model__n_estimators=250; total time=   5.4s
[CV] END model__learning_rate=0.1, model__max_depth=3, model__n_estimators=250; total time=   4.8s
[CV] END model__learning_rate=0.1, model__max_d

0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_grid,"{'model__learning_rate': [0.1, 0.01, ...], 'model__max_depth': [3, 5, ...], 'model__n_estimators': [100, 200, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [14]:
print("Best Parameters:", final_model.best_params_)
print(classification_report(y, final_model.predict(X)))

Best Parameters: {'model__learning_rate': 0.3, 'model__max_depth': 3, 'model__n_estimators': 250}
              precision    recall  f1-score   support

           0       0.84      0.95      0.89      4342
           1       0.92      0.76      0.83      3271

    accuracy                           0.87      7613
   macro avg       0.88      0.86      0.86      7613
weighted avg       0.88      0.87      0.87      7613



In [15]:
df_sub = pd.read_csv('sample_submission.csv')
y_pred = final_model.predict(df_test['text'])
df_sub['target'] = y_pred
df_sub.to_csv('submission1.csv', index=False)

# TfidfVectorizer + LogisticRegression

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [6]:
X = df_train['text']
y = df_train['target']

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=40)

In [8]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2), sublinear_tf = True)),
    ('model', LogisticRegression(class_weight='balanced',  penalty = 'l2', C = 1.0, solver = 'liblinear', max_iter=10000, ))
])

In [9]:
param_grid = {
    'tfidf__max_features': [5000],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'model__C': [0.5, 1.0],
    'model__class_weight': ['balanced']
}

In [10]:
grid = GridSearchCV(pipe, param_grid, scoring='f1', cv=3, verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


0,1,2
,estimator,Pipeline(step...liblinear'))])
,param_grid,"{'model__C': [0.5, 1.0], 'model__class_weight': ['balanced'], 'tfidf__max_features': [5000], 'tfidf__ngram_range': [(1, ...), (1, ...)]}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'liblinear'
,max_iter,10000


In [12]:
y_pred = grid.predict(X_val)

In [13]:
f1_score(y_val, y_pred)

0.7750188111361926

In [14]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.83      0.83       852
           1       0.78      0.77      0.78       671

    accuracy                           0.80      1523
   macro avg       0.80      0.80      0.80      1523
weighted avg       0.80      0.80      0.80      1523



In [17]:
df_sub = pd.read_csv('sample_submission.csv')
y_pred = grid.predict(df_test['text'])
df_sub['target'] = y_pred
df_sub.to_csv('submission2.csv', index=False)