In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [3]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df.text.isna().sum()

0

In [5]:
X = df['text']
y = df['target']

In [9]:
gb_pipeline = Pipeline([('vect', CountVectorizer(stop_words='english')),
                        ('gb_model', GradientBoostingClassifier())])

In [8]:
parameters = {'gb_model__learning_rate': [0.01,0.1,1,10,100],
              'gb_model__n_estimators': [50, 100, 500, 1000],
              'gb_model__max_depth': [1, 3, 5, 7, 9]
              }

In [10]:
randomized_search = RandomizedSearchCV(estimator=xgb_pipeline,
                                       param_distributions=parameters,
                                       n_iter=5,
                                       scoring='roc_auc',
                                       cv=4)

In [11]:
randomized_search.fit(X, y)

RandomizedSearchCV(cv=4,
                   estimator=Pipeline(steps=[('vect',
                                              CountVectorizer(stop_words='english')),
                                             ('gb_model',
                                              GradientBoostingClassifier())]),
                   n_iter=5,
                   param_distributions={'gb_model__learning_rate': [0.01, 0.1,
                                                                    1, 10,
                                                                    100],
                                        'gb_model__max_depth': [1, 3, 5, 7, 9],
                                        'gb_model__n_estimators': [50, 100, 500,
                                                                   1000]},
                   scoring='roc_auc')

In [12]:
print("Best score: ", np.sqrt(np.abs(randomized_search.best_score_)))
print("Best model: ", randomized_search.best_estimator_)

Best score:  0.8426051766365193
Best model:  Pipeline(steps=[('vect', CountVectorizer(stop_words='english')),
                ('gb_model',
                 GradientBoostingClassifier(learning_rate=0.01, max_depth=7,
                                            n_estimators=1000))])


In [13]:
Best_model = Pipeline(steps=[('vect', CountVectorizer(stop_words='english')),
                ('gb_model',
                 GradientBoostingClassifier(learning_rate=0.01, max_depth=7,
                                            n_estimators=1000))])

In [14]:
Best_model.fit(X, y)

Pipeline(steps=[('vect', CountVectorizer(stop_words='english')),
                ('gb_model',
                 GradientBoostingClassifier(learning_rate=0.01, max_depth=7,
                                            n_estimators=1000))])

In [15]:
testdf = pd.read_csv('test.csv')
testdf.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [18]:
X_new = testdf['text']

In [24]:
predictions = Best_model.predict(X_new)

In [26]:
predictions[:10]

array([0, 1, 0, 0, 1, 1, 0, 0, 0, 0])

In [28]:
test_prediction = pd.Series(predictions, name='target')
id_col = testdf['id']

result = pd.concat([id_col, test_prediction], axis=1)

result.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,0
3,9,0
4,11,1


In [29]:
result.to_csv('GBClassifierRandomizedCV.csv', index=False)