In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
# Read the competition's train and test CSV files into DataFrames (train first).
csv_paths = (
    '../input/nlp-getting-started/train.csv',
    '../input/nlp-getting-started/test.csv',
)
train_df, test_df = (pd.read_csv(path) for path in csv_paths)

# Preview the first five rows of each frame.
for frame in (train_df, test_df):
    print(frame.head(5))

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan


In [8]:
# Replace (not drop) missing values: rows without a keyword/location get the
# literal placeholder string "None" so both columns contain only strings.
#
# The filled frame is assigned back to the name instead of calling
# `fillna(..., inplace=True)` on a column selection: that chained in-place form
# operates on a temporary under pandas Copy-on-Write and leaves the DataFrame
# unchanged. This form is also idempotent, so re-running the cell is safe.
na_placeholders = {'keyword': "None", 'location': "None"}
train_df = train_df.fillna(na_placeholders)
test_df = test_df.fillna(na_placeholders)
print(train_df.head(5))
print(test_df.head(5))

   id keyword location                                               text  \
0   1    None     None  Our Deeds are the Reason of this earthquake Ma...   
1   4    None     None              Forest fire near La Ronge Sask Canada   
2   5    None     None  All residents asked to shelter in place are be...   
3   6    None     None  13000 people receive wildfires evacuation orde...   
4   7    None     None  Just got sent this photo from Ruby Alaska as s...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
   id keyword location                                               text
0   0    None     None                 Just happened a terrible car crash
1   2    None     None  Heard about #earthquake is different cities, s...
2   3    None     None  there is a forest fire at spot pond, geese are...
3   9    None     None           Apocalypse lighting. #Spokane #wildfires
4  11    None     None      Typhoon Soudelor kills 28 in China and Taiwan


In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re

train_keyword = train_df["keyword"].values
train_location = train_df["location"].values
train_text = train_df["text"].values

#remove useless characters
for i in range(len(train_keyword)) :
    train_keyword[i] = re.sub('[^0-9a-zA-Z ]', '', train_keyword[i])
    train_location[i] = re.sub('[^0-9a-zA-Z ]', '', train_location[i])
    train_text[i] = re.sub('[^0-9a-zA-Z ]', '', train_text[i])

#make String data to numeric data
tfidf_keyword = TfidfVectorizer(stop_words = stopwords.words('english'), ngram_range = (1, 2))
tfidf_keyword.fit(train_keyword)
train_keyword = tfidf_keyword.transform(train_keyword).toarray()

tfidf_location = TfidfVectorizer(stop_words = stopwords.words('english'), ngram_range = (1, 2))
tfidf_location.fit(train_location)
train_location = tfidf_keyword.transform(train_location).toarray()

tfidf_text = TfidfVectorizer(stop_words = stopwords.words('english'), ngram_range = (1, 2))
tfidf_text.fit(train_text)
train_text = tfidf_keyword.transform(train_text).toarray()

train_x = []
train_y = train_df["target"].values

for i in range(len(train_y)) :
    train_x.append([train_keyword[i], train_location[i], train_text[i]])
    
train_x = np.array(train_x, dtype=object)
print(train_x.shape) #7613, 3, 222
#reshape for sklearn : sklearn allows 2D shape only
train_x = train_x.reshape((train_x.shape[0], -1))
print(train_x.shape) #7613, 666

(7613, 3, 222)
(7613, 666)


In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

lg = LogisticRegression(random_state=0)

#search the best parameter
params = { 'C': list(np.arange(1,10,0.1)) }

#train
grid_cv = GridSearchCV(lg , param_grid=params , cv=5 ,scoring='accuracy', verbose=1 )
grid_cv.fit(train_x , train_y)

#check best parameter and accuracy score
print(grid_cv.best_params_ , round(grid_cv.best_score_,4))

Fitting 5 folds for each of 90 candidates, totalling 450 fits
{'C': 1.0} 0.5392


In [55]:
test_keyword = test_df["keyword"].values
test_location = test_df["location"].values
test_text = test_df["text"].values

#remove useless characters
for i in range(len(test_keyword)) :
    test_keyword[i] = re.sub('[^0-9a-zA-Z ]', '', test_keyword[i])
    test_location[i] = re.sub('[^0-9a-zA-Z ]', '', test_location[i])
    test_text[i] = re.sub('[^0-9a-zA-Z ]', '', test_text[i])

#make String data to numeric data
test_keyword = tfidf_keyword.transform(test_keyword).toarray()

test_location = tfidf_keyword.transform(test_location).toarray()

test_text = tfidf_keyword.transform(test_text).toarray()

test_x = []

for i in range(len(test_keyword)) :
    test_x.append([test_keyword[i], test_location[i], test_text[i]])
    
test_x = np.array(test_x, dtype=object)
print(test_x.shape) #3263, 3, 222
#reshape for sklearn : sklearn allows 2D shape only
test_x = test_x.reshape((test_x.shape[0], -1))
print(test_x.shape) #3263, 666

(3263, 3, 222)
(3263, 666)


In [58]:
# Pair each test id with the label predicted by the tuned model, preview the
# result, and write it out as a CSV without the index column.
save_data = pd.concat(
    [test_df["id"], pd.Series(grid_cv.predict(test_x), name = 'target')],
    axis = 1,
)
print(save_data.head(5))

save_data.to_csv("result.csv", index = False)

   id  target
0   0       1
1   2       1
2   3       1
3   9       0
4  11       1
