In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [3]:
# Read the competition's train and test CSV files into DataFrames.
train_csv_path = '/kaggle/input/nlp-getting-started/train.csv'
test_csv_path = '/kaggle/input/nlp-getting-started/test.csv'

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [4]:
# Peek at the fourth tweet (position 3) among the rows whose target is 0.
target_is_zero = train_df["target"] == 0
train_df.loc[target_is_zero, "text"].iloc[3]

'My car is so fast'

In [5]:
# Peek at the second tweet (position 1) among the rows whose target is 1.
target_is_one = train_df["target"] == 1
train_df.loc[target_is_one, 'text'].iloc[1]

'Forest fire near La Ronge Sask. Canada'

**Building Vectors**
The theory behind the model we'll build in this notebook is pretty simple: the words contained in each tweet are a good indicator of whether it is about a real disaster or not.

We'll use scikit-learn's CountVectorizer to count the words in each tweet and turn them into data our machine learning model can process.



In [6]:
# Vectorizer used for the real train/test features; it is fitted later on the
# full training text, so it is deliberately left unfitted here.
count_vectorizer = feature_extraction.text.CountVectorizer()

# let's get counts for the first 5 tweets in the data
# A separate throwaway vectorizer is fitted for this demo so that re-running
# this cell never leaves `count_vectorizer` holding the tiny 5-tweet vocabulary.
example_vectorizer = feature_extraction.text.CountVectorizer()
example_train_vectors = example_vectorizer.fit_transform(train_df['text'].iloc[:5])

In [7]:
# The vectors are stored "sparse", so densify the first tweet's row once
# with .todense() and print both its shape and its contents.
first_tweet_dense = example_train_vectors[0].todense()
print(first_tweet_dense.shape)
print(first_tweet_dense)

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


The above tells us that:

1. There are 54 unique words (or "tokens") in the first five tweets.
2. The first tweet contains only some of those unique tokens - all of the non-zero counts above are the tokens that DO exist in the first tweet.

Now let's create vectors for all of our tweets.

In [8]:
train_texts = train_df['text']
test_texts = test_df['text']

# fit_transform learns the vocabulary from the training tweets and encodes them.
train_vectors = count_vectorizer.fit_transform(train_texts)

# The test tweets are only transformed (not fitted), so they are encoded with
# the vocabulary learned from the training tweets and share the same columns.
test_vectors = count_vectorizer.transform(test_texts)

In [9]:
# Classifier for the vectorized tweets: scikit-learn's RidgeClassifier,
# constructed with its default settings (no arguments are passed).
clf = linear_model.RidgeClassifier()

In [10]:
# Score the classifier with 3-fold cross-validation on the training vectors,
# using F1 as the metric; one score is returned per fold.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=train_df['target'],
    scoring='f1',
    cv=3,
)
scores

array([0.59453669, 0.5642787 , 0.64082434])

There are a lot of ways to potentially improve on this baseline (TF-IDF, LSA, LSTMs / RNNs, the list is long!) - give any of them a shot!

In [11]:
# Fit the classifier on all of the training vectors and their target labels.
clf.fit(X=train_vectors, y=train_df['target'])