In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
import seaborn as sns
from sklearn import feature_extraction, linear_model, model_selection, preprocessing, pipeline, base
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

import nltk
nltk.download('wordnet')

%matplotlib inline

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ahmad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Input directory prefix; no cell visible in this notebook reads it.
INPUT_DIR_PATH = ''

# Use the first CUDA GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Looking at the data

In [4]:
train_df = pd.read_csv('data/train.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
test_df = pd.read_csv('data/test.csv')
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
train_df[train_df['target']==1]['text'].head()

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

In [7]:
train_df[train_df['target']==0]['text'].head()

15                  What's up man?
16                   I love fruits
17                Summer is lovely
18               My car is so fast
19    What a goooooooaaaaaal!!!!!!
Name: text, dtype: object

# Tokenizer

In [24]:
# Split every raw tweet (train and test) into a list of tokens with NLTK's TweetTokenizer.
tweet_tokenizer = TweetTokenizer()

tokenized_data_train = list(map(tweet_tokenizer.tokenize, train_df['text'].values))
tokenized_data_test = list(map(tweet_tokenizer.tokenize, test_df['text'].values))

# Lemmatizer

In [25]:
# Lemmatize each token, then glue the tokens of a tweet back into a single
# space-separated string so the text vectorizers can consume it.
lemmatizer = WordNetLemmatizer()

lemmatized_data_train = [
    ' '.join(map(lemmatizer.lemmatize, tokens))
    for tokens in tokenized_data_train
]

lemmatized_data_test = [
    ' '.join(map(lemmatizer.lemmatize, tokens))
    for tokens in tokenized_data_test
]

# Building vectors

## Count vectorizer

In [11]:
count_vectorizer = feature_extraction.text.CountVectorizer()

example_train_vectors = count_vectorizer.fit_transform(train_df["text"][0:5])

In [13]:
## we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
print(example_train_vectors[0].todense().shape)
print(example_train_vectors[0].todense())

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [18]:
# Looking at the features: show the first ten vocabulary terms learned by the vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# `get_feature_names_out()`, so prefer the new accessor and fall back on older releases.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# Convert to plain `str` so the display is the same list of strings on every version.
[str(name) for name in feature_names[:10]]

['000', '13', 'alaska', 'all', 'allah', 'are', 'as', 'asked', 'being', 'by']

In [19]:
train_vectors = count_vectorizer.fit_transform(train_df["text"])

## Note that we're NOT using .fit_transform() here. Calling just .transform() reuses the
# vocabulary learned from the training data, so the train and test vectors share the same
# set of tokens (columns); tokens that appear only in the test data are ignored.
test_vectors = count_vectorizer.transform(test_df["text"])

## TF-IDF vectorizer

In [44]:
# Create frequency limits
MAX_DF = 0.95
MIN_DF = 0.01

tf_idf_vectorizer = feature_extraction.text.TfidfVectorizer(max_df=MAX_DF, min_df=MIN_DF)

example_train_vectors = tf_idf_vectorizer.fit_transform(train_df['text'][0:5])

In [45]:
print("Shape: ", example_train_vectors[0].todense().shape)
print(example_train_vectors[0].todense())

Shape:  (1, 54)
[[0.         0.         0.         0.23336118 0.28924517 0.23336118
  0.         0.         0.         0.         0.         0.
  0.28924517 0.28924517 0.         0.         0.         0.
  0.28924517 0.         0.         0.         0.         0.
  0.         0.28924517 0.         0.         0.         0.28924517
  0.         0.         0.         0.         0.28924517 0.
  0.         0.         0.         0.28924517 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.28924517 0.23336118 0.         0.28924517 0.        ]]


In [46]:
train_vectors = tf_idf_vectorizer.fit_transform(train_df['text'])
test_vectors = tf_idf_vectorizer.transform(test_df['text'])

In [78]:
type(train_df['target'][0])

numpy.int64

# Dataset

In [80]:
class SparseDataset(Dataset):
    """Map-style dataset that serves rows of a sparse feature matrix as dense tensors.

    Args:
        features: 2-D sparse matrix (one row per sample) that exposes ``.shape`` and
            whose indexed rows support ``.toarray()``.
        labels: Indexable container of integer labels, one per row of ``features``.
            It is indexed with the same ``idx`` as ``features``.
    """

    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        # SciPy sparse matrices raise TypeError on len() ("sparse matrix length is
        # ambiguous"), so the sample count must come from the first dimension.
        return self.features.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx`` as float and long tensors."""
        # Densify only the requested row; ``toarray()`` yields shape (1, n_features).
        cur_features = torch.from_numpy(self.features[idx].toarray()[0]).float()
        cur_labels = torch.from_numpy(np.asarray(self.labels[idx])).long()

        return cur_features, cur_labels

# Submissions

## First submission

In [47]:
clf1 = linear_model.RidgeClassifier()

In [48]:
scores = model_selection.cross_val_score(clf1, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

array([0.62269939, 0.58020833, 0.64794189])



The above scores aren't terrible! It looks like this baseline will score roughly 0.65 on the leaderboard. There are lots of ways to potentially improve on this (TF-IDF, LSA, LSTMs / RNNs, the list is long!) - give any of them a shot!

In [49]:
clf1.fit(train_vectors, train_df['target'])

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [30]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [53]:
# Predict the test set with the first classifier.
prediction1 = clf1.predict(test_vectors)

# Pair the ids from the sample submission with the predictions and save them.
sub1 = pd.DataFrame({
    'id': sample_submission['id'].values,
    'target': prediction1,
})

sub1.to_csv('submissions/sub1.csv', index=False)

In [54]:
sub1.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,0
3,9,0
4,11,1


## Second submission

In [59]:
# Ridge classifier for the second submission.
clf2 = linear_model.RidgeClassifier()

# Score clf2 itself (the estimator this section fits and submits), not clf1
# from the first submission.
scores = model_selection.cross_val_score(clf2, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

array([0.62269939, 0.58020833, 0.64794189])

In [60]:
clf2.fit(train_vectors, train_df['target'])

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [61]:
# Predict the test set with the second classifier.
prediction2 = clf2.predict(test_vectors)

# Pair the ids from the sample submission with the predictions and save them.
sub2 = pd.DataFrame({
    'id': sample_submission['id'].values,
    'target': prediction2,
})

sub2.to_csv('submissions/sub2.csv', index=False)

In [62]:
sub2.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,0
3,9,0
4,11,1


## Third submission

In [85]:
"""TF-IDF without frequency limits. Chose penalty for Logistic regression."""

tf_idf_vectorizer = feature_extraction.text.TfidfVectorizer()

train_vectors = tf_idf_vectorizer.fit_transform(train_df['text'])
test_vectors = tf_idf_vectorizer.transform(test_df['text'])

In [91]:
log_reg_clf1 = linear_model.LogisticRegression(penalty='l1', solver='saga', n_jobs=-1, max_iter=10000)
scores = model_selection.cross_val_score(log_reg_clf1, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

array([0.5513054 , 0.52852154, 0.57535754])

In [94]:
log_reg_clf2 = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', n_jobs=-1, max_iter=10000)
scores = model_selection.cross_val_score(log_reg_clf2, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

array([0.62111801, 0.59760087, 0.68120457])

In [95]:
log_reg_clf3 = linear_model.LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, n_jobs=-1, max_iter=10000)
scores = model_selection.cross_val_score(log_reg_clf3, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

array([0.58352941, 0.57063093, 0.62480043])

In [96]:
# Fit the L2-penalised logistic regression on all training vectors and predict the
# test set in one chained call (fit() returns the fitted estimator itself).
prediction3 = log_reg_clf2.fit(train_vectors, train_df['target']).predict(test_vectors)

# Pair the ids from the sample submission with the predictions and save them.
sub3 = pd.DataFrame({
    'id': sample_submission['id'].values,
    'target': prediction3,
})

sub3.to_csv('submissions/sub3.csv', index=False)

In [97]:
sub3.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1


# Pipeline

In [44]:
pipline1 = pipeline.Pipeline([
    ('tf_idf_vectorizer', feature_extraction.text.TfidfVectorizer()), 
    ('clf', linear_model.LogisticRegression(penalty='l2', solver='lbfgs', n_jobs=-1, max_iter=10000))
])

In [45]:
pipline1.fit(train_df['text'], train_df['target'])

Pipeline(memory=None,
         steps=[('tf_idf_vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, inter

In [46]:
pipline1.predict(test_df['text'])

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

## Fourth submission

In [39]:
"""Tokenizer + Lemmatizer + TF-IDF Vectorizer"""

tf_idf_vectorizer = feature_extraction.text.TfidfVectorizer(max_df=0.95)

train_vectors = tf_idf_vectorizer.fit_transform(lemmatized_data_train)
test_vectors = tf_idf_vectorizer.transform(lemmatized_data_test)

In [40]:
log_reg_clf2 = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', n_jobs=-1, max_iter=10000)

In [41]:
# Fit on the lemmatized TF-IDF vectors and predict the test set in one chained call
# (fit() returns the fitted estimator itself).
prediction4 = log_reg_clf2.fit(train_vectors, train_df['target']).predict(test_vectors)

# Pair the ids from the sample submission with the predictions and save them.
sub4 = pd.DataFrame({
    'id': sample_submission['id'].values,
    'target': prediction4,
})

sub4.to_csv('submissions/sub4.csv', index=False)

In [42]:
sub4.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1


### Try to use a pipeline

In [50]:
# TransformerMixin gives your transformer the very useful .fit_transform method.
# BaseEstimator gives your transformer grid-searchable parameters.

class MyTweetTokenizer(base.TransformerMixin, base.BaseEstimator):
    """Stateless scikit-learn transformer that tokenizes texts with NLTK's ``TweetTokenizer``.

    ``fit`` learns nothing; ``transform`` maps each input text to its list of tokens.
    """

    def __init__(self):
        self.tweet_tokenizer = TweetTokenizer()

    def fit(self, X, y=None):
        """Nothing to fit; return ``self`` so the transformer can sit in a pipeline."""
        return self

    def transform(self, X):
        """Tokenize every text in ``X``.

        Args:
            X: Texts to tokenize. pandas objects are read through ``.values`` (as
                before); any other iterable of strings (list, tuple, ...) is iterated
                directly instead of failing with ``AttributeError``.

        Returns:
            A list with one list of tokens per input text.
        """
        sentences = getattr(X, 'values', X)
        return [self.tweet_tokenizer.tokenize(sentence) for sentence in sentences]

In [52]:
class MyLemmatizer(base.TransformerMixin, base.BaseEstimator):
    """Stateless transformer: lemmatizes token lists and joins each back into one string."""

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X):
        """Return one space-joined string of lemmas for each token sequence in ``X``."""
        joined_texts = []
        for tokens in X:
            lemmas = (self.lemmatizer.lemmatize(word) for word in tokens)
            joined_texts.append(' '.join(lemmas))
        return joined_texts

In [53]:
pipline2 = pipeline.Pipeline([
    ('tokenizer', MyTweetTokenizer()),
    ('lemmatizer', MyLemmatizer()),
    ('vectorizer', feature_extraction.text.TfidfVectorizer()),
    ('clf', linear_model.LogisticRegression(penalty='l2', solver='lbfgs', n_jobs=-1, max_iter=10000))
])

pipline2 = pipline2.fit(train_df['text'], train_df['target'])

In [54]:
# Predict straight from the raw test texts with the fitted pipeline.
prediction4 = pipline2.predict(test_df['text'])

# Pair the ids from the sample submission with the predictions and save them.
# NOTE: this writes to the same path as the earlier fourth-submission cell.
sub4 = pd.DataFrame({
    'id': sample_submission['id'].values,
    'target': prediction4,
})

sub4.to_csv('submissions/sub4.csv', index=False)

In [55]:
sub4.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1


# Plan for next submissions

План:
1. Потренируйся писать сложные регулярки
2. Learning rate scheduler
3. AUC ROC, F1
4. RNN
5. Attention
6. Графовые свёрточные нейросети(почитай про них)
7. Посмотри лекцию про эмбеддинги
8. POS tagging