In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import sklearn
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize.casual import TweetTokenizer
from nltk.tokenize import RegexpTokenizer
from textblob import TextBlob
from nltk.corpus import stopwords
from collections import Counter
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score

In [2]:
stop_words = set(stopwords.words('english'))
data = pd.read_csv('data/train.csv')

In [3]:
def get_hashtag_column(dataframe):
    hashtags = []
    for text in dataframe.text:
        result = re.findall('#\w+', text)
        if result != []:
            result = [w[1:].lower() for w in result]
            hashtags.append(' '.join(result))
    return hashtags

In [4]:
def lemmatize_texts(texts):
    lemmatizer = WordNetLemmatizer()
    result = []
    for t in texts:
        lemmatized_words = []
        t = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                  'url', t)
        #t = re.sub('\!+', '', t)
        #t = re.sub('\?+', '', t)
        #t = re.sub('\d+[\:|\.]?\d*\s')
        t = re.sub('\d+', '', t)
        tokens = re.findall('''\d+,\d+|\w+'\w+|#?\w+-?\w+|\w+\*+\w+''', t)
        
        if tokens == []:
            print(t)
        
        for token in tokens:
            if token.lower() not in stop_words:
                lemmatized_words.append(lemmatizer.lemmatize(token).lower())
        result.append(' '.join(lemmatized_words).replace('#', ''))
    return result

In [5]:
def tokenizer(text):
    return text.split(' ')

In [6]:
all_lemmatized_texts = lemmatize_texts(data.text)
all_lemmatized_tokens = [w for t in all_lemmatized_texts for w in t.split(' ')]
print('Total words: ', len(all_lemmatized_tokens))
print('Unique_words: ', len(set(all_lemmatized_tokens)))

Total words:  72070
Unique_words:  15827


In [7]:
# Most common words in dataset
freq = nltk.probability.FreqDist(all_lemmatized_tokens)
freq.most_common(20)

[('url', 4114),
 ('û_', 349),
 ('like', 347),
 ('fire', 339),
 ('amp', 327),
 ("i'm", 248),
 ('get', 248),
 ('new', 217),
 ('one', 201),
 ('news', 199),
 ('people', 198),
 ('disaster', 159),
 ('video', 158),
 ('emergency', 156),
 ('time', 147),
 ('body', 145),
 ('day', 141),
 ('police', 141),
 ('year', 133),
 ('would', 132)]

In [8]:
# Most common words in real tweets
real_tweets = data[data.target == 1].text
real_tweets = lemmatize_texts(real_tweets)
freq_real = nltk.probability.FreqDist([w for t in real_tweets for w in t.split(' ')])
freq_real.most_common(10)

[('url', 2228),
 ('fire', 253),
 ('û_', 172),
 ('news', 142),
 ('amp', 130),
 ('disaster', 120),
 ('california', 110),
 ('suicide', 109),
 ('police', 107),
 ('people', 106)]

In [9]:
# Most common words in fake tweets
fake_tweets = data[data.target == 0].text
fake_tweets = lemmatize_texts(fake_tweets)
freq_fake = nltk.probability.FreqDist([w for t in fake_tweets for w in t.split(' ')])
freq_fake.most_common(10)

[('url', 1886),
 ('like', 254),
 ("i'm", 203),
 ('amp', 197),
 ('get', 181),
 ('û_', 177),
 ('new', 163),
 ('one', 132),
 ('body', 114),
 ('would', 98)]

In [10]:
# Split data
X_train_val, X_test, y_train_val, y_test = train_test_split(all_lemmatized_texts, 
                                                    data.target, test_size=0.2, random_state=42)

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, train_size = 0.8, random_state=42)

In [12]:
# Vectorize texts
vectorizer = CountVectorizer(ngram_range=(1,2), tokenizer=tokenizer)
train = vectorizer.fit_transform(X_train)
val = vectorizer.transform(X_val)
test = vectorizer.transform(X_test)

## LinearSVC

In [87]:
# Train SVM (accuracy: 0.78)
from sklearn.svm import LinearSVC
svc = LinearSVC(random_state=0)
# svc_parameters = {'tol' : [1e-7, 1e-6, 1e-5], 
#                   'max_iter': np.arange(400, 1000, 200)
#                  }
# grids_svc = GridSearchCV(svc, svc_parameters, n_jobs=-1, cv=5)
svc.fit(train, y_train)
svc.score(val, y_val)

0.7783251231527094

In [88]:
# grids_clf.fit(train, y_train)

In [89]:
# grids_clf.best_score_, grids_clf.best_params_

## RandomForestClassifier

In [90]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(random_state=42, 
                                n_estimators=71, 
                                min_samples_leaf=2, 
                                max_features=300, 
                                oob_score=True)
forest.fit(train, y_train)
# forest_params = {'max_features': np.arange(100, 1000, 100)}
# grid_forest = GridSearchCV(forest, forest_params, n_jobs=-1, cv=5)
print(forest.oob_score_)

0.7898193760262726


In [91]:
forest.score(val, y_val)

0.7824302134646962

In [92]:
importance = sorted(zip(vectorizer.get_feature_names(), forest.feature_importances_), key=lambda x: x[1], reverse=True)
for imp in importance[:20]: print("Feature '{}', importance={}".format(*imp))

Feature 'url', importance=0.038834826701685504
Feature 'fire', importance=0.014192495093343922
Feature 'hiroshima', importance=0.010890804969129145
Feature 'killed', importance=0.008864163516005616
Feature 'suicide', importance=0.007806427495297494
Feature 'california', importance=0.007719488555323708
Feature 'train', importance=0.006713296992594389
Feature 'earthquake', importance=0.006668208361033888
Feature 'japan', importance=0.00631094108572016
Feature 'storm', importance=0.0056789492384608856
Feature 'bombing', importance=0.005557711470771259
Feature 'attack', importance=0.005226616789743979
Feature 'wildfire', importance=0.005109212224453291
Feature 'evacuated', importance=0.004866605547471444
Feature 'disaster', importance=0.00483162619307037
Feature 'mh', importance=0.004543441517120972
Feature 'massacre', importance=0.004405012795758994
Feature 'terrorist', importance=0.004375640380339555
Feature 'atomic', importance=0.004348805204204863
Feature 'sinkhole', importance=0.00421

In [18]:
# grid_forest.fit(train, y_train)

In [19]:
# grid_forest.best_score_, grid_forest.best_params_

In [20]:
# (0.7779967159277504, {'n_estimators': 71})
# (0.790311986863711, {'max_features': 'auto'})
# (0.7957307060755336, {'max_features': 300})

## AdaBoostClassifier

In [27]:
from sklearn.ensemble import AdaBoostClassifier
boost = AdaBoostClassifier(base_estimator=sklearn.tree.DecisionTreeClassifier(),
                           random_state=42, algorithm='SAMME', learning_rate=0.08,
                           n_estimators = 7
                          )
boost_params = {'n_estimators': np.arange(1, 10),
                'learning_rate': np.arange(0.01, 0.1, 0.01)
                }
grid_boost = GridSearchCV(boost, boost_params, n_jobs=-1, cv=5)
boost.fit(train, y_train)

AdaBoostClassifier(algorithm='SAMME',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=None,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                            

In [84]:
boost.score(val, y_val)

0.7216748768472906

In [25]:
# grid_boost.fit(train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=AdaBoostClassifier(algorithm='SAMME',
                                          base_estimator=DecisionTreeClassifier(class_weight=None,
                                                                                criterion='gini',
                                                                                max_depth=None,
                                                                                max_features=None,
                                                                                max_leaf_nodes=None,
                                                                                min_impurity_decrease=0.0,
                                                                                min_impurity_split=None,
                                                                                min_samples_leaf=1,
                                                                                mi

In [26]:
# grid_boost.best_score_, grid_boost.best_params_

(0.7489737274220033, {'learning_rate': 0.08, 'n_estimators': 7})

In [52]:
type(boost)

sklearn.ensemble.weight_boosting.AdaBoostClassifier

# Neural Network

In [58]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device('cuda:0')
    from torch.cuda import FloatTensor, LongTensor
else:
    device = torch.device('cpu')
    from torch import FloatTensor, LongTensor
    
try:
    from google.colab import drive
    is_in_colab = True
except:
    is_in_colab = False
    

In [59]:
def data_loader(data, batch_size, shuffle=False):
    features = data[0]
    target = data[1]
    n_samples = features.shape[0]
    
    indices = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(indices)
    
    for start in range(0, n_samples, batch_size):
        end = min(start + batch_size, n_samples)
        batch_indices = indices[start : end]
        X_batch = features[batch_indices].toarray()
        y_batch = target.values[batch_indices]
        yield X_batch, y_batch

In [60]:
def fit(model, loss_function, train_data=None, val_data=None, optimizer=None,
        epoch_count=1, batch_size=1, scheduler=None, alpha=1):
    train_history = []
    val_history = []
    best_model = None
    for epoch in range(epoch_count):
            name_prefix = '[{} / {}] '.format(epoch + 1, epoch_count)
            epoch_train_score = 0
            epoch_val_score = 0
            
            if train_data:
                epoch_train_score = do_epoch(model, loss_function, train_data, batch_size, 
                                              optimizer, name_prefix + 'Train:', alpha=alpha
                                            )
                train_history.append(epoch_train_score)

            if val_data:
                name = '  Val:'
                if not train_data:
                    name = ' Test:'
                epoch_val_score = do_epoch(model, loss_function, val_data, batch_size, 
                                             optimizer=None, name=name_prefix + name, alpha=alpha
                                          )
                
                val_history.append(epoch_val_score)
                if scheduler:
                    scheduler.step(epoch_val_score)
            elif scheduler:
                scheduler.step(epoch_train_score)

    return train_history, val_history
    

In [61]:
def do_epoch(model, loss_function, data, batch_size, optimizer=None, name=None, alpha=1):
    """
       Генерация одной эпохи
    """
    accuracy = 0
    epoch_loss = 0
   
    batch_count = int(data[0].shape[0] / batch_size)
   
    is_train = not optimizer is None
    name = name or ''
    model.train(is_train)
    
    with torch.autograd.set_grad_enabled(is_train):
        with tqdm(total=batch_count) as progress_bar:               
            for ind, (X, y) in enumerate(data_loader(data, batch_size)):
                X_batch, y_batch = FloatTensor(X).to(device), LongTensor(y).to(device)
                
                prediction = model(X_batch)
                
                loss = loss_function(prediction, y_batch)
                
                for param in model.children():
                    if type(param) == nn.Linear:
                        loss += alpha * torch.abs(param.weight).sum()
                        
                epoch_loss += loss.item()
                
                true_indices = torch.argmax(prediction, dim=1)
                correct_samples = torch.sum(true_indices == y_batch).cpu().numpy()
                accuracy += correct_samples / y_batch.shape[0]
                if is_train:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                progress_bar.update()
                progress_bar.set_description('Epoch {} - accuracy: {:.2f}, loss {:.2f}'.format(
                    name, (accuracy / (ind+1)), epoch_loss / (ind+1))
                )
            
            accuracy /= (ind + 1)
            epoch_loss /= (ind + 1) 
            progress_bar.set_description(f'Epoch {name} - accuracy: {accuracy:.2f}, loss: {epoch_loss:.2f}')

    return accuracy

## LinearNN

In [96]:
class NNModel():
    def __init__(self, model):
        self.model = model
        
    def predict(self, inputs):
        self.model.eval()
        output = pd.DataFrame()
        for ind in range(inputs.shape[0]):
            X = FloatTensor(test[ind].toarray())
            predict = self.model(X)
            true_indices = torch.argmax(predict, dim=1).detach().cpu().numpy()
            output.loc[ind, 'target'] = true_indices
        return output.values

In [103]:
# fit settings
batch_size = 100
epoch_count = 10

# optim settings
learning_rate = 1e-3
weight_decay = 0
alpha = 0.0001

# model settings
linear1_out = int(train.shape[1]**0.5)
output = 2
dropout = 0.3

# scheduler settings
factor = 0.5
patience = 3
threshold = 1e-2

model = nn.Sequential(nn.Linear(train.shape[1], linear1_out),
#                       nn.BatchNorm1d(linear1_out),
#                       nn.Dropout(p=dropout, inplace=True),
                      nn.ReLU(inplace=True),
                      nn.Linear(linear1_out, output),
                      nn.ReLU(inplace=True)
                     ).to(device)

loss_function = nn.CrossEntropyLoss()

optimizer = optim.Adam(
                        model.parameters(),
                        lr=learning_rate, 
                        weight_decay=weight_decay
                    )

scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=factor, 
                              patience=patience, verbose=True, threshold=threshold
                              )

In [104]:
optimizer = optim.Adam(
                        model.parameters(),
                        lr=learning_rate, 
                        weight_decay=weight_decay
                    )

scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=factor, 
                              patience=patience, verbose=True, threshold=threshold
                              )

In [105]:
_, _ = fit(model, loss_function, train_data=(train, y_train), 
           optimizer=optimizer, batch_size=100, epoch_count=epoch_count,
           alpha = alpha, val_data=(val, y_val), scheduler=scheduler
          )

Epoch [1 / 10] Train: - accuracy: 0.74, loss: 1.07: : 49it [00:01, 27.71it/s]                     
Epoch [1 / 10]   Val: - accuracy: 0.77, loss: 0.88: : 13it [00:00, 40.09it/s]                     
Epoch [2 / 10] Train: - accuracy: 0.78, loss: 0.89: : 49it [00:01, 28.09it/s]                     
Epoch [2 / 10]   Val: - accuracy: 0.78, loss: 0.94: : 13it [00:00, 39.72it/s]                     
Epoch [3 / 10] Train: - accuracy: 0.80, loss: 0.94: : 49it [00:01, 33.16it/s]                     
Epoch [3 / 10]   Val: - accuracy: 0.78, loss: 0.99: : 13it [00:00, 33.72it/s]                     
Epoch [4 / 10] Train: - accuracy: 0.81, loss: 0.96: : 49it [00:01, 28.93it/s]                     
Epoch [4 / 10]   Val: - accuracy: 0.78, loss: 1.01: : 13it [00:00, 38.45it/s]                     
Epoch [5 / 10] Train: - accuracy: 0.83, loss: 0.95: : 49it [00:01, 27.66it/s]                     
Epoch [5 / 10]   Val: - accuracy: 0.77, loss: 1.02: : 13it [00:00, 38.39it/s]                     
Epoch [6 /

Epoch     6: reducing learning rate of group 0 to 5.0000e-04.


Epoch [7 / 10] Train: - accuracy: 0.87, loss: 0.74: : 49it [00:01, 27.77it/s]                     
Epoch [7 / 10]   Val: - accuracy: 0.78, loss: 0.78: : 13it [00:00, 39.53it/s]                     
Epoch [8 / 10] Train: - accuracy: 0.87, loss: 0.66: : 49it [00:01, 27.51it/s]                     
Epoch [8 / 10]   Val: - accuracy: 0.78, loss: 0.79: : 13it [00:00, 38.57it/s]                     
Epoch [9 / 10] Train: - accuracy: 0.87, loss: 0.66: : 49it [00:01, 26.67it/s]                     
Epoch [9 / 10]   Val: - accuracy: 0.78, loss: 0.80: : 13it [00:00, 38.56it/s]                     
Epoch [10 / 10] Train: - accuracy: 0.87, loss: 0.65: : 49it [00:01, 28.00it/s]                     
Epoch [10 / 10]   Val: - accuracy: 0.78, loss: 0.80: : 13it [00:00, 34.67it/s]                     

Epoch    10: reducing learning rate of group 0 to 2.5000e-04.





## Create ensemble

In [100]:
models = []
models.append(svc)
models.append(forest)
# models.append(boost)
models.append(NNModel(model))

In [98]:
def ensemble(models, data):
    predicts = pd.DataFrame()
    for i, model in enumerate(models):
        predicts[i] = model.predict(data)
    result = predicts.apply(lambda row: row.value_counts().index[0], axis=1)
    return result.values

In [99]:
accuracy_score(ensemble(models, val), y_val)

0.7799671592775042

## Submission

In [None]:
# choose your best model
final_model = NNModel(model)

In [None]:
def submission(model, vectorizer, path="data/submission_test.csv"):
    test_data = pd.read_csv("data/test.csv")
    all_lemmatized_texts = lemmatize_texts(test_data.text)
    test = vectorizer.transform(all_lemmatized_texts)
    submit = pd.DataFrame()
    submit['id'] = test_data['id']
    submit['target'] = model.predict(test)
    submit['target'] = submit['target'].astype('int')
    submit.to_csv(path, index=False)
    
submission(final_model, vectorizer)

def submission(model, vectorizer, path="data/submission_test.csv"):
    test_data = pd.read_csv("data/test.csv")
    all_lemmatized_texts = lemmatize_texts(test_data.text)
    test = vectorizer.transform(all_lemmatized_texts)
    submis = pd.DataFrame()
    submis['id'] = test_data['id']
    submit['target'] = model.predict(test)
    submis['target'] = submis['target'].astype('int')
    submis.to_csv(path, index=False)
    
submission(final_mode, vectorizer)

In [None]:
# choose your best model
final_model = NNModel(model)

In [201]:
def submission(model, vectorizer, path="data/submission_test.csv"):
    test_data = pd.read_csv("data/test.csv")
    all_lemmatized_texts = lemmatize_texts(test_data.text)
    test = vectorizer.transform(all_lemmatized_texts)
    submis = pd.DataFrame()
    submis['id'] = test_data['id']
    submit['target'] = model.predict(test)
    submis['target'] = submis['target'].astype('int')
    submis.to_csv(path, index=False)
    
submission(final_mode, vectorizer)