In [102]:
import pandas as pd
import numpy as np
import nltk
import re
import sklearn
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize.casual import TweetTokenizer
from nltk.tokenize import RegexpTokenizer
from textblob import TextBlob
from nltk.corpus import stopwords
from collections import Counter
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score

In [103]:
stop_words = set(stopwords.words('english'))
data = pd.read_csv('data/train.csv')

In [104]:
def get_hashtag_column(dataframe):
    hashtags = []
    for text in dataframe.text:
        result = re.findall('#\w+', text)
        if result != []:
            result = [w[1:].lower() for w in result]
            hashtags.append(' '.join(result))
    return hashtags

In [105]:
def lemmatize_texts(texts):
    lemmatizer = WordNetLemmatizer()
    result = []
    for t in texts:
        lemmatized_words = []
        t = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                  'url', t)
        #t = re.sub('\!+', '', t)
        #t = re.sub('\?+', '', t)
        #t = re.sub('\d+[\:|\.]?\d*\s')
        t = re.sub('\d+', '', t)
        tokens = re.findall('''\d+,\d+|\w+'\w+|#?\w+-?\w+|\w+\*+\w+''', t)
        
        if tokens == []:
            print(t)
        
        for token in tokens:
            if token.lower() not in stop_words:
                lemmatized_words.append(lemmatizer.lemmatize(token).lower())
        result.append(' '.join(lemmatized_words).replace('#', ''))
    return result

In [106]:
def tokenizer(text):
    return text.split(' ')

In [107]:
all_lemmatized_texts = lemmatize_texts(data.text)
all_lemmatized_tokens = [w for t in all_lemmatized_texts for w in t.split(' ')]
print('Total words: ', len(all_lemmatized_tokens))
print('Unique_words: ', len(set(all_lemmatized_tokens)))

Total words:  72070
Unique_words:  15827


In [108]:
# Most common words in dataset
freq = nltk.probability.FreqDist(all_lemmatized_tokens)
freq.most_common(20)

[('url', 4114),
 ('û_', 349),
 ('like', 347),
 ('fire', 339),
 ('amp', 327),
 ("i'm", 248),
 ('get', 248),
 ('new', 217),
 ('one', 201),
 ('news', 199),
 ('people', 198),
 ('disaster', 159),
 ('video', 158),
 ('emergency', 156),
 ('time', 147),
 ('body', 145),
 ('day', 141),
 ('police', 141),
 ('year', 133),
 ('would', 132)]

In [109]:
# Most common words in real tweets
real_tweets = data[data.target == 1].text
real_tweets = lemmatize_texts(real_tweets)
freq_real = nltk.probability.FreqDist([w for t in real_tweets for w in t.split(' ')])
freq_real.most_common(10)

[('url', 2228),
 ('fire', 253),
 ('û_', 172),
 ('news', 142),
 ('amp', 130),
 ('disaster', 120),
 ('california', 110),
 ('suicide', 109),
 ('police', 107),
 ('people', 106)]

In [110]:
# Most common words in fake tweets
fake_tweets = data[data.target == 0].text
fake_tweets = lemmatize_texts(fake_tweets)
freq_fake = nltk.probability.FreqDist([w for t in fake_tweets for w in t.split(' ')])
freq_fake.most_common(10)

[('url', 1886),
 ('like', 254),
 ("i'm", 203),
 ('amp', 197),
 ('get', 181),
 ('û_', 177),
 ('new', 163),
 ('one', 132),
 ('body', 114),
 ('would', 98)]

In [124]:
# Split data
X_train_val, X_test, y_train_val, y_test = train_test_split(all_lemmatized_texts, 
                                                    data.target, test_size=0.2, random_state=42)

In [125]:
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, train_size = 0.8, random_state=42)

In [126]:
# Vectorize texts
vectorizer = CountVectorizer(ngram_range=(1,2), tokenizer=tokenizer)
train = vectorizer.fit_transform(X_train)
val = vectorizer.transform(X_val)
test = vectorizer.transform(X_test)

## LinearSVC

In [None]:
# Train SVM (accuracy: 0.78)
from sklearn.svm import LinearSVC
svc = LinearSVC(random_state=0)
# svc_parameters = {'tol' : [1e-7, 1e-6, 1e-5], 
#                   'max_iter': np.arange(400, 1000, 200)
#                  }
# grids_svc = GridSearchCV(svc, svc_parameters, n_jobs=-1, cv=5)
svc.fit(train, y_train)
# clf.score(test, y_test)

In [None]:
# grids_clf.fit(train, y_train)

In [None]:
# grids_clf.best_score_, grids_clf.best_params_

## RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(random_state=42, 
                                n_estimators=71, 
                                min_samples_leaf=2, 
                                max_features=300, 
                                oob_score=True)
forest.fit(train, y_train)
# forest_params = {'max_features': np.arange(100, 1000, 100)}
# grid_forest = GridSearchCV(forest, forest_params, n_jobs=-1, cv=5)
print(forest.oob_score_)

In [None]:
# grid_forest.fit(train, y_train)

In [None]:
# grid_forest.best_score_, grid_forest.best_params_

In [None]:
# (0.7779967159277504, {'n_estimators': 71})
# (0.790311986863711, {'max_features': 'auto'})
# (0.7957307060755336, {'max_features': 300})

## AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
boost = AdaBoostClassifier(base_estimator=sklearn.tree.DecisionTreeClassifier(max_depth=10),
                           random_state=42, algorithm='SAMME')
boost_params = {'n_estimators': np.arange(1, 10),
                'learning_rate': np.arange(0.01, 0.1, 0.01)
                }
grid_boost = GridSearchCV(boost, boost_params, n_jobs=-1, cv=5)
# boost.fit(train, y_train)

In [None]:
grid_boost.fit(train, y_train)

In [None]:
grid_boost.best_score_, grid_boost.best_params_

# Neural Network

In [127]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device('cuda:0')
    from torch.cuda import FloatTensor, LongTensor
else:
    device = torch.device('cpu')
    from torch import FloatTensor, LongTensor
    
try:
    from google.colab import drive
    is_in_colab = True
except:
    is_in_colab = False
    

In [128]:
def data_loader(data, batch_size, shuffle=False):
    features = data[0]
    target = data[1]
    n_samples = features.shape[0]
    
    indices = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(indices)
    
    for start in range(0, n_samples, batch_size):
        end = min(start + batch_size, n_samples)
        batch_indices = indices[start : end]
        X_batch = features[batch_indices].toarray()
        y_batch = target.values[batch_indices]
        yield X_batch, y_batch

In [129]:
def fit(model, loss_function, train_data=None, val_data=None, optimizer=None,
        epoch_count=1, batch_size=1, scheduler=None, alpha=1):
    train_history = []
    val_history = []
    best_model = None
    for epoch in range(epoch_count):
            name_prefix = '[{} / {}] '.format(epoch + 1, epoch_count)
            epoch_train_score = 0
            epoch_val_score = 0
            
            if train_data:
                epoch_train_score = do_epoch(model, loss_function, train_data, batch_size, 
                                              optimizer, name_prefix + 'Train:', alpha=alpha
                                            )
                train_history.append(epoch_train_score)

            if val_data:
                name = '  Val:'
                if not train_data:
                    name = ' Test:'
                epoch_val_score = do_epoch(model, loss_function, val_data, batch_size, 
                                             optimizer=None, name=name_prefix + name, alpha=alpha
                                          )
                
                val_history.append(epoch_val_score)
                if scheduler:
                    scheduler.step(epoch_val_score)
            elif scheduler:
                scheduler.step(epoch_train_score)

    return train_history, val_history
    

In [130]:
def do_epoch(model, loss_function, data, batch_size, optimizer=None, name=None, alpha=1):
    """
       Генерация одной эпохи
    """
    accuracy = 0
    epoch_loss = 0
   
    batch_count = int(data[0].shape[0] / batch_size)
   
    is_train = not optimizer is None
    name = name or ''
    model.train(is_train)
    
    with torch.autograd.set_grad_enabled(is_train):
        with tqdm(total=batch_count) as progress_bar:               
            for ind, (X, y) in enumerate(data_loader(data, batch_size)):
                X_batch, y_batch = FloatTensor(X).to(device), LongTensor(y).to(device)
                
                prediction = model(X_batch)
                
                loss = loss_function(prediction, y_batch)
                
                for param in model.children():
                    if type(param) == nn.Linear:
                        loss += alpha * torch.abs(param.weight).sum()
                        
                epoch_loss += loss.item()
                
                true_indices = torch.argmax(prediction, dim=1)
                correct_samples = torch.sum(true_indices == y_batch).cpu().numpy()
                accuracy += correct_samples / y_batch.shape[0]
                if is_train:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                progress_bar.update()
                progress_bar.set_description('Epoch {} - accuracy: {:.2f}, loss {:.2f}'.format(
                    name, (accuracy / (ind+1)), epoch_loss / (ind+1))
                )
            
            accuracy /= (ind + 1)
            epoch_loss /= (ind + 1) 
            progress_bar.set_description(f'Epoch {name} - accuracy: {accuracy:.2f}, loss: {epoch_loss:.2f}')

    return accuracy

## LinearNN

In [182]:
# fit settings
batch_size = 100
epoch_count = 10

# optim settings
learning_rate = 1e-3
weight_decay = 0
alpha = 0.005

# model settings
linear1_out = int(train.shape[1]**0.5)
output = 2

# scheduler settings
factor = 0.5
patience = 1
threshold = 1e-2

model = nn.Sequential(nn.Linear(train.shape[1], linear1_out),
                      nn.BatchNorm1d(linear1_out),
                      nn.ReLU(inplace=True),
                      nn.Linear(linear1_out, output),
                      nn.ReLU(inplace=True)
                     ).to(device)

loss_function = nn.CrossEntropyLoss()

optimizer = optim.Adam(
                        model.parameters(),
                        lr=learning_rate, 
                        weight_decay=weight_decay
                    )

scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=factor, 
                              patience=patience, verbose=True, threshold=threshold
                              )

In [183]:
_, _ = fit(model, loss_function, train_data=(train, y_train), 
           optimizer=optimizer, batch_size=100, epoch_count=epoch_count,
           alpha = alpha, val_data=(val, y_val), scheduler=scheduler
          )

Epoch [1 / 10] Train: - accuracy: 0.71, loss: 20.50: : 49it [00:01, 29.85it/s]                     
Epoch [1 / 10]   Val: - accuracy: 0.74, loss: 9.91: : 13it [00:00, 31.04it/s]                     
Epoch [2 / 10] Train: - accuracy: 0.75, loss: 9.07: : 49it [00:01, 29.97it/s]                     
Epoch [2 / 10]   Val: - accuracy: 0.77, loss: 8.59: : 13it [00:00, 30.38it/s]                     
Epoch [3 / 10] Train: - accuracy: 0.76, loss: 8.26: : 49it [00:02, 23.95it/s]                     
Epoch [3 / 10]   Val: - accuracy: 0.76, loss: 8.03: : 13it [00:00, 30.78it/s]                     
Epoch [4 / 10] Train: - accuracy: 0.77, loss: 7.94: : 49it [00:02, 24.31it/s]                     
Epoch [4 / 10]   Val: - accuracy: 0.76, loss: 7.85: : 13it [00:00, 30.45it/s]                     
Epoch [5 / 10] Train: - accuracy: 0.74, loss 6.96:   8%|▊         | 4/48 [00:00<00:01, 24.66it/s]

Epoch     4: reducing learning rate of group 0 to 5.0000e-04.


Epoch [5 / 10] Train: - accuracy: 0.80, loss: 4.99: : 49it [00:02, 24.26it/s]                     
Epoch [5 / 10]   Val: - accuracy: 0.78, loss: 4.43: : 13it [00:00, 29.69it/s]                     
Epoch [6 / 10] Train: - accuracy: 0.81, loss: 4.47: : 49it [00:02, 24.41it/s]                     
Epoch [6 / 10]   Val: - accuracy: 0.78, loss: 4.52: : 13it [00:00, 30.82it/s]                     
Epoch [7 / 10] Train: - accuracy: 0.78, loss 4.09:   8%|▊         | 4/48 [00:00<00:01, 23.69it/s]

Epoch     6: reducing learning rate of group 0 to 2.5000e-04.


Epoch [7 / 10] Train: - accuracy: 0.84, loss: 2.92: : 49it [00:02, 24.41it/s]                     
Epoch [7 / 10]   Val: - accuracy: 0.78, loss: 2.54: : 13it [00:00, 30.45it/s]                     
Epoch [8 / 10] Train: - accuracy: 0.84, loss: 2.55: : 49it [00:02, 24.17it/s]                     
Epoch [8 / 10]   Val: - accuracy: 0.79, loss: 2.69: : 13it [00:00, 31.04it/s]                     
Epoch [9 / 10] Train: - accuracy: 0.85, loss: 2.61: : 49it [00:02, 24.09it/s]                     
Epoch [9 / 10]   Val: - accuracy: 0.77, loss: 2.67: : 13it [00:00, 30.31it/s]                     
Epoch [10 / 10] Train: - accuracy: 0.85, loss: 2.61: : 49it [00:02, 23.97it/s]                     
Epoch [10 / 10]   Val: - accuracy: 0.78, loss: 2.74: : 13it [00:00, 30.53it/s]                     

Epoch    10: reducing learning rate of group 0 to 1.2500e-04.





In [201]:
def submission(vectorizer):
    test_data = pd.read_csv("data/test.csv")
    all_lemmatized_texts = lemmatize_texts(test_data.text)
    test = vectorizer.transform(all_lemmatized_texts)
    model.eval()
    submis = pd.DataFrame()
    submis['id'] = test_data['id']
    for ind in range(test.shape[0]):
        X = FloatTensor(test[ind].toarray())
        predict = model(X)
        true_indices = torch.argmax(predict, dim=1).detach().cpu().numpy()
        submis.loc[ind, 'target'] = true_indices
    submis['target'] = submis['target'].astype('int')
    submis.to_csv("data/submission_test.csv", index=False)
    
submission(vectorizer)