In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from tqdm import tqdm
from collections import Counter
from textblob import TextBlob

from nltk.stem import WordNetLemmatizer 
from nltk.tokenize.casual import TweetTokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

import sklearn
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score


import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [None]:
# Select the compute device and the matching legacy tensor constructors.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if is_cuda else 'cpu')
if is_cuda:
    from torch.cuda import FloatTensor, LongTensor
else:
    from torch import FloatTensor, LongTensor

# Detect Google Colab by probing for its helper package. Only ImportError is
# caught, so unrelated failures are no longer silently swallowed.
try:
    from google.colab import drive
    is_in_colab = True
except ImportError:
    is_in_colab = False

In [None]:
# Detect Google Colab; catch only ImportError so real failures stay visible.
try:
    from google.colab import drive
    is_in_colab = True
except ImportError:
    is_in_colab = False

if is_in_colab:
    # Fetch the NLTK corpora used later (stop-word list and WordNet). This is
    # done outside the try-block so a download problem is not mistaken for
    # "not running in Colab".
    nltk.download('stopwords')
    nltk.download('wordnet')
    drive.mount('/content/drive')
    data_folder = r'/content/drive/My Drive/Colab/Real-or-Not/data/'
else:
    data_folder = r'./data/'

In [None]:
# English stop-word list as a set, and the training frame read from disk.
stop_words = set(stopwords.words('english'))
data = pd.read_csv(f'{data_folder}/train.csv')

In [None]:
# Print the summary pandas produces for the training frame.
data.info()

In [None]:
# Display the column labels of the training frame.
data.columns

# Preprocessing functions

In [None]:
def get_hashtag_column(dataframe):
    """Extract the hashtags of every entry of ``dataframe.text``.

    Args:
        dataframe: object exposing an iterable ``text`` attribute of strings.

    Returns:
        list of str with exactly one entry per text, in input order. Each
        entry holds the text's hashtags, lower-cased, without the leading
        ``#`` and joined by single spaces. Texts without hashtags yield an
        empty string, so the result stays aligned with the input rows and can
        be assigned as a column.
    """
    hashtag_pattern = re.compile(r'#\w+')
    return [
        ' '.join(tag[1:].lower() for tag in hashtag_pattern.findall(text))
        for text in dataframe.text
    ]

In [None]:
def lemmatize_texts(texts):
    """Normalise raw texts into space-joined, lower-cased lemmas.

    For every text: URLs are replaced by the literal token ``url``, the text
    is tokenised with a regular expression, tokens found in the module-level
    ``stop_words`` set are dropped, and the remaining tokens are lower-cased
    and passed through ``WordNetLemmatizer``. ``#`` characters are removed
    from the joined result.

    Args:
        texts: iterable of strings.

    Returns:
        list of str, one entry per input text, in input order. A text that
        yields no tokens is printed and contributes an empty string.
    """
    lemmatizer = WordNetLemmatizer()
    # Raw strings keep regex escapes such as \d, \w and \( away from Python's
    # own string-escape processing. The URL character classes deliberately
    # contain no space, so a match ends with the URL itself and the words that
    # follow it are kept.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    # Every alternative needs at least two word characters, so isolated single
    # characters are never returned as tokens.
    token_pattern = re.compile(r'''\d+,\d+|\w+'\w+|#?\w+-?\w+|\w+\*+\w+''')

    result = []
    for text in texts:
        text = url_pattern.sub('url', text)
        tokens = token_pattern.findall(text)

        if not tokens:
            # Surface inputs that end up with no tokens at all.
            print(text)

        lemmas = []
        for token in tokens:
            lowered = token.lower()
            if lowered in stop_words:
                continue
            # Lemmatise the lower-cased form so capitalised words are looked
            # up the same way as their lower-case counterparts.
            lemmas.append(lemmatizer.lemmatize(lowered))
        result.append(' '.join(lemmas).replace('#', ''))
    return result

In [None]:
def tokenizer(text):
    """Split ``text`` on single space characters.

    The separator is given explicitly, so consecutive spaces produce
    empty-string tokens and an empty input produces ``['']``.
    """
    separator = ' '
    pieces = text.split(separator)
    return pieces

In [None]:
# Metadata columns; missing values become the placeholder string '-9999'.
features = data[['keyword', 'location']].fillna('-9999')

In [None]:
# Add the lemmatised tweet text as a feature column and report corpus size.
lemmatized_texts = lemmatize_texts(data.text)
features['text'] = pd.Series(lemmatized_texts, name='text')

all_lemmatized_tokens = []
for lemmatized in features['text']:
    all_lemmatized_tokens.extend(lemmatized.split(' '))

print('Total words: ', len(all_lemmatized_tokens))
print('Unique_words: ', len(set(all_lemmatized_tokens)))

In [None]:
# Frequency distribution over every lemmatised token in the dataset;
# uncomment the last line to list the 20 most common ones.
freq = nltk.probability.FreqDist(all_lemmatized_tokens)
# freq.most_common(20)

In [None]:
# Token frequencies over the tweets labelled target == 1 ("real").
real_tweets = lemmatize_texts(data.loc[data.target == 1, 'text'])
real_tokens = [word for tweet in real_tweets for word in tweet.split(' ')]
freq_real = nltk.probability.FreqDist(real_tokens)
# freq_real.most_common(10)

In [None]:
# Token frequencies over the tweets labelled target == 0 ("fake").
fake_tweets = lemmatize_texts(data.loc[data.target == 0, 'text'])
fake_tokens = [word for tweet in fake_tweets for word in tweet.split(' ')]
freq_fake = nltk.probability.FreqDist(fake_tokens)
# freq_fake.most_common(10)

In [None]:
# Hold out 20% of the rows as the test split.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    features,
    data.target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Split the remaining rows again: 80% for training, the rest for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Display (rows, columns) of the training split.
X_train.shape

In [None]:
# Fit a preliminary TF-IDF vectorizer whose only purpose is to expose the
# inverse document frequency (idf) of every term in the training split.
# The three columns are joined with explicit spaces so the last word of the
# text, the keyword and the location remain separate tokens.
train_corpus = X_train['text'] + ' ' + X_train['keyword'] + ' ' + X_train['location']

# ``stop_words`` is handed over as a list, the collection type scikit-learn
# documents for this parameter.
tf_idf = TfidfVectorizer(stop_words=list(stop_words),
                         smooth_idf=False)
tf_idf.fit(train_corpus)
idfs = tf_idf.idf_

# Terms whose idf falls outside [lower_thresh, upper_thresh] are flagged:
# a low idf marks a term as too common, a high idf as too rare.
lower_thresh = 5.
upper_thresh = 8.
mask = (idfs < lower_thresh) | (idfs > upper_thresh)

# Newer scikit-learn releases expose get_feature_names_out(); older ones only
# have get_feature_names(). Use whichever this installation provides.
if hasattr(tf_idf, 'get_feature_names_out'):
    feature_names = tf_idf.get_feature_names_out()
else:
    feature_names = tf_idf.get_feature_names()

bad_words = np.array(feature_names)[mask]

In [None]:
sns.set_style('darkgrid')
f, ax = plt.subplots(ncols=2, figsize=(15, 5))
# Left: distribution of all idf values; right: the same values as a box plot.
sns.distplot(idfs, ax=ax[0])
# The values are passed by keyword so they are always drawn along the x axis,
# independent of how the installed seaborn interprets a positional argument.
sns.boxplot(x=idfs, ax=ax[1])
ax[0].set_title('idf distribution (all terms)')
ax[1].set_title('idf box plot (all terms)');

In [None]:
f, ax = plt.subplots(ncols=2, figsize=(15, 5))
# Same two plots, restricted to the terms that pass the idf thresholds.
kept_idfs = idfs[~mask]
sns.distplot(kept_idfs, ax=ax[0])
# The values are passed by keyword so they are always drawn along the x axis,
# independent of how the installed seaborn interprets a positional argument.
sns.boxplot(x=kept_idfs, ax=ax[1])
ax[0].set_title('idf distribution (kept terms)')
ax[1].set_title('idf box plot (kept terms)');

In [None]:
# Display how many terms were flagged by the idf thresholds.
bad_words.shape

In [None]:
# TF-IDF vectorizer used for all models: unigrams produced by ``tokenizer``,
# ignoring both the English stop words and the idf-filtered ``bad_words``.
tf = TfidfVectorizer(
    # Handed over as a list, the collection type scikit-learn documents for
    # this parameter.
    stop_words=list(stop_words.union(bad_words)),
    smooth_idf=False,
    ngram_range=(1, 1),
    tokenizer=tokenizer,
)

In [None]:
def _join_text_columns(frame):
    """Concatenate the text, keyword and location columns with single spaces."""
    return frame['text'] + ' ' + frame['keyword'] + ' ' + frame['location']


# TF-IDF features. The explicit separators keep the last word of the text,
# the keyword and the location from being fused into a single token.
train_tf = tf.fit_transform(_join_text_columns(X_train))
val_tf = tf.transform(_join_text_columns(X_val))
test_tf = tf.transform(_join_text_columns(X_test))

# Uni-/bigram counts of the lemmatised text. The text column itself is passed:
# iterating a DataFrame yields its column labels, not its rows.
vectorizer = CountVectorizer(ngram_range=(1, 2), tokenizer=tokenizer)
train = vectorizer.fit_transform(X_train['text'])
val = vectorizer.transform(X_val['text'])
test = vectorizer.transform(X_test['text'])

# Classic models

## LinearSVC

In [None]:
# Linear SVM on the TF-IDF features; the cell shows its validation score.
svc = LinearSVC(
    C=1,
    penalty='l2',
    dual=False,
    max_iter=1000,
    random_state=42,
)
svc.fit(train_tf, y_train)
svc.score(val_tf, y_val)

## RandomForestClassifier

In [None]:
# Random forest on the TF-IDF features; prints its out-of-bag score.
forest = RandomForestClassifier(
    n_estimators=500,
    max_depth=500,
    min_samples_leaf=1,
    oob_score=True,
    random_state=42,
)
forest.fit(train_tf, y_train)
print(forest.oob_score_)

In [None]:
# Display the forest's score on the validation split.
forest.score(val_tf, y_val)

In [None]:
# Pair every vocabulary term with the fitted forest's feature_importances_
# value and print the 20 largest.
# Newer scikit-learn releases expose get_feature_names_out(); older ones only
# have get_feature_names(). Use whichever this installation provides.
if hasattr(tf, 'get_feature_names_out'):
    term_names = tf.get_feature_names_out()
else:
    term_names = tf.get_feature_names()

importance = sorted(zip(term_names, forest.feature_importances_),
                    key=lambda pair: pair[1], reverse=True)
for imp in importance[:20]:
    print("Feature '{}', importance={}".format(*imp))

## AdaBoostClassifier

In [None]:
# AdaBoost over logistic-regression base learners.
# The base learner is passed positionally: its keyword was renamed from
# ``base_estimator`` to ``estimator`` in newer scikit-learn releases, while it
# is the first parameter under either name.
boost = AdaBoostClassifier(LogisticRegression(), random_state=42,
                           n_estimators=2000, learning_rate=1)
boost.fit(train_tf, y_train)

In [None]:
# Display the AdaBoost model's score on the validation split.
boost.score(val_tf, y_val)

## BaggingClassifier

In [None]:
# Bagging over logistic-regression base learners, each fitted on all samples
# (max_samples=1.0) and 70% of the features.
# The base learner is passed positionally: its keyword was renamed from
# ``base_estimator`` to ``estimator`` in newer scikit-learn releases, while it
# is the first parameter under either name.
bagging = BaggingClassifier(LogisticRegression(), random_state=42,
                            max_features=0.7, n_jobs=-1,
                            max_samples=1.0, n_estimators=2000)
bagging.fit(train_tf, y_train)

In [None]:
# Display the bagging model's score on the validation split.
bagging.score(val_tf, y_val)

# Neural Network

In [None]:
# Pick the device and the tensor constructors that match it.
# NOTE: this repeats the device selection done in an earlier cell.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if is_cuda else 'cpu')
if is_cuda:
    from torch.cuda import FloatTensor, LongTensor
else:
    from torch import FloatTensor, LongTensor

In [None]:
def data_loader(data, batch_size, shuffle=False):
    """Yield dense mini-batches from a ``(features, target)`` pair.

    NOTE: this function is defined three times in this notebook; the last
    definition is the one in effect.

    Args:
        data: indexable whose item 0 supports row indexing with an index
            array and ``.toarray()``, and whose item 1 exposes ``.values``.
        batch_size: number of rows per batch; the last batch may be smaller.
        shuffle: when true, the row order is shuffled with ``np.random``.

    Yields:
        ``(X_batch, y_batch)`` tuples for consecutive slices of the row order.
    """
    features, target = data[0], data[1]
    n_samples = features.shape[0]

    order = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(order)

    for offset in range(0, n_samples, batch_size):
        # Slicing clamps at the end of the array, giving a short final batch.
        picked = order[offset:offset + batch_size]
        yield features[picked].toarray(), target.values[picked]

In [None]:
def data_loader(data, batch_size, shuffle=False):
    """Iterate over ``data`` in dense mini-batches.

    NOTE: this function is defined three times in this notebook; the last
    definition is the one in effect.

    ``data[0]`` is indexed by row-index arrays and densified with
    ``.toarray()``; ``data[1].values`` is indexed with the same row indices.
    With ``shuffle`` set, the row order is shuffled through ``np.random``.
    Yields ``(X_batch, y_batch)`` tuples; the last batch may be shorter than
    ``batch_size``.
    """
    features = data[0]
    target = data[1]
    total = features.shape[0]

    row_order = np.arange(total)
    if shuffle:
        np.random.shuffle(row_order)

    starts = range(0, total, batch_size)
    for begin in starts:
        finish = min(begin + batch_size, total)
        rows = row_order[begin:finish]
        dense_rows = features[rows].toarray()
        labels = target.values[rows]
        yield dense_rows, labels

In [None]:
def data_loader(data, batch_size, shuffle=False):
    """Generate ``(X_batch, y_batch)`` pairs covering ``data`` once.

    NOTE: this is the last of three definitions of this function in the
    notebook and therefore the one that is in effect.

    Args:
        data: ``(features, target)``; ``features`` is indexed with an array of
            row indices and converted with ``.toarray()``, ``target`` is read
            through ``.values``.
        batch_size: rows per batch (the final batch may hold fewer).
        shuffle: shuffle the row order with ``np.random`` before batching.
    """
    features, target = data[0], data[1]
    n_samples = features.shape[0]

    indices = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(indices)

    for start in range(0, n_samples, batch_size):
        batch_indices = indices[start:min(start + batch_size, n_samples)]
        X_batch = features[batch_indices].toarray()
        y_batch = target.values[batch_indices]
        yield (X_batch, y_batch)

In [None]:
def fit(model, loss_function, train_data=None, val_data=None, optimizer=None,
        epoch_count=1, batch_size=1, scheduler=None, alpha=1):
    """Run ``epoch_count`` epochs of training and/or evaluation.

    Each epoch calls ``do_epoch`` on ``train_data`` (with ``optimizer``) when
    it is given, and on ``val_data`` (without optimizer) when it is given.
    The scheduler, if any, is stepped once per epoch with the validation
    score, or with the training score when there is no validation data.

    Returns:
        ``(train_history, val_history)``: lists of the per-epoch values
        returned by ``do_epoch``.
    """
    train_history, val_history = [], []

    for epoch_index in range(epoch_count):
        name_prefix = '[{} / {}] '.format(epoch_index + 1, epoch_count)
        epoch_train_score = 0
        epoch_val_score = 0

        if train_data:
            epoch_train_score = do_epoch(
                model, loss_function, train_data, batch_size,
                optimizer, name_prefix + 'Train:', alpha=alpha,
            )
            train_history.append(epoch_train_score)

        if not val_data:
            # No validation data: the scheduler follows the training score.
            if scheduler:
                scheduler.step(epoch_train_score)
            continue

        # Evaluation without any training data is labelled as a test run.
        label = '  Val:' if train_data else ' Test:'
        epoch_val_score = do_epoch(
            model, loss_function, val_data, batch_size,
            optimizer=None, name=name_prefix + label, alpha=alpha,
        )
        val_history.append(epoch_val_score)
        if scheduler:
            scheduler.step(epoch_val_score)

    return train_history, val_history
    

In [None]:
def do_epoch(model, loss_function, data, batch_size, optimizer=None, name=None, alpha=1):
    """Run one pass (epoch) over ``data`` and return its accuracy.

    The model is trained when ``optimizer`` is given and only evaluated
    otherwise. In both modes an L1 penalty ``alpha * sum(|W|)`` over the
    weights of every direct ``nn.Linear`` child of ``model`` is added to the
    loss that is reported (and, when training, back-propagated).

    Args:
        model: module mapping a float batch to per-class scores.
        loss_function: callable ``(prediction, y_batch) -> loss tensor``.
        data: ``(features, target)`` pair as accepted by ``data_loader``.
        batch_size: number of samples per batch.
        optimizer: optimizer to step; ``None`` selects evaluation mode.
        name: label shown in the progress-bar description.
        alpha: weight of the L1 penalty.

    Returns:
        Fraction of correctly classified samples over the whole pass
        (``0.0`` when ``data`` holds no samples).
    """
    n_samples = data[0].shape[0]
    # Ceiling division so a trailing partial batch is counted by the bar.
    batch_count = (n_samples + batch_size - 1) // batch_size

    is_train = optimizer is not None
    name = name or ''
    model.train(is_train)

    correct_total = 0
    seen_total = 0
    loss_total = 0.0
    batches_done = 0
    accuracy = 0.0

    with torch.autograd.set_grad_enabled(is_train):
        with tqdm(total=batch_count) as progress_bar:
            # Training passes visit the samples in a fresh random order;
            # evaluation keeps the stored order.
            for X, y in data_loader(data, batch_size, shuffle=is_train):
                X_batch, y_batch = FloatTensor(X).to(device), LongTensor(y).to(device)

                prediction = model(X_batch)
                loss = loss_function(prediction, y_batch)

                for layer in model.children():
                    if type(layer) == nn.Linear:
                        loss += alpha * torch.abs(layer.weight).sum()

                loss_total += loss.item()
                batches_done += 1

                # Accuracy is accumulated per sample, so a short final batch
                # does not weigh as much as a full one.
                predicted = torch.argmax(prediction, dim=1)
                correct_total += int(torch.sum(predicted == y_batch).item())
                seen_total += y_batch.shape[0]

                if is_train:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                progress_bar.update()
                progress_bar.set_description('Epoch {} - accuracy: {:.2f}, loss {:.2f}'.format(
                    name, correct_total / seen_total, loss_total / batches_done)
                )

            # Guard the final averages so an empty data set does not fail.
            accuracy = correct_total / seen_total if seen_total else 0.0
            epoch_loss = loss_total / batches_done if batches_done else 0.0
            progress_bar.set_description(f'Epoch {name} - accuracy: {accuracy:.2f}, loss: {epoch_loss:.2f}')

    return accuracy

## LinearNN

In [None]:
class NNModel():
    """Adapter that gives a PyTorch classifier a scikit-learn style ``predict``."""

    def __init__(self, model):
        """Store the wrapped PyTorch module."""
        self.model = model

    def predict(self, inputs, batch_size=256):
        """Predict class indices for every row of ``inputs``.

        Rows are processed in batches with gradient tracking disabled, and
        tensors are created on the device of the model's parameters.

        Args:
            inputs: 2-D feature matrix; either a sparse matrix exposing
                ``.toarray()`` or anything ``np.asarray`` can convert.
            batch_size: number of rows sent through the model at once.

        Returns:
            Integer ``np.ndarray`` of shape ``(n_rows, 1)`` holding the index
            of the highest score in each row of the model output.
        """
        self.model.eval()
        n_samples = inputs.shape[0]
        if n_samples == 0:
            return np.empty((0, 1), dtype=np.int64)

        # Follow the model's own device; fall back to the CPU for a module
        # without parameters.
        first_param = next(self.model.parameters(), None)
        target_device = first_param.device if first_param is not None else torch.device('cpu')

        labels = []
        with torch.no_grad():
            for start in range(0, n_samples, batch_size):
                chunk = inputs[start:start + batch_size]
                dense = chunk.toarray() if hasattr(chunk, 'toarray') else np.asarray(chunk)
                X = torch.as_tensor(dense, dtype=torch.float32, device=target_device)
                scores = self.model(X)
                labels.append(torch.argmax(scores, dim=1).cpu().numpy())

        return np.concatenate(labels).reshape(-1, 1)

In [None]:
# fit settings
batch_size = 100
epoch_count = 10

# optim settings
learning_rate = 1e-3
weight_decay = 0
alpha = 0.005  # weight of the L1 penalty on the linear layers

# model settings
linear1_out = int(train_tf.shape[1]**0.5)  # hidden width: sqrt of the vocabulary size
output = 2
dropout = 0.3  # currently unused: no dropout layer is part of the model

# scheduler settings
factor = 0.5
patience = 3
threshold = 1e-2

# One hidden layer with batch normalisation. The final linear layer is left
# without an activation: nn.CrossEntropyLoss works on raw class scores, and a
# ReLU in front of it would clip every negative score to zero.
model = nn.Sequential(nn.Linear(train_tf.shape[1], linear1_out),
                      nn.BatchNorm1d(linear1_out),
                      nn.ReLU(inplace=True),
                      nn.Linear(linear1_out, output),
                     ).to(device)

loss_function = nn.CrossEntropyLoss()

optimizer = optim.Adam(
                        model.parameters(),
                        lr=learning_rate,
                        weight_decay=weight_decay
                    )

# mode='max': the scheduler is stepped with a score where higher is better.
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=factor,
                              patience=patience, verbose=True, threshold=threshold
                              )

In [None]:
# Fresh optimizer and scheduler for ``model`` (resets their internal state).
# NOTE(review): no call to fit(...) is visible in this notebook; confirm the
# network is trained before it is used in the ensemble and the submission.
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

scheduler = ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=factor,
    patience=patience,
    verbose=True,
    threshold=threshold,
)

# Mixed models

## Create ensemble

In [None]:
# Members of the voting ensemble, in voting-column order.
models = [bagging, forest, boost, svc, NNModel(model)]

In [None]:
def ensemble(models, data):
    """Combine the predictions of ``models`` on ``data`` by majority vote.

    Every model's ``predict(data)`` result becomes one column; for each row
    the value that occurs most often across the columns is returned.

    Returns:
        Array with one voted value per row of the predictions.
    """
    votes = pd.DataFrame()
    for position in range(len(models)):
        votes[position] = models[position].predict(data)

    def _most_frequent(row):
        # value_counts sorts by descending count, so entry 0 is the winner.
        return row.value_counts().index[0]

    winners = votes.apply(_most_frequent, axis=1)
    return winners.values

In [None]:
# Ensemble accuracy on the validation split.
accuracy_score(y_true=y_val, y_pred=ensemble(models, val_tf))

In [None]:
# Ensemble accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=ensemble(models, test_tf))

# Submission

In [None]:
# Choose your best model; here the neural network wrapped in NNModel is used.
final_model = NNModel(model)

In [None]:
def submission(model, vectorizer, file_name="submission.csv"):
    """Predict targets for ``test.csv`` and write them to a CSV file.

    The test rows are prepared like the training features: the lemmatised
    text, the keyword and the location (missing values replaced by
    ``'-9999'``) are joined with single spaces and transformed by
    ``vectorizer``.

    Args:
        model: object whose ``predict(features)`` returns one label per row,
            either as a 1-D array or as a single-column 2-D array.
        vectorizer: fitted vectorizer producing the features ``model`` was
            trained on.
        file_name: name of the CSV written into ``data_folder``.
    """
    test_data = pd.read_csv(data_folder + "/test.csv")
    lemmatized = pd.Series(lemmatize_texts(test_data.text), index=test_data.index)
    # NOTE(review): assumes test.csv has the same 'keyword' and 'location'
    # columns as train.csv — confirm against the data files.
    keyword = test_data['keyword'].fillna('-9999')
    location = test_data['location'].fillna('-9999')
    test = vectorizer.transform(lemmatized + ' ' + keyword + ' ' + location)

    submit = pd.DataFrame()
    submit['id'] = test_data['id']
    # Use the model that was passed in, and flatten its output so both 1-D
    # and single-column 2-D predictions are accepted.
    submit['target'] = np.asarray(model.predict(test)).ravel().astype('int')
    submit.to_csv(data_folder + file_name, index=False)

# ``tf`` is the vectorizer that produced the features the models were fitted
# on, so it is the one handed to ``submission``.
submission(final_model, tf)